5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
8 /* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
37 * pregcomp and pregexec -- regsub and regerror are not used in perl
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
57 **** Alterations to Henry's code are...
59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007 by Larry Wall and others
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
71 #define PERL_IN_REGCOMP_C
74 #ifndef PERL_IN_XSUB_RE
79 #ifdef PERL_IN_XSUB_RE
90 # if defined(BUGGY_MSC6)
91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
92 # pragma optimize("a",off)
93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
94 # pragma optimize("w",on )
95 # endif /* BUGGY_MSC6 */
102 typedef struct RExC_state_t {
103 U32 flags; /* are we folding, multilining? */
104 char *precomp; /* uncompiled string. */
105 regexp *rx; /* perl core regexp structure */
106 regexp_internal *rxi; /* internal data for regexp object pprivate field */
107 char *start; /* Start of input for compile */
108 char *end; /* End of input for compile */
109 char *parse; /* Input-scan pointer. */
110 I32 whilem_seen; /* number of WHILEM in this expr */
111 regnode *emit_start; /* Start of emitted-code area */
112 regnode *emit_bound; /* First regnode outside of the allocated space */
113 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
114 I32 naughty; /* How bad is this pattern? */
115 I32 sawback; /* Did we see \1, ...? */
117 I32 size; /* Code size. */
118 I32 npar; /* Capture buffer count, (OPEN). */
119 I32 cpar; /* Capture buffer count, (CLOSE). */
120 I32 nestroot; /* root parens we are in - used by accept */
124 regnode **open_parens; /* pointers to open parens */
125 regnode **close_parens; /* pointers to close parens */
126 regnode *opend; /* END node in program */
127 I32 utf8; /* whether the pattern is utf8 or not */
128 I32 orig_utf8; /* whether the pattern was originally in utf8 */
129 /* XXX use this for future optimisation of case
130 * where pattern must be upgraded to utf8. */
131 HV *charnames; /* cache of named sequences */
132 HV *paren_names; /* Paren names */
134 regnode **recurse; /* Recurse regops */
135 I32 recurse_count; /* Number of recurse regops */
137 char *starttry; /* -Dr: where regtry was called. */
138 #define RExC_starttry (pRExC_state->starttry)
141 const char *lastparse;
143 AV *paren_name_list; /* idx -> name */
144 #define RExC_lastparse (pRExC_state->lastparse)
145 #define RExC_lastnum (pRExC_state->lastnum)
146 #define RExC_paren_name_list (pRExC_state->paren_name_list)
150 #define RExC_flags (pRExC_state->flags)
151 #define RExC_precomp (pRExC_state->precomp)
152 #define RExC_rx (pRExC_state->rx)
153 #define RExC_rxi (pRExC_state->rxi)
154 #define RExC_start (pRExC_state->start)
155 #define RExC_end (pRExC_state->end)
156 #define RExC_parse (pRExC_state->parse)
157 #define RExC_whilem_seen (pRExC_state->whilem_seen)
158 #ifdef RE_TRACK_PATTERN_OFFSETS
159 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
161 #define RExC_emit (pRExC_state->emit)
162 #define RExC_emit_start (pRExC_state->emit_start)
163 #define RExC_emit_bound (pRExC_state->emit_bound)
164 #define RExC_naughty (pRExC_state->naughty)
165 #define RExC_sawback (pRExC_state->sawback)
166 #define RExC_seen (pRExC_state->seen)
167 #define RExC_size (pRExC_state->size)
168 #define RExC_npar (pRExC_state->npar)
169 #define RExC_nestroot (pRExC_state->nestroot)
170 #define RExC_extralen (pRExC_state->extralen)
171 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
172 #define RExC_seen_evals (pRExC_state->seen_evals)
173 #define RExC_utf8 (pRExC_state->utf8)
174 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
175 #define RExC_charnames (pRExC_state->charnames)
176 #define RExC_open_parens (pRExC_state->open_parens)
177 #define RExC_close_parens (pRExC_state->close_parens)
178 #define RExC_opend (pRExC_state->opend)
179 #define RExC_paren_names (pRExC_state->paren_names)
180 #define RExC_recurse (pRExC_state->recurse)
181 #define RExC_recurse_count (pRExC_state->recurse_count)
184 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
185 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
186 ((*s) == '{' && regcurly(s)))
189 #undef SPSTART /* dratted cpp namespace... */
192 * Flags to be passed up and down.
194 #define WORST 0 /* Worst case. */
195 #define HASWIDTH 0x01 /* Known to match non-null strings. */
196 #define SIMPLE 0x02 /* Simple enough to be STAR/PLUS operand. */
197 #define SPSTART 0x04 /* Starts with * or +. */
198 #define TRYAGAIN 0x08 /* Weeded out a declaration. */
199 #define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
201 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
203 /* whether trie related optimizations are enabled */
204 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
205 #define TRIE_STUDY_OPT
206 #define FULL_TRIE_STUDY
212 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
213 #define PBITVAL(paren) (1 << ((paren) & 7))
214 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
215 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
216 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
219 /* About scan_data_t.
221 During optimisation we recurse through the regexp program performing
222 various inplace (keyhole style) optimisations. In addition study_chunk
223 and scan_commit populate this data structure with information about
224 what strings MUST appear in the pattern. We look for the longest
225 string that must appear for at a fixed location, and we look for the
226 longest string that may appear at a floating location. So for instance
231 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
232 strings (because they follow a .* construct). study_chunk will identify
233 both FOO and BAR as being the longest fixed and floating strings respectively.
235 The strings can be composites, for instance
239 will result in a composite fixed substring 'foo'.
241 For each string some basic information is maintained:
243 - offset or min_offset
244 This is the position the string must appear at, or not before.
245 It also implicitly (when combined with minlenp) tells us how many
246 character must match before the string we are searching.
247 Likewise when combined with minlenp and the length of the string
248 tells us how many characters must appear after the string we have
252 Only used for floating strings. This is the rightmost point that
253 the string can appear at. Ifset to I32 max it indicates that the
254 string can occur infinitely far to the right.
257 A pointer to the minimum length of the pattern that the string
258 was found inside. This is important as in the case of positive
259 lookahead or positive lookbehind we can have multiple patterns
264 The minimum length of the pattern overall is 3, the minimum length
265 of the lookahead part is 3, but the minimum length of the part that
266 will actually match is 1. So 'FOO's minimum length is 3, but the
267 minimum length for the F is 1. This is important as the minimum length
268 is used to determine offsets in front of and behind the string being
269 looked for. Since strings can be composites this is the length of the
270 pattern at the time it was commited with a scan_commit. Note that
271 the length is calculated by study_chunk, so that the minimum lengths
272 are not known until the full pattern has been compiled, thus the
273 pointer to the value.
277 In the case of lookbehind the string being searched for can be
278 offset past the start point of the final matching string.
279 If this value was just blithely removed from the min_offset it would
280 invalidate some of the calculations for how many chars must match
281 before or after (as they are derived from min_offset and minlen and
282 the length of the string being searched for).
283 When the final pattern is compiled and the data is moved from the
284 scan_data_t structure into the regexp structure the information
285 about lookbehind is factored in, with the information that would
286 have been lost precalculated in the end_shift field for the
289 The fields pos_min and pos_delta are used to store the minimum offset
290 and the delta to the maximum offset at the current point in the pattern.
294 typedef struct scan_data_t {
295 /*I32 len_min; unused */
296 /*I32 len_delta; unused */
300 I32 last_end; /* min value, <0 unless valid. */
303 SV **longest; /* Either &l_fixed, or &l_float. */
304 SV *longest_fixed; /* longest fixed string found in pattern */
305 I32 offset_fixed; /* offset where it starts */
306 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
307 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
308 SV *longest_float; /* longest floating string found in pattern */
309 I32 offset_float_min; /* earliest point in string it can appear */
310 I32 offset_float_max; /* latest point in string it can appear */
311 I32 *minlen_float; /* pointer to the minlen relevent to the string */
312 I32 lookbehind_float; /* is the position of the string modified by LB */
316 struct regnode_charclass_class *start_class;
320 * Forward declarations for pregcomp()'s friends.
323 static const scan_data_t zero_scan_data =
324 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
326 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
327 #define SF_BEFORE_SEOL 0x0001
328 #define SF_BEFORE_MEOL 0x0002
329 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
330 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
333 # define SF_FIX_SHIFT_EOL (0+2)
334 # define SF_FL_SHIFT_EOL (0+4)
336 # define SF_FIX_SHIFT_EOL (+2)
337 # define SF_FL_SHIFT_EOL (+4)
340 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
341 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
343 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
344 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
345 #define SF_IS_INF 0x0040
346 #define SF_HAS_PAR 0x0080
347 #define SF_IN_PAR 0x0100
348 #define SF_HAS_EVAL 0x0200
349 #define SCF_DO_SUBSTR 0x0400
350 #define SCF_DO_STCLASS_AND 0x0800
351 #define SCF_DO_STCLASS_OR 0x1000
352 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
353 #define SCF_WHILEM_VISITED_POS 0x2000
355 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
356 #define SCF_SEEN_ACCEPT 0x8000
358 #define UTF (RExC_utf8 != 0)
359 #define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0)
360 #define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0)
362 #define OOB_UNICODE 12345678
363 #define OOB_NAMEDCLASS -1
365 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
366 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
369 /* length of regex to show in messages that don't mark a position within */
370 #define RegexLengthToShowInErrorMessages 127
373 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
374 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
375 * op/pragma/warn/regcomp.
377 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
378 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
380 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
383 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
384 * arg. Show regex, up to a maximum length. If it's too long, chop and add
387 #define _FAIL(code) STMT_START { \
388 const char *ellipses = ""; \
389 IV len = RExC_end - RExC_precomp; \
392 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
393 if (len > RegexLengthToShowInErrorMessages) { \
394 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
395 len = RegexLengthToShowInErrorMessages - 10; \
401 #define FAIL(msg) _FAIL( \
402 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
403 msg, (int)len, RExC_precomp, ellipses))
405 #define FAIL2(msg,arg) _FAIL( \
406 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
407 arg, (int)len, RExC_precomp, ellipses))
410 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
412 #define Simple_vFAIL(m) STMT_START { \
413 const IV offset = RExC_parse - RExC_precomp; \
414 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
415 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
419 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
421 #define vFAIL(m) STMT_START { \
423 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
428 * Like Simple_vFAIL(), but accepts two arguments.
430 #define Simple_vFAIL2(m,a1) STMT_START { \
431 const IV offset = RExC_parse - RExC_precomp; \
432 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
433 (int)offset, RExC_precomp, RExC_precomp + offset); \
437 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
439 #define vFAIL2(m,a1) STMT_START { \
441 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
442 Simple_vFAIL2(m, a1); \
447 * Like Simple_vFAIL(), but accepts three arguments.
449 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
450 const IV offset = RExC_parse - RExC_precomp; \
451 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
452 (int)offset, RExC_precomp, RExC_precomp + offset); \
456 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
458 #define vFAIL3(m,a1,a2) STMT_START { \
460 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
461 Simple_vFAIL3(m, a1, a2); \
465 * Like Simple_vFAIL(), but accepts four arguments.
467 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
468 const IV offset = RExC_parse - RExC_precomp; \
469 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
470 (int)offset, RExC_precomp, RExC_precomp + offset); \
473 #define vWARN(loc,m) STMT_START { \
474 const IV offset = loc - RExC_precomp; \
475 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
476 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
479 #define vWARNdep(loc,m) STMT_START { \
480 const IV offset = loc - RExC_precomp; \
481 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
482 "%s" REPORT_LOCATION, \
483 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
487 #define vWARN2(loc, m, a1) STMT_START { \
488 const IV offset = loc - RExC_precomp; \
489 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
490 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
493 #define vWARN3(loc, m, a1, a2) STMT_START { \
494 const IV offset = loc - RExC_precomp; \
495 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
496 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
499 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
500 const IV offset = loc - RExC_precomp; \
501 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
502 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
505 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
506 const IV offset = loc - RExC_precomp; \
507 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
508 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
512 /* Allow for side effects in s */
513 #define REGC(c,s) STMT_START { \
514 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
517 /* Macros for recording node offsets. 20001227 mjd@plover.com
518 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
519 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
520 * Element 0 holds the number n.
521 * Position is 1 indexed.
523 #ifndef RE_TRACK_PATTERN_OFFSETS
524 #define Set_Node_Offset_To_R(node,byte)
525 #define Set_Node_Offset(node,byte)
526 #define Set_Cur_Node_Offset
527 #define Set_Node_Length_To_R(node,len)
528 #define Set_Node_Length(node,len)
529 #define Set_Node_Cur_Length(node)
530 #define Node_Offset(n)
531 #define Node_Length(n)
532 #define Set_Node_Offset_Length(node,offset,len)
533 #define ProgLen(ri) ri->u.proglen
534 #define SetProgLen(ri,x) ri->u.proglen = x
536 #define ProgLen(ri) ri->u.offsets[0]
537 #define SetProgLen(ri,x) ri->u.offsets[0] = x
538 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
540 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
541 __LINE__, (int)(node), (int)(byte))); \
543 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
545 RExC_offsets[2*(node)-1] = (byte); \
550 #define Set_Node_Offset(node,byte) \
551 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
552 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
554 #define Set_Node_Length_To_R(node,len) STMT_START { \
556 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
557 __LINE__, (int)(node), (int)(len))); \
559 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
561 RExC_offsets[2*(node)] = (len); \
566 #define Set_Node_Length(node,len) \
567 Set_Node_Length_To_R((node)-RExC_emit_start, len)
568 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
569 #define Set_Node_Cur_Length(node) \
570 Set_Node_Length(node, RExC_parse - parse_start)
572 /* Get offsets and lengths */
573 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
574 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
576 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
577 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
578 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
582 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
583 #define EXPERIMENTAL_INPLACESCAN
584 #endif /*RE_TRACK_PATTERN_OFFSETS*/
586 #define DEBUG_STUDYDATA(str,data,depth) \
587 DEBUG_OPTIMISE_MORE_r(if(data){ \
588 PerlIO_printf(Perl_debug_log, \
589 "%*s" str "Pos:%"IVdf"/%"IVdf \
590 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
591 (int)(depth)*2, "", \
592 (IV)((data)->pos_min), \
593 (IV)((data)->pos_delta), \
594 (UV)((data)->flags), \
595 (IV)((data)->whilem_c), \
596 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
597 is_inf ? "INF " : "" \
599 if ((data)->last_found) \
600 PerlIO_printf(Perl_debug_log, \
601 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
602 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
603 SvPVX_const((data)->last_found), \
604 (IV)((data)->last_end), \
605 (IV)((data)->last_start_min), \
606 (IV)((data)->last_start_max), \
607 ((data)->longest && \
608 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
609 SvPVX_const((data)->longest_fixed), \
610 (IV)((data)->offset_fixed), \
611 ((data)->longest && \
612 (data)->longest==&((data)->longest_float)) ? "*" : "", \
613 SvPVX_const((data)->longest_float), \
614 (IV)((data)->offset_float_min), \
615 (IV)((data)->offset_float_max) \
617 PerlIO_printf(Perl_debug_log,"\n"); \
620 static void clear_re(pTHX_ void *r);
622 /* Mark that we cannot extend a found fixed substring at this point.
623 Update the longest found anchored substring and the longest found
624 floating substrings if needed. */
627 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
629 const STRLEN l = CHR_SVLEN(data->last_found);
630 const STRLEN old_l = CHR_SVLEN(*data->longest);
631 GET_RE_DEBUG_FLAGS_DECL;
633 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
634 SvSetMagicSV(*data->longest, data->last_found);
635 if (*data->longest == data->longest_fixed) {
636 data->offset_fixed = l ? data->last_start_min : data->pos_min;
637 if (data->flags & SF_BEFORE_EOL)
639 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
641 data->flags &= ~SF_FIX_BEFORE_EOL;
642 data->minlen_fixed=minlenp;
643 data->lookbehind_fixed=0;
645 else { /* *data->longest == data->longest_float */
646 data->offset_float_min = l ? data->last_start_min : data->pos_min;
647 data->offset_float_max = (l
648 ? data->last_start_max
649 : data->pos_min + data->pos_delta);
650 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
651 data->offset_float_max = I32_MAX;
652 if (data->flags & SF_BEFORE_EOL)
654 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
656 data->flags &= ~SF_FL_BEFORE_EOL;
657 data->minlen_float=minlenp;
658 data->lookbehind_float=0;
661 SvCUR_set(data->last_found, 0);
663 SV * const sv = data->last_found;
664 if (SvUTF8(sv) && SvMAGICAL(sv)) {
665 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
671 data->flags &= ~SF_BEFORE_EOL;
672 DEBUG_STUDYDATA("commit: ",data,0);
675 /* Can match anything (initialization) */
677 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
679 ANYOF_CLASS_ZERO(cl);
680 ANYOF_BITMAP_SETALL(cl);
681 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
683 cl->flags |= ANYOF_LOCALE;
686 /* Can match anything (initialization) */
688 S_cl_is_anything(const struct regnode_charclass_class *cl)
692 for (value = 0; value <= ANYOF_MAX; value += 2)
693 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
695 if (!(cl->flags & ANYOF_UNICODE_ALL))
697 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
702 /* Can match anything (initialization) */
704 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
706 Zero(cl, 1, struct regnode_charclass_class);
708 cl_anything(pRExC_state, cl);
712 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
714 Zero(cl, 1, struct regnode_charclass_class);
716 cl_anything(pRExC_state, cl);
718 cl->flags |= ANYOF_LOCALE;
721 /* 'And' a given class with another one. Can create false positives */
722 /* We assume that cl is not inverted */
724 S_cl_and(struct regnode_charclass_class *cl,
725 const struct regnode_charclass_class *and_with)
728 assert(and_with->type == ANYOF);
729 if (!(and_with->flags & ANYOF_CLASS)
730 && !(cl->flags & ANYOF_CLASS)
731 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
732 && !(and_with->flags & ANYOF_FOLD)
733 && !(cl->flags & ANYOF_FOLD)) {
736 if (and_with->flags & ANYOF_INVERT)
737 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
738 cl->bitmap[i] &= ~and_with->bitmap[i];
740 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
741 cl->bitmap[i] &= and_with->bitmap[i];
742 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
743 if (!(and_with->flags & ANYOF_EOS))
744 cl->flags &= ~ANYOF_EOS;
746 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
747 !(and_with->flags & ANYOF_INVERT)) {
748 cl->flags &= ~ANYOF_UNICODE_ALL;
749 cl->flags |= ANYOF_UNICODE;
750 ARG_SET(cl, ARG(and_with));
752 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
753 !(and_with->flags & ANYOF_INVERT))
754 cl->flags &= ~ANYOF_UNICODE_ALL;
755 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
756 !(and_with->flags & ANYOF_INVERT))
757 cl->flags &= ~ANYOF_UNICODE;
760 /* 'OR' a given class with another one. Can create false positives */
761 /* We assume that cl is not inverted */
763 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
765 if (or_with->flags & ANYOF_INVERT) {
767 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
768 * <= (B1 | !B2) | (CL1 | !CL2)
769 * which is wasteful if CL2 is small, but we ignore CL2:
770 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
771 * XXXX Can we handle case-fold? Unclear:
772 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
773 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
775 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
776 && !(or_with->flags & ANYOF_FOLD)
777 && !(cl->flags & ANYOF_FOLD) ) {
780 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
781 cl->bitmap[i] |= ~or_with->bitmap[i];
782 } /* XXXX: logic is complicated otherwise */
784 cl_anything(pRExC_state, cl);
787 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
788 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
789 && (!(or_with->flags & ANYOF_FOLD)
790 || (cl->flags & ANYOF_FOLD)) ) {
793 /* OR char bitmap and class bitmap separately */
794 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
795 cl->bitmap[i] |= or_with->bitmap[i];
796 if (or_with->flags & ANYOF_CLASS) {
797 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
798 cl->classflags[i] |= or_with->classflags[i];
799 cl->flags |= ANYOF_CLASS;
802 else { /* XXXX: logic is complicated, leave it along for a moment. */
803 cl_anything(pRExC_state, cl);
806 if (or_with->flags & ANYOF_EOS)
807 cl->flags |= ANYOF_EOS;
809 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
810 ARG(cl) != ARG(or_with)) {
811 cl->flags |= ANYOF_UNICODE_ALL;
812 cl->flags &= ~ANYOF_UNICODE;
814 if (or_with->flags & ANYOF_UNICODE_ALL) {
815 cl->flags |= ANYOF_UNICODE_ALL;
816 cl->flags &= ~ANYOF_UNICODE;
820 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
821 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
822 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
823 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
828 dump_trie(trie,widecharmap,revcharmap)
829 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
830 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
832 These routines dump out a trie in a somewhat readable format.
833 The _interim_ variants are used for debugging the interim
834 tables that are used to generate the final compressed
835 representation which is what dump_trie expects.
837 Part of the reason for their existance is to provide a form
838 of documentation as to how the different representations function.
843 Dumps the final compressed table form of the trie to Perl_debug_log.
844 Used for debugging make_trie().
848 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
849 AV *revcharmap, U32 depth)
852 SV *sv=sv_newmortal();
853 int colwidth= widecharmap ? 6 : 4;
854 GET_RE_DEBUG_FLAGS_DECL;
857 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
858 (int)depth * 2 + 2,"",
859 "Match","Base","Ofs" );
861 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
862 SV ** const tmp = av_fetch( revcharmap, state, 0);
864 PerlIO_printf( Perl_debug_log, "%*s",
866 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
867 PL_colors[0], PL_colors[1],
868 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
869 PERL_PV_ESCAPE_FIRSTCHAR
874 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
875 (int)depth * 2 + 2,"");
877 for( state = 0 ; state < trie->uniquecharcount ; state++ )
878 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
879 PerlIO_printf( Perl_debug_log, "\n");
881 for( state = 1 ; state < trie->statecount ; state++ ) {
882 const U32 base = trie->states[ state ].trans.base;
884 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
886 if ( trie->states[ state ].wordnum ) {
887 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
889 PerlIO_printf( Perl_debug_log, "%6s", "" );
892 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
897 while( ( base + ofs < trie->uniquecharcount ) ||
898 ( base + ofs - trie->uniquecharcount < trie->lasttrans
899 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
902 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
904 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
905 if ( ( base + ofs >= trie->uniquecharcount ) &&
906 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
907 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
909 PerlIO_printf( Perl_debug_log, "%*"UVXf,
911 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
913 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
917 PerlIO_printf( Perl_debug_log, "]");
920 PerlIO_printf( Perl_debug_log, "\n" );
924 Dumps a fully constructed but uncompressed trie in list form.
925 List tries normally only are used for construction when the number of
926 possible chars (trie->uniquecharcount) is very high.
927 Used for debugging make_trie().
930 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
931 HV *widecharmap, AV *revcharmap, U32 next_alloc,
935 SV *sv=sv_newmortal();
936 int colwidth= widecharmap ? 6 : 4;
937 GET_RE_DEBUG_FLAGS_DECL;
938 /* print out the table precompression. */
939 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
940 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
941 "------:-----+-----------------\n" );
943 for( state=1 ; state < next_alloc ; state ++ ) {
946 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
947 (int)depth * 2 + 2,"", (UV)state );
948 if ( ! trie->states[ state ].wordnum ) {
949 PerlIO_printf( Perl_debug_log, "%5s| ","");
951 PerlIO_printf( Perl_debug_log, "W%4x| ",
952 trie->states[ state ].wordnum
955 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
956 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
958 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
960 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
961 PL_colors[0], PL_colors[1],
962 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
963 PERL_PV_ESCAPE_FIRSTCHAR
965 TRIE_LIST_ITEM(state,charid).forid,
966 (UV)TRIE_LIST_ITEM(state,charid).newstate
969 PerlIO_printf(Perl_debug_log, "\n%*s| ",
970 (int)((depth * 2) + 14), "");
973 PerlIO_printf( Perl_debug_log, "\n");
978 Dumps a fully constructed but uncompressed trie in table form.
979 This is the normal DFA style state transition table, with a few
980 twists to facilitate compression later.
981 Used for debugging make_trie().
984 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
985 HV *widecharmap, AV *revcharmap, U32 next_alloc,
990 SV *sv=sv_newmortal();
991 int colwidth= widecharmap ? 6 : 4;
992 GET_RE_DEBUG_FLAGS_DECL;
995 print out the table precompression so that we can do a visual check
996 that they are identical.
999 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1001 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1002 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1004 PerlIO_printf( Perl_debug_log, "%*s",
1006 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1007 PL_colors[0], PL_colors[1],
1008 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1009 PERL_PV_ESCAPE_FIRSTCHAR
1015 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1017 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1018 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1021 PerlIO_printf( Perl_debug_log, "\n" );
1023 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1025 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1026 (int)depth * 2 + 2,"",
1027 (UV)TRIE_NODENUM( state ) );
1029 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1030 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1032 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1034 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1036 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1037 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1039 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1040 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1047 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1048 startbranch: the first branch in the whole branch sequence
1049 first : start branch of sequence of branch-exact nodes.
1050 May be the same as startbranch
1051 last : Thing following the last branch.
1052 May be the same as tail.
1053 tail : item following the branch sequence
1054 count : words in the sequence
1055 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1056 depth : indent depth
1058 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1060 A trie is an N'ary tree where the branches are determined by digital
1061 decomposition of the key. IE, at the root node you look up the 1st character and
1062 follow that branch repeat until you find the end of the branches. Nodes can be
1063 marked as "accepting" meaning they represent a complete word. Eg:
1067 would convert into the following structure. Numbers represent states, letters
1068 following numbers represent valid transitions on the letter from that state, if
1069 the number is in square brackets it represents an accepting state, otherwise it
1070 will be in parenthesis.
1072 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1076 (1) +-i->(6)-+-s->[7]
1078 +-s->(3)-+-h->(4)-+-e->[5]
1080 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1082 This shows that when matching against the string 'hers' we will begin at state 1
1083 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1084 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1085 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1086 single traverse. We store a mapping from accepting to state to which word was
1087 matched, and then when we have multiple possibilities we try to complete the
1088 rest of the regex in the order in which they occured in the alternation.
1090 The only prior NFA like behaviour that would be changed by the TRIE support is
1091 the silent ignoring of duplicate alternations which are of the form:
1093 / (DUPE|DUPE) X? (?{ ... }) Y /x
1095 Thus EVAL blocks follwing a trie may be called a different number of times with
1096 and without the optimisation. With the optimisations dupes will be silently
1097 ignored. This inconsistant behaviour of EVAL type nodes is well established as
1098 the following demonstrates:
1100 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1102 which prints out 'word' three times, but
1104 'words'=~/(word|word|word)(?{ print $1 })S/
1106 which doesnt print it out at all. This is due to other optimisations kicking in.
1108 Example of what happens on a structural level:
1110 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1112 1: CURLYM[1] {1,32767}(18)
1123 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1124 and should turn into:
1126 1: CURLYM[1] {1,32767}(18)
1128 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1136 Cases where tail != last would be like /(?foo|bar)baz/:
1146 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1147 and would end up looking like:
1150 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1157 d = uvuni_to_utf8_flags(d, uv, 0);
1159 is the recommended Unicode-aware way of saying
1164 #define TRIE_STORE_REVCHAR \
1167 SV *zlopp = newSV(2); \
1168 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1169 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
1170 SvCUR_set(zlopp, kapow - flrbbbbb); \
1173 av_push(revcharmap, zlopp); \
1175 char ooooff = (char)uvc; \
1176 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1180 #define TRIE_READ_CHAR STMT_START { \
1184 if ( foldlen > 0 ) { \
1185 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1190 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1191 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1192 foldlen -= UNISKIP( uvc ); \
1193 scan = foldbuf + UNISKIP( uvc ); \
1196 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1206 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1207 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1208 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1209 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1211 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1212 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1213 TRIE_LIST_CUR( state )++; \
1216 #define TRIE_LIST_NEW(state) STMT_START { \
1217 Newxz( trie->states[ state ].trans.list, \
1218 4, reg_trie_trans_le ); \
1219 TRIE_LIST_CUR( state ) = 1; \
1220 TRIE_LIST_LEN( state ) = 4; \
1223 #define TRIE_HANDLE_WORD(state) STMT_START { \
1224 U16 dupe= trie->states[ state ].wordnum; \
1225 regnode * const noper_next = regnext( noper ); \
1227 if (trie->wordlen) \
1228 trie->wordlen[ curword ] = wordlen; \
1230 /* store the word for dumping */ \
1232 if (OP(noper) != NOTHING) \
1233 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
1235 tmp = newSVpvn( "", 0 ); \
1236 if ( UTF ) SvUTF8_on( tmp ); \
1237 av_push( trie_words, tmp ); \
1242 if ( noper_next < tail ) { \
1244 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1245 trie->jump[curword] = (U16)(noper_next - convert); \
1247 jumper = noper_next; \
1249 nextbranch= regnext(cur); \
1253 /* So it's a dupe. This means we need to maintain a */\
1254 /* linked-list from the first to the next. */\
1255 /* we only allocate the nextword buffer when there */\
1256 /* a dupe, so first time we have to do the allocation */\
1257 if (!trie->nextword) \
1258 trie->nextword = (U16 *) \
1259 PerlMemShared_calloc( word_count + 1, sizeof(U16)); \
1260 while ( trie->nextword[dupe] ) \
1261 dupe= trie->nextword[dupe]; \
1262 trie->nextword[dupe]= curword; \
1264 /* we haven't inserted this word yet. */ \
1265 trie->states[ state ].wordnum = curword; \
1270 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1271 ( ( base + charid >= ucharcount \
1272 && base + charid < ubound \
1273 && state == trie->trans[ base - ucharcount + charid ].check \
1274 && trie->trans[ base - ucharcount + charid ].next ) \
1275 ? trie->trans[ base - ucharcount + charid ].next \
1276 : ( state==1 ? special : 0 ) \
1280 #define MADE_JUMP_TRIE 2
1281 #define MADE_EXACT_TRIE 4
1284 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1287 /* first pass, loop through and scan words */
1288 reg_trie_data *trie;
1289 HV *widecharmap = NULL;
1290 AV *revcharmap = newAV();
1292 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1297 regnode *jumper = NULL;
1298 regnode *nextbranch = NULL;
1299 regnode *convert = NULL;
1300 /* we just use folder as a flag in utf8 */
1301 const U8 * const folder = ( flags == EXACTF
1303 : ( flags == EXACTFL
1310 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1311 AV *trie_words = NULL;
1312 /* along with revcharmap, this only used during construction but both are
1313 * useful during debugging so we store them in the struct when debugging.
1316 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1317 STRLEN trie_charcount=0;
1319 SV *re_trie_maxbuff;
1320 GET_RE_DEBUG_FLAGS_DECL;
1322 PERL_UNUSED_ARG(depth);
1325 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1327 trie->startstate = 1;
1328 trie->wordcount = word_count;
1329 RExC_rxi->data->data[ data_slot ] = (void*)trie;
1330 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1331 if (!(UTF && folder))
1332 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1334 trie_words = newAV();
1337 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1338 if (!SvIOK(re_trie_maxbuff)) {
1339 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1342 PerlIO_printf( Perl_debug_log,
1343 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1344 (int)depth * 2 + 2, "",
1345 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1346 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1350 /* Find the node we are going to overwrite */
1351 if ( first == startbranch && OP( last ) != BRANCH ) {
1352 /* whole branch chain */
1355 /* branch sub-chain */
1356 convert = NEXTOPER( first );
1359 /* -- First loop and Setup --
1361 We first traverse the branches and scan each word to determine if it
1362 contains widechars, and how many unique chars there are, this is
1363 important as we have to build a table with at least as many columns as we
1366 We use an array of integers to represent the character codes 0..255
1367 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1368 native representation of the character value as the key and IV's for the
1371 *TODO* If we keep track of how many times each character is used we can
1372 remap the columns so that the table compression later on is more
1373 efficient in terms of memory by ensuring most common value is in the
1374 middle and the least common are on the outside. IMO this would be better
1375 than a most to least common mapping as theres a decent chance the most
1376 common letter will share a node with the least common, meaning the node
1377 will not be compressable. With a middle is most common approach the worst
1378 case is when we have the least common nodes twice.
1382 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1383 regnode * const noper = NEXTOPER( cur );
1384 const U8 *uc = (U8*)STRING( noper );
1385 const U8 * const e = uc + STR_LEN( noper );
1387 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1388 const U8 *scan = (U8*)NULL;
1389 U32 wordlen = 0; /* required init */
1391 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1393 if (OP(noper) == NOTHING) {
1397 if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1398 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1399 regardless of encoding */
1401 for ( ; uc < e ; uc += len ) {
1402 TRIE_CHARCOUNT(trie)++;
1406 if ( !trie->charmap[ uvc ] ) {
1407 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1409 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1413 /* store the codepoint in the bitmap, and if its ascii
1414 also store its folded equivelent. */
1415 TRIE_BITMAP_SET(trie,uvc);
1417 /* store the folded codepoint */
1418 if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1421 /* store first byte of utf8 representation of
1422 codepoints in the 127 < uvc < 256 range */
1423 if (127 < uvc && uvc < 192) {
1424 TRIE_BITMAP_SET(trie,194);
1425 } else if (191 < uvc ) {
1426 TRIE_BITMAP_SET(trie,195);
1427 /* && uvc < 256 -- we know uvc is < 256 already */
1430 set_bit = 0; /* We've done our bit :-) */
1435 widecharmap = newHV();
1437 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1440 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1442 if ( !SvTRUE( *svpp ) ) {
1443 sv_setiv( *svpp, ++trie->uniquecharcount );
1448 if( cur == first ) {
1451 } else if (chars < trie->minlen) {
1453 } else if (chars > trie->maxlen) {
1457 } /* end first pass */
1458 DEBUG_TRIE_COMPILE_r(
1459 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1460 (int)depth * 2 + 2,"",
1461 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1462 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1463 (int)trie->minlen, (int)trie->maxlen )
1465 trie->wordlen = (U32 *) PerlMemShared_calloc( word_count, sizeof(U32) );
1468 We now know what we are dealing with in terms of unique chars and
1469 string sizes so we can calculate how much memory a naive
1470 representation using a flat table will take. If it's over a reasonable
1471 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1472 conservative but potentially much slower representation using an array
1475 At the end we convert both representations into the same compressed
1476 form that will be used in regexec.c for matching with. The latter
1477 is a form that cannot be used to construct with but has memory
1478 properties similar to the list form and access properties similar
1479 to the table form making it both suitable for fast searches and
1480 small enough that its feasable to store for the duration of a program.
1482 See the comment in the code where the compressed table is produced
1483 inplace from the flat tabe representation for an explanation of how
1484 the compression works.
1489 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1491 Second Pass -- Array Of Lists Representation
1493 Each state will be represented by a list of charid:state records
1494 (reg_trie_trans_le) the first such element holds the CUR and LEN
1495 points of the allocated array. (See defines above).
1497 We build the initial structure using the lists, and then convert
1498 it into the compressed table form which allows faster lookups
1499 (but cant be modified once converted).
1502 STRLEN transcount = 1;
1504 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1505 "%*sCompiling trie using list compiler\n",
1506 (int)depth * 2 + 2, ""));
1508 trie->states = (reg_trie_state *)
1509 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1510 sizeof(reg_trie_state) );
1514 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1516 regnode * const noper = NEXTOPER( cur );
1517 U8 *uc = (U8*)STRING( noper );
1518 const U8 * const e = uc + STR_LEN( noper );
1519 U32 state = 1; /* required init */
1520 U16 charid = 0; /* sanity init */
1521 U8 *scan = (U8*)NULL; /* sanity init */
1522 STRLEN foldlen = 0; /* required init */
1523 U32 wordlen = 0; /* required init */
1524 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1526 if (OP(noper) != NOTHING) {
1527 for ( ; uc < e ; uc += len ) {
1532 charid = trie->charmap[ uvc ];
1534 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1538 charid=(U16)SvIV( *svpp );
1541 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1548 if ( !trie->states[ state ].trans.list ) {
1549 TRIE_LIST_NEW( state );
1551 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1552 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1553 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1558 newstate = next_alloc++;
1559 TRIE_LIST_PUSH( state, charid, newstate );
1564 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1568 TRIE_HANDLE_WORD(state);
1570 } /* end second pass */
1572 /* next alloc is the NEXT state to be allocated */
1573 trie->statecount = next_alloc;
1574 trie->states = (reg_trie_state *)
1575 PerlMemShared_realloc( trie->states,
1577 * sizeof(reg_trie_state) );
1579 /* and now dump it out before we compress it */
1580 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1581 revcharmap, next_alloc,
1585 trie->trans = (reg_trie_trans *)
1586 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1593 for( state=1 ; state < next_alloc ; state ++ ) {
1597 DEBUG_TRIE_COMPILE_MORE_r(
1598 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1602 if (trie->states[state].trans.list) {
1603 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1607 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1608 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1609 if ( forid < minid ) {
1611 } else if ( forid > maxid ) {
1615 if ( transcount < tp + maxid - minid + 1) {
1617 trie->trans = (reg_trie_trans *)
1618 PerlMemShared_realloc( trie->trans,
1620 * sizeof(reg_trie_trans) );
1621 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1623 base = trie->uniquecharcount + tp - minid;
1624 if ( maxid == minid ) {
1626 for ( ; zp < tp ; zp++ ) {
1627 if ( ! trie->trans[ zp ].next ) {
1628 base = trie->uniquecharcount + zp - minid;
1629 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1630 trie->trans[ zp ].check = state;
1636 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1637 trie->trans[ tp ].check = state;
1642 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1643 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1644 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1645 trie->trans[ tid ].check = state;
1647 tp += ( maxid - minid + 1 );
1649 Safefree(trie->states[ state ].trans.list);
1652 DEBUG_TRIE_COMPILE_MORE_r(
1653 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1656 trie->states[ state ].trans.base=base;
1658 trie->lasttrans = tp + 1;
1662 Second Pass -- Flat Table Representation.
1664 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1665 We know that we will need Charcount+1 trans at most to store the data
1666 (one row per char at worst case) So we preallocate both structures
1667 assuming worst case.
1669 We then construct the trie using only the .next slots of the entry
1672 We use the .check field of the first entry of the node temporarily to
1673 make compression both faster and easier by keeping track of how many non
1674 zero fields are in the node.
1676 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1679 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1680 number representing the first entry of the node, and state as a
1681 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1682 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1683 are 2 entrys per node. eg:
1691 The table is internally in the right hand, idx form. However as we also
1692 have to deal with the states array which is indexed by nodenum we have to
1693 use TRIE_NODENUM() to convert.
1696 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1697 "%*sCompiling trie using table compiler\n",
1698 (int)depth * 2 + 2, ""));
1700 trie->trans = (reg_trie_trans *)
1701 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1702 * trie->uniquecharcount + 1,
1703 sizeof(reg_trie_trans) );
1704 trie->states = (reg_trie_state *)
1705 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1706 sizeof(reg_trie_state) );
1707 next_alloc = trie->uniquecharcount + 1;
1710 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1712 regnode * const noper = NEXTOPER( cur );
1713 const U8 *uc = (U8*)STRING( noper );
1714 const U8 * const e = uc + STR_LEN( noper );
1716 U32 state = 1; /* required init */
1718 U16 charid = 0; /* sanity init */
1719 U32 accept_state = 0; /* sanity init */
1720 U8 *scan = (U8*)NULL; /* sanity init */
1722 STRLEN foldlen = 0; /* required init */
1723 U32 wordlen = 0; /* required init */
1724 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1726 if ( OP(noper) != NOTHING ) {
1727 for ( ; uc < e ; uc += len ) {
1732 charid = trie->charmap[ uvc ];
1734 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1735 charid = svpp ? (U16)SvIV(*svpp) : 0;
1739 if ( !trie->trans[ state + charid ].next ) {
1740 trie->trans[ state + charid ].next = next_alloc;
1741 trie->trans[ state ].check++;
1742 next_alloc += trie->uniquecharcount;
1744 state = trie->trans[ state + charid ].next;
1746 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1748 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1751 accept_state = TRIE_NODENUM( state );
1752 TRIE_HANDLE_WORD(accept_state);
1754 } /* end second pass */
1756 /* and now dump it out before we compress it */
1757 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1759 next_alloc, depth+1));
1763 * Inplace compress the table.*
1765 For sparse data sets the table constructed by the trie algorithm will
1766 be mostly 0/FAIL transitions or to put it another way mostly empty.
1767 (Note that leaf nodes will not contain any transitions.)
1769 This algorithm compresses the tables by eliminating most such
1770 transitions, at the cost of a modest bit of extra work during lookup:
1772 - Each states[] entry contains a .base field which indicates the
1773 index in the state[] array wheres its transition data is stored.
1775 - If .base is 0 there are no valid transitions from that node.
1777 - If .base is nonzero then charid is added to it to find an entry in
1780 -If trans[states[state].base+charid].check!=state then the
1781 transition is taken to be a 0/Fail transition. Thus if there are fail
1782 transitions at the front of the node then the .base offset will point
1783 somewhere inside the previous nodes data (or maybe even into a node
1784 even earlier), but the .check field determines if the transition is
1788 The following process inplace converts the table to the compressed
1789 table: We first do not compress the root node 1,and mark its all its
1790 .check pointers as 1 and set its .base pointer as 1 as well. This
1791 allows to do a DFA construction from the compressed table later, and
1792 ensures that any .base pointers we calculate later are greater than
1795 - We set 'pos' to indicate the first entry of the second node.
1797 - We then iterate over the columns of the node, finding the first and
1798 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1799 and set the .check pointers accordingly, and advance pos
1800 appropriately and repreat for the next node. Note that when we copy
1801 the next pointers we have to convert them from the original
1802 NODEIDX form to NODENUM form as the former is not valid post
1805 - If a node has no transitions used we mark its base as 0 and do not
1806 advance the pos pointer.
1808 - If a node only has one transition we use a second pointer into the
1809 structure to fill in allocated fail transitions from other states.
1810 This pointer is independent of the main pointer and scans forward
1811 looking for null transitions that are allocated to a state. When it
1812 finds one it writes the single transition into the "hole". If the
1813 pointer doesnt find one the single transition is appended as normal.
1815 - Once compressed we can Renew/realloc the structures to release the
1818 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1819 specifically Fig 3.47 and the associated pseudocode.
1823 const U32 laststate = TRIE_NODENUM( next_alloc );
1826 trie->statecount = laststate;
1828 for ( state = 1 ; state < laststate ; state++ ) {
1830 const U32 stateidx = TRIE_NODEIDX( state );
1831 const U32 o_used = trie->trans[ stateidx ].check;
1832 U32 used = trie->trans[ stateidx ].check;
1833 trie->trans[ stateidx ].check = 0;
1835 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1836 if ( flag || trie->trans[ stateidx + charid ].next ) {
1837 if ( trie->trans[ stateidx + charid ].next ) {
1839 for ( ; zp < pos ; zp++ ) {
1840 if ( ! trie->trans[ zp ].next ) {
1844 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1845 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1846 trie->trans[ zp ].check = state;
1847 if ( ++zp > pos ) pos = zp;
1854 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1856 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1857 trie->trans[ pos ].check = state;
1862 trie->lasttrans = pos + 1;
1863 trie->states = (reg_trie_state *)
1864 PerlMemShared_realloc( trie->states, laststate
1865 * sizeof(reg_trie_state) );
1866 DEBUG_TRIE_COMPILE_MORE_r(
1867 PerlIO_printf( Perl_debug_log,
1868 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1869 (int)depth * 2 + 2,"",
1870 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1873 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1876 } /* end table compress */
1878 DEBUG_TRIE_COMPILE_MORE_r(
1879 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1880 (int)depth * 2 + 2, "",
1881 (UV)trie->statecount,
1882 (UV)trie->lasttrans)
1884 /* resize the trans array to remove unused space */
1885 trie->trans = (reg_trie_trans *)
1886 PerlMemShared_realloc( trie->trans, trie->lasttrans
1887 * sizeof(reg_trie_trans) );
1889 /* and now dump out the compressed format */
1890 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
1892 { /* Modify the program and insert the new TRIE node*/
1893 U8 nodetype =(U8)(flags & 0xFF);
1897 regnode *optimize = NULL;
1898 #ifdef RE_TRACK_PATTERN_OFFSETS
1901 U32 mjd_nodelen = 0;
1902 #endif /* RE_TRACK_PATTERN_OFFSETS */
1903 #endif /* DEBUGGING */
1905 This means we convert either the first branch or the first Exact,
1906 depending on whether the thing following (in 'last') is a branch
1907 or not and whther first is the startbranch (ie is it a sub part of
1908 the alternation or is it the whole thing.)
1909 Assuming its a sub part we conver the EXACT otherwise we convert
1910 the whole branch sequence, including the first.
1912 /* Find the node we are going to overwrite */
1913 if ( first != startbranch || OP( last ) == BRANCH ) {
1914 /* branch sub-chain */
1915 NEXT_OFF( first ) = (U16)(last - first);
1916 #ifdef RE_TRACK_PATTERN_OFFSETS
1918 mjd_offset= Node_Offset((convert));
1919 mjd_nodelen= Node_Length((convert));
1922 /* whole branch chain */
1924 #ifdef RE_TRACK_PATTERN_OFFSETS
1927 const regnode *nop = NEXTOPER( convert );
1928 mjd_offset= Node_Offset((nop));
1929 mjd_nodelen= Node_Length((nop));
1933 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1934 (int)depth * 2 + 2, "",
1935 (UV)mjd_offset, (UV)mjd_nodelen)
1938 /* But first we check to see if there is a common prefix we can
1939 split out as an EXACT and put in front of the TRIE node. */
1940 trie->startstate= 1;
1941 if ( trie->bitmap && !widecharmap && !trie->jump ) {
1943 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
1947 const U32 base = trie->states[ state ].trans.base;
1949 if ( trie->states[state].wordnum )
1952 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1953 if ( ( base + ofs >= trie->uniquecharcount ) &&
1954 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1955 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1957 if ( ++count > 1 ) {
1958 SV **tmp = av_fetch( revcharmap, ofs, 0);
1959 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
1960 if ( state == 1 ) break;
1962 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1964 PerlIO_printf(Perl_debug_log,
1965 "%*sNew Start State=%"UVuf" Class: [",
1966 (int)depth * 2 + 2, "",
1969 SV ** const tmp = av_fetch( revcharmap, idx, 0);
1970 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
1972 TRIE_BITMAP_SET(trie,*ch);
1974 TRIE_BITMAP_SET(trie, folder[ *ch ]);
1976 PerlIO_printf(Perl_debug_log, (char*)ch)
1980 TRIE_BITMAP_SET(trie,*ch);
1982 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1983 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1989 SV **tmp = av_fetch( revcharmap, idx, 0);
1991 char *ch = SvPV( *tmp, len );
1993 SV *sv=sv_newmortal();
1994 PerlIO_printf( Perl_debug_log,
1995 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1996 (int)depth * 2 + 2, "",
1998 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
1999 PL_colors[0], PL_colors[1],
2000 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2001 PERL_PV_ESCAPE_FIRSTCHAR
2006 OP( convert ) = nodetype;
2007 str=STRING(convert);
2010 STR_LEN(convert) += len;
2016 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2022 regnode *n = convert+NODE_SZ_STR(convert);
2023 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2024 trie->startstate = state;
2025 trie->minlen -= (state - 1);
2026 trie->maxlen -= (state - 1);
2028 regnode *fix = convert;
2029 U32 word = trie->wordcount;
2031 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2032 while( ++fix < n ) {
2033 Set_Node_Offset_Length(fix, 0, 0);
2036 SV ** const tmp = av_fetch( trie_words, word, 0 );
2038 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2039 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2041 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2048 NEXT_OFF(convert) = (U16)(tail - convert);
2049 DEBUG_r(optimize= n);
2055 if ( trie->maxlen ) {
2056 NEXT_OFF( convert ) = (U16)(tail - convert);
2057 ARG_SET( convert, data_slot );
2058 /* Store the offset to the first unabsorbed branch in
2059 jump[0], which is otherwise unused by the jump logic.
2060 We use this when dumping a trie and during optimisation. */
2062 trie->jump[0] = (U16)(nextbranch - convert);
2065 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
2066 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2068 OP( convert ) = TRIEC;
2069 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2070 PerlMemShared_free(trie->bitmap);
2073 OP( convert ) = TRIE;
2075 /* store the type in the flags */
2076 convert->flags = nodetype;
2080 + regarglen[ OP( convert ) ];
2082 /* XXX We really should free up the resource in trie now,
2083 as we won't use them - (which resources?) dmq */
2085 /* needed for dumping*/
2086 DEBUG_r(if (optimize) {
2087 regnode *opt = convert;
2089 while ( ++opt < optimize) {
2090 Set_Node_Offset_Length(opt,0,0);
2093 Try to clean up some of the debris left after the
2096 while( optimize < jumper ) {
2097 mjd_nodelen += Node_Length((optimize));
2098 OP( optimize ) = OPTIMIZED;
2099 Set_Node_Offset_Length(optimize,0,0);
2102 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2104 } /* end node insert */
2105 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2107 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2108 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2110 SvREFCNT_dec(revcharmap);
2114 : trie->startstate>1
2120 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2122 /* The Trie is constructed and compressed now so we can build a fail array now if its needed
2124 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2125 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2128 We find the fail state for each state in the trie, this state is the longest proper
2129 suffix of the current states 'word' that is also a proper prefix of another word in our
2130 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2131 the DFA not to have to restart after its tried and failed a word at a given point, it
2132 simply continues as though it had been matching the other word in the first place.
2134 'abcdgu'=~/abcdefg|cdgu/
2135 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2136 fail, which would bring use to the state representing 'd' in the second word where we would
2137 try 'g' and succeed, prodceding to match 'cdgu'.
2139 /* add a fail transition */
2140 const U32 trie_offset = ARG(source);
2141 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2143 const U32 ucharcount = trie->uniquecharcount;
2144 const U32 numstates = trie->statecount;
2145 const U32 ubound = trie->lasttrans + ucharcount;
2149 U32 base = trie->states[ 1 ].trans.base;
2152 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2153 GET_RE_DEBUG_FLAGS_DECL;
2155 PERL_UNUSED_ARG(depth);
2159 ARG_SET( stclass, data_slot );
2160 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2161 RExC_rxi->data->data[ data_slot ] = (void*)aho;
2162 aho->trie=trie_offset;
2163 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2164 Copy( trie->states, aho->states, numstates, reg_trie_state );
2165 Newxz( q, numstates, U32);
2166 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2169 /* initialize fail[0..1] to be 1 so that we always have
2170 a valid final fail state */
2171 fail[ 0 ] = fail[ 1 ] = 1;
2173 for ( charid = 0; charid < ucharcount ; charid++ ) {
2174 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2176 q[ q_write ] = newstate;
2177 /* set to point at the root */
2178 fail[ q[ q_write++ ] ]=1;
2181 while ( q_read < q_write) {
2182 const U32 cur = q[ q_read++ % numstates ];
2183 base = trie->states[ cur ].trans.base;
2185 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2186 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2188 U32 fail_state = cur;
2191 fail_state = fail[ fail_state ];
2192 fail_base = aho->states[ fail_state ].trans.base;
2193 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2195 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2196 fail[ ch_state ] = fail_state;
2197 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2199 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2201 q[ q_write++ % numstates] = ch_state;
2205 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2206 when we fail in state 1, this allows us to use the
2207 charclass scan to find a valid start char. This is based on the principle
2208 that theres a good chance the string being searched contains lots of stuff
2209 that cant be a start char.
2211 fail[ 0 ] = fail[ 1 ] = 0;
2212 DEBUG_TRIE_COMPILE_r({
2213 PerlIO_printf(Perl_debug_log,
2214 "%*sStclass Failtable (%"UVuf" states): 0",
2215 (int)(depth * 2), "", (UV)numstates
2217 for( q_read=1; q_read<numstates; q_read++ ) {
2218 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2220 PerlIO_printf(Perl_debug_log, "\n");
2223 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2228 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2229 * These need to be revisited when a newer toolchain becomes available.
2231 #if defined(__sparc64__) && defined(__GNUC__)
2232 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2233 # undef SPARC64_GCC_WORKAROUND
2234 # define SPARC64_GCC_WORKAROUND 1
2238 #define DEBUG_PEEP(str,scan,depth) \
2239 DEBUG_OPTIMISE_r({if (scan){ \
2240 SV * const mysv=sv_newmortal(); \
2241 regnode *Next = regnext(scan); \
2242 regprop(RExC_rx, mysv, scan); \
2243 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2244 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2245 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2252 #define JOIN_EXACT(scan,min,flags) \
2253 if (PL_regkind[OP(scan)] == EXACT) \
2254 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2257 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2258 /* Merge several consecutive EXACTish nodes into one. */
2259 regnode *n = regnext(scan);
2261 regnode *next = scan + NODE_SZ_STR(scan);
2265 regnode *stop = scan;
2266 GET_RE_DEBUG_FLAGS_DECL;
2268 PERL_UNUSED_ARG(depth);
2270 #ifndef EXPERIMENTAL_INPLACESCAN
2271 PERL_UNUSED_ARG(flags);
2272 PERL_UNUSED_ARG(val);
2274 DEBUG_PEEP("join",scan,depth);
2276 /* Skip NOTHING, merge EXACT*. */
2278 ( PL_regkind[OP(n)] == NOTHING ||
2279 (stringok && (OP(n) == OP(scan))))
2281 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2283 if (OP(n) == TAIL || n > next)
2285 if (PL_regkind[OP(n)] == NOTHING) {
2286 DEBUG_PEEP("skip:",n,depth);
2287 NEXT_OFF(scan) += NEXT_OFF(n);
2288 next = n + NODE_STEP_REGNODE;
2295 else if (stringok) {
2296 const unsigned int oldl = STR_LEN(scan);
2297 regnode * const nnext = regnext(n);
2299 DEBUG_PEEP("merg",n,depth);
2302 if (oldl + STR_LEN(n) > U8_MAX)
2304 NEXT_OFF(scan) += NEXT_OFF(n);
2305 STR_LEN(scan) += STR_LEN(n);
2306 next = n + NODE_SZ_STR(n);
2307 /* Now we can overwrite *n : */
2308 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2316 #ifdef EXPERIMENTAL_INPLACESCAN
2317 if (flags && !NEXT_OFF(n)) {
2318 DEBUG_PEEP("atch", val, depth);
2319 if (reg_off_by_arg[OP(n)]) {
2320 ARG_SET(n, val - n);
2323 NEXT_OFF(n) = val - n;
2330 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2332 Two problematic code points in Unicode casefolding of EXACT nodes:
2334 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2335 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2341 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2342 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2344 This means that in case-insensitive matching (or "loose matching",
2345 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2346 length of the above casefolded versions) can match a target string
2347 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2348 This would rather mess up the minimum length computation.
2350 What we'll do is to look for the tail four bytes, and then peek
2351 at the preceding two bytes to see whether we need to decrease
2352 the minimum length by four (six minus two).
2354 Thanks to the design of UTF-8, there cannot be false matches:
2355 A sequence of valid UTF-8 bytes cannot be a subsequence of
2356 another valid sequence of UTF-8 bytes.
2359 char * const s0 = STRING(scan), *s, *t;
2360 char * const s1 = s0 + STR_LEN(scan) - 1;
2361 char * const s2 = s1 - 4;
2362 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2363 const char t0[] = "\xaf\x49\xaf\x42";
2365 const char t0[] = "\xcc\x88\xcc\x81";
2367 const char * const t1 = t0 + 3;
2370 s < s2 && (t = ninstr(s, s1, t0, t1));
2373 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2374 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2376 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2377 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2385 n = scan + NODE_SZ_STR(scan);
2387 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2394 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2398 /* REx optimizer. Converts nodes into quickier variants "in place".
2399 Finds fixed substrings. */
2401 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2402 to the position after last scanned or to NULL. */
2404 #define INIT_AND_WITHP \
2405 assert(!and_withp); \
2406 Newx(and_withp,1,struct regnode_charclass_class); \
2407 SAVEFREEPV(and_withp)
2409 /* this is a chain of data about sub patterns we are processing that
2410 need to be handled seperately/specially in study_chunk. Its so
2411 we can simulate recursion without losing state. */
2413 typedef struct scan_frame {
2414 regnode *last; /* last node to process in this frame */
2415 regnode *next; /* next node to process when last is reached */
2416 struct scan_frame *prev; /*previous frame*/
2417 I32 stop; /* what stopparen do we use */
2421 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2423 #define CASE_SYNST_FNC(nAmE) \
2425 if (flags & SCF_DO_STCLASS_AND) { \
2426 for (value = 0; value < 256; value++) \
2427 if (!is_ ## nAmE ## _cp(value)) \
2428 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2431 for (value = 0; value < 256; value++) \
2432 if (is_ ## nAmE ## _cp(value)) \
2433 ANYOF_BITMAP_SET(data->start_class, value); \
2437 if (flags & SCF_DO_STCLASS_AND) { \
2438 for (value = 0; value < 256; value++) \
2439 if (is_ ## nAmE ## _cp(value)) \
2440 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2443 for (value = 0; value < 256; value++) \
2444 if (!is_ ## nAmE ## _cp(value)) \
2445 ANYOF_BITMAP_SET(data->start_class, value); \
2452 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2453 I32 *minlenp, I32 *deltap,
2458 struct regnode_charclass_class *and_withp,
2459 U32 flags, U32 depth)
2460 /* scanp: Start here (read-write). */
2461 /* deltap: Write maxlen-minlen here. */
2462 /* last: Stop before this one. */
2463 /* data: string data about the pattern */
2464 /* stopparen: treat close N as END */
2465 /* recursed: which subroutines have we recursed into */
2466 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2469 I32 min = 0, pars = 0, code;
2470 regnode *scan = *scanp, *next;
2472 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2473 int is_inf_internal = 0; /* The studied chunk is infinite */
2474 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2475 scan_data_t data_fake;
2476 SV *re_trie_maxbuff = NULL;
2477 regnode *first_non_open = scan;
2478 I32 stopmin = I32_MAX;
2479 scan_frame *frame = NULL;
2481 GET_RE_DEBUG_FLAGS_DECL;
2484 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2488 while (first_non_open && OP(first_non_open) == OPEN)
2489 first_non_open=regnext(first_non_open);
2494 while ( scan && OP(scan) != END && scan < last ){
2495 /* Peephole optimizer: */
2496 DEBUG_STUDYDATA("Peep:", data,depth);
2497 DEBUG_PEEP("Peep",scan,depth);
2498 JOIN_EXACT(scan,&min,0);
2500 /* Follow the next-chain of the current node and optimize
2501 away all the NOTHINGs from it. */
2502 if (OP(scan) != CURLYX) {
2503 const int max = (reg_off_by_arg[OP(scan)]
2505 /* I32 may be smaller than U16 on CRAYs! */
2506 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2507 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2511 /* Skip NOTHING and LONGJMP. */
2512 while ((n = regnext(n))
2513 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2514 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2515 && off + noff < max)
2517 if (reg_off_by_arg[OP(scan)])
2520 NEXT_OFF(scan) = off;
2525 /* The principal pseudo-switch. Cannot be a switch, since we
2526 look into several different things. */
2527 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2528 || OP(scan) == IFTHEN) {
2529 next = regnext(scan);
2531 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2533 if (OP(next) == code || code == IFTHEN) {
2534 /* NOTE - There is similar code to this block below for handling
2535 TRIE nodes on a re-study. If you change stuff here check there
2537 I32 max1 = 0, min1 = I32_MAX, num = 0;
2538 struct regnode_charclass_class accum;
2539 regnode * const startbranch=scan;
2541 if (flags & SCF_DO_SUBSTR)
2542 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2543 if (flags & SCF_DO_STCLASS)
2544 cl_init_zero(pRExC_state, &accum);
2546 while (OP(scan) == code) {
2547 I32 deltanext, minnext, f = 0, fake;
2548 struct regnode_charclass_class this_class;
2551 data_fake.flags = 0;
2553 data_fake.whilem_c = data->whilem_c;
2554 data_fake.last_closep = data->last_closep;
2557 data_fake.last_closep = &fake;
2559 data_fake.pos_delta = delta;
2560 next = regnext(scan);
2561 scan = NEXTOPER(scan);
2563 scan = NEXTOPER(scan);
2564 if (flags & SCF_DO_STCLASS) {
2565 cl_init(pRExC_state, &this_class);
2566 data_fake.start_class = &this_class;
2567 f = SCF_DO_STCLASS_AND;
2569 if (flags & SCF_WHILEM_VISITED_POS)
2570 f |= SCF_WHILEM_VISITED_POS;
2572 /* we suppose the run is continuous, last=next...*/
2573 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2575 stopparen, recursed, NULL, f,depth+1);
2578 if (max1 < minnext + deltanext)
2579 max1 = minnext + deltanext;
2580 if (deltanext == I32_MAX)
2581 is_inf = is_inf_internal = 1;
2583 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2585 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2586 if ( stopmin > minnext)
2587 stopmin = min + min1;
2588 flags &= ~SCF_DO_SUBSTR;
2590 data->flags |= SCF_SEEN_ACCEPT;
2593 if (data_fake.flags & SF_HAS_EVAL)
2594 data->flags |= SF_HAS_EVAL;
2595 data->whilem_c = data_fake.whilem_c;
2597 if (flags & SCF_DO_STCLASS)
2598 cl_or(pRExC_state, &accum, &this_class);
2600 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2602 if (flags & SCF_DO_SUBSTR) {
2603 data->pos_min += min1;
2604 data->pos_delta += max1 - min1;
2605 if (max1 != min1 || is_inf)
2606 data->longest = &(data->longest_float);
2609 delta += max1 - min1;
2610 if (flags & SCF_DO_STCLASS_OR) {
2611 cl_or(pRExC_state, data->start_class, &accum);
2613 cl_and(data->start_class, and_withp);
2614 flags &= ~SCF_DO_STCLASS;
2617 else if (flags & SCF_DO_STCLASS_AND) {
2619 cl_and(data->start_class, &accum);
2620 flags &= ~SCF_DO_STCLASS;
2623 /* Switch to OR mode: cache the old value of
2624 * data->start_class */
2626 StructCopy(data->start_class, and_withp,
2627 struct regnode_charclass_class);
2628 flags &= ~SCF_DO_STCLASS_AND;
2629 StructCopy(&accum, data->start_class,
2630 struct regnode_charclass_class);
2631 flags |= SCF_DO_STCLASS_OR;
2632 data->start_class->flags |= ANYOF_EOS;
2636 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2639 Assuming this was/is a branch we are dealing with: 'scan' now
2640 points at the item that follows the branch sequence, whatever
2641 it is. We now start at the beginning of the sequence and look
2648 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2650 If we can find such a subseqence we need to turn the first
2651 element into a trie and then add the subsequent branch exact
2652 strings to the trie.
2656 1. patterns where the whole set of branch can be converted.
2658 2. patterns where only a subset can be converted.
2660 In case 1 we can replace the whole set with a single regop
2661 for the trie. In case 2 we need to keep the start and end
2664 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2665 becomes BRANCH TRIE; BRANCH X;
2667 There is an additional case, that being where there is a
2668 common prefix, which gets split out into an EXACT like node
2669 preceding the TRIE node.
2671 If x(1..n)==tail then we can do a simple trie, if not we make
2672 a "jump" trie, such that when we match the appropriate word
2673 we "jump" to the appopriate tail node. Essentailly we turn
2674 a nested if into a case structure of sorts.
2679 if (!re_trie_maxbuff) {
2680 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2681 if (!SvIOK(re_trie_maxbuff))
2682 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2684 if ( SvIV(re_trie_maxbuff)>=0 ) {
2686 regnode *first = (regnode *)NULL;
2687 regnode *last = (regnode *)NULL;
2688 regnode *tail = scan;
2693 SV * const mysv = sv_newmortal(); /* for dumping */
2695 /* var tail is used because there may be a TAIL
2696 regop in the way. Ie, the exacts will point to the
2697 thing following the TAIL, but the last branch will
2698 point at the TAIL. So we advance tail. If we
2699 have nested (?:) we may have to move through several
2703 while ( OP( tail ) == TAIL ) {
2704 /* this is the TAIL generated by (?:) */
2705 tail = regnext( tail );
2710 regprop(RExC_rx, mysv, tail );
2711 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2712 (int)depth * 2 + 2, "",
2713 "Looking for TRIE'able sequences. Tail node is: ",
2714 SvPV_nolen_const( mysv )
2720 step through the branches, cur represents each
2721 branch, noper is the first thing to be matched
2722 as part of that branch and noper_next is the
2723 regnext() of that node. if noper is an EXACT
2724 and noper_next is the same as scan (our current
2725 position in the regex) then the EXACT branch is
2726 a possible optimization target. Once we have
2727 two or more consequetive such branches we can
2728 create a trie of the EXACT's contents and stich
2729 it in place. If the sequence represents all of
2730 the branches we eliminate the whole thing and
2731 replace it with a single TRIE. If it is a
2732 subsequence then we need to stitch it in. This
2733 means the first branch has to remain, and needs
2734 to be repointed at the item on the branch chain
2735 following the last branch optimized. This could
2736 be either a BRANCH, in which case the
2737 subsequence is internal, or it could be the
2738 item following the branch sequence in which
2739 case the subsequence is at the end.
2743 /* dont use tail as the end marker for this traverse */
2744 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2745 regnode * const noper = NEXTOPER( cur );
2746 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
2747 regnode * const noper_next = regnext( noper );
2751 regprop(RExC_rx, mysv, cur);
2752 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2753 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2755 regprop(RExC_rx, mysv, noper);
2756 PerlIO_printf( Perl_debug_log, " -> %s",
2757 SvPV_nolen_const(mysv));
2760 regprop(RExC_rx, mysv, noper_next );
2761 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2762 SvPV_nolen_const(mysv));
2764 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2765 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2767 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2768 : PL_regkind[ OP( noper ) ] == EXACT )
2769 || OP(noper) == NOTHING )
2771 && noper_next == tail
2776 if ( !first || optype == NOTHING ) {
2777 if (!first) first = cur;
2778 optype = OP( noper );
2784 make_trie( pRExC_state,
2785 startbranch, first, cur, tail, count,
2788 if ( PL_regkind[ OP( noper ) ] == EXACT
2790 && noper_next == tail
2795 optype = OP( noper );
2805 regprop(RExC_rx, mysv, cur);
2806 PerlIO_printf( Perl_debug_log,
2807 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2808 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2812 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
2813 #ifdef TRIE_STUDY_OPT
2814 if ( ((made == MADE_EXACT_TRIE &&
2815 startbranch == first)
2816 || ( first_non_open == first )) &&
2818 flags |= SCF_TRIE_RESTUDY;
2819 if ( startbranch == first
2822 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2832 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2833 scan = NEXTOPER(NEXTOPER(scan));
2834 } else /* single branch is optimized. */
2835 scan = NEXTOPER(scan);
2837 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2838 scan_frame *newframe = NULL;
2843 if (OP(scan) != SUSPEND) {
2844 /* set the pointer */
2845 if (OP(scan) == GOSUB) {
2847 RExC_recurse[ARG2L(scan)] = scan;
2848 start = RExC_open_parens[paren-1];
2849 end = RExC_close_parens[paren-1];
2852 start = RExC_rxi->program + 1;
2856 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2857 SAVEFREEPV(recursed);
2859 if (!PAREN_TEST(recursed,paren+1)) {
2860 PAREN_SET(recursed,paren+1);
2861 Newx(newframe,1,scan_frame);
2863 if (flags & SCF_DO_SUBSTR) {
2864 SCAN_COMMIT(pRExC_state,data,minlenp);
2865 data->longest = &(data->longest_float);
2867 is_inf = is_inf_internal = 1;
2868 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2869 cl_anything(pRExC_state, data->start_class);
2870 flags &= ~SCF_DO_STCLASS;
2873 Newx(newframe,1,scan_frame);
2876 end = regnext(scan);
2881 SAVEFREEPV(newframe);
2882 newframe->next = regnext(scan);
2883 newframe->last = last;
2884 newframe->stop = stopparen;
2885 newframe->prev = frame;
2895 else if (OP(scan) == EXACT) {
2896 I32 l = STR_LEN(scan);
2899 const U8 * const s = (U8*)STRING(scan);
2900 l = utf8_length(s, s + l);
2901 uc = utf8_to_uvchr(s, NULL);
2903 uc = *((U8*)STRING(scan));
2906 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2907 /* The code below prefers earlier match for fixed
2908 offset, later match for variable offset. */
2909 if (data->last_end == -1) { /* Update the start info. */
2910 data->last_start_min = data->pos_min;
2911 data->last_start_max = is_inf
2912 ? I32_MAX : data->pos_min + data->pos_delta;
2914 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2916 SvUTF8_on(data->last_found);
2918 SV * const sv = data->last_found;
2919 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2920 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2921 if (mg && mg->mg_len >= 0)
2922 mg->mg_len += utf8_length((U8*)STRING(scan),
2923 (U8*)STRING(scan)+STR_LEN(scan));
2925 data->last_end = data->pos_min + l;
2926 data->pos_min += l; /* As in the first entry. */
2927 data->flags &= ~SF_BEFORE_EOL;
2929 if (flags & SCF_DO_STCLASS_AND) {
2930 /* Check whether it is compatible with what we know already! */
2934 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2935 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2936 && (!(data->start_class->flags & ANYOF_FOLD)
2937 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2940 ANYOF_CLASS_ZERO(data->start_class);
2941 ANYOF_BITMAP_ZERO(data->start_class);
2943 ANYOF_BITMAP_SET(data->start_class, uc);
2944 data->start_class->flags &= ~ANYOF_EOS;
2946 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2948 else if (flags & SCF_DO_STCLASS_OR) {
2949 /* false positive possible if the class is case-folded */
2951 ANYOF_BITMAP_SET(data->start_class, uc);
2953 data->start_class->flags |= ANYOF_UNICODE_ALL;
2954 data->start_class->flags &= ~ANYOF_EOS;
2955 cl_and(data->start_class, and_withp);
2957 flags &= ~SCF_DO_STCLASS;
2959 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
2960 I32 l = STR_LEN(scan);
2961 UV uc = *((U8*)STRING(scan));
2963 /* Search for fixed substrings supports EXACT only. */
2964 if (flags & SCF_DO_SUBSTR) {
2966 SCAN_COMMIT(pRExC_state, data, minlenp);
2969 const U8 * const s = (U8 *)STRING(scan);
2970 l = utf8_length(s, s + l);
2971 uc = utf8_to_uvchr(s, NULL);
2974 if (flags & SCF_DO_SUBSTR)
2976 if (flags & SCF_DO_STCLASS_AND) {
2977 /* Check whether it is compatible with what we know already! */
2981 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2982 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2983 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2985 ANYOF_CLASS_ZERO(data->start_class);
2986 ANYOF_BITMAP_ZERO(data->start_class);
2988 ANYOF_BITMAP_SET(data->start_class, uc);
2989 data->start_class->flags &= ~ANYOF_EOS;
2990 data->start_class->flags |= ANYOF_FOLD;
2991 if (OP(scan) == EXACTFL)
2992 data->start_class->flags |= ANYOF_LOCALE;
2995 else if (flags & SCF_DO_STCLASS_OR) {
2996 if (data->start_class->flags & ANYOF_FOLD) {
2997 /* false positive possible if the class is case-folded.
2998 Assume that the locale settings are the same... */
3000 ANYOF_BITMAP_SET(data->start_class, uc);
3001 data->start_class->flags &= ~ANYOF_EOS;
3003 cl_and(data->start_class, and_withp);
3005 flags &= ~SCF_DO_STCLASS;
3007 else if (strchr((const char*)PL_varies,OP(scan))) {
3008 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3009 I32 f = flags, pos_before = 0;
3010 regnode * const oscan = scan;
3011 struct regnode_charclass_class this_class;
3012 struct regnode_charclass_class *oclass = NULL;
3013 I32 next_is_eval = 0;
3015 switch (PL_regkind[OP(scan)]) {
3016 case WHILEM: /* End of (?:...)* . */
3017 scan = NEXTOPER(scan);
3020 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3021 next = NEXTOPER(scan);
3022 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3024 maxcount = REG_INFTY;
3025 next = regnext(scan);
3026 scan = NEXTOPER(scan);
3030 if (flags & SCF_DO_SUBSTR)
3035 if (flags & SCF_DO_STCLASS) {
3037 maxcount = REG_INFTY;
3038 next = regnext(scan);
3039 scan = NEXTOPER(scan);
3042 is_inf = is_inf_internal = 1;
3043 scan = regnext(scan);
3044 if (flags & SCF_DO_SUBSTR) {
3045 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3046 data->longest = &(data->longest_float);
3048 goto optimize_curly_tail;
3050 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3051 && (scan->flags == stopparen))
3056 mincount = ARG1(scan);
3057 maxcount = ARG2(scan);
3059 next = regnext(scan);
3060 if (OP(scan) == CURLYX) {
3061 I32 lp = (data ? *(data->last_closep) : 0);
3062 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3064 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3065 next_is_eval = (OP(scan) == EVAL);
3067 if (flags & SCF_DO_SUBSTR) {
3068 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3069 pos_before = data->pos_min;
3073 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3075 data->flags |= SF_IS_INF;
3077 if (flags & SCF_DO_STCLASS) {
3078 cl_init(pRExC_state, &this_class);
3079 oclass = data->start_class;
3080 data->start_class = &this_class;
3081 f |= SCF_DO_STCLASS_AND;
3082 f &= ~SCF_DO_STCLASS_OR;
3084 /* These are the cases when once a subexpression
3085 fails at a particular position, it cannot succeed
3086 even after backtracking at the enclosing scope.
3088 XXXX what if minimal match and we are at the
3089 initial run of {n,m}? */
3090 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
3091 f &= ~SCF_WHILEM_VISITED_POS;
3093 /* This will finish on WHILEM, setting scan, or on NULL: */
3094 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3095 last, data, stopparen, recursed, NULL,
3097 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3099 if (flags & SCF_DO_STCLASS)
3100 data->start_class = oclass;
3101 if (mincount == 0 || minnext == 0) {
3102 if (flags & SCF_DO_STCLASS_OR) {
3103 cl_or(pRExC_state, data->start_class, &this_class);
3105 else if (flags & SCF_DO_STCLASS_AND) {
3106 /* Switch to OR mode: cache the old value of
3107 * data->start_class */
3109 StructCopy(data->start_class, and_withp,
3110 struct regnode_charclass_class);
3111 flags &= ~SCF_DO_STCLASS_AND;
3112 StructCopy(&this_class, data->start_class,
3113 struct regnode_charclass_class);
3114 flags |= SCF_DO_STCLASS_OR;
3115 data->start_class->flags |= ANYOF_EOS;
3117 } else { /* Non-zero len */
3118 if (flags & SCF_DO_STCLASS_OR) {
3119 cl_or(pRExC_state, data->start_class, &this_class);
3120 cl_and(data->start_class, and_withp);
3122 else if (flags & SCF_DO_STCLASS_AND)
3123 cl_and(data->start_class, &this_class);
3124 flags &= ~SCF_DO_STCLASS;
3126 if (!scan) /* It was not CURLYX, but CURLY. */
3128 if ( /* ? quantifier ok, except for (?{ ... }) */
3129 (next_is_eval || !(mincount == 0 && maxcount == 1))
3130 && (minnext == 0) && (deltanext == 0)
3131 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3132 && maxcount <= REG_INFTY/3 /* Complement check for big count */
3133 && ckWARN(WARN_REGEXP))
3136 "Quantifier unexpected on zero-length expression");
3139 min += minnext * mincount;
3140 is_inf_internal |= ((maxcount == REG_INFTY
3141 && (minnext + deltanext) > 0)
3142 || deltanext == I32_MAX);
3143 is_inf |= is_inf_internal;
3144 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3146 /* Try powerful optimization CURLYX => CURLYN. */
3147 if ( OP(oscan) == CURLYX && data
3148 && data->flags & SF_IN_PAR
3149 && !(data->flags & SF_HAS_EVAL)
3150 && !deltanext && minnext == 1 ) {
3151 /* Try to optimize to CURLYN. */
3152 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3153 regnode * const nxt1 = nxt;
3160 if (!strchr((const char*)PL_simple,OP(nxt))
3161 && !(PL_regkind[OP(nxt)] == EXACT
3162 && STR_LEN(nxt) == 1))
3168 if (OP(nxt) != CLOSE)
3170 if (RExC_open_parens) {
3171 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3172 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3174 /* Now we know that nxt2 is the only contents: */
3175 oscan->flags = (U8)ARG(nxt);
3177 OP(nxt1) = NOTHING; /* was OPEN. */
3180 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3181 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
3182 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
3183 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3184 OP(nxt + 1) = OPTIMIZED; /* was count. */
3185 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
3190 /* Try optimization CURLYX => CURLYM. */
3191 if ( OP(oscan) == CURLYX && data
3192 && !(data->flags & SF_HAS_PAR)
3193 && !(data->flags & SF_HAS_EVAL)
3194 && !deltanext /* atom is fixed width */
3195 && minnext != 0 /* CURLYM can't handle zero width */
3197 /* XXXX How to optimize if data == 0? */
3198 /* Optimize to a simpler form. */
3199 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3203 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3204 && (OP(nxt2) != WHILEM))
3206 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3207 /* Need to optimize away parenths. */
3208 if (data->flags & SF_IN_PAR) {
3209 /* Set the parenth number. */
3210 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3212 if (OP(nxt) != CLOSE)
3213 FAIL("Panic opt close");
3214 oscan->flags = (U8)ARG(nxt);
3215 if (RExC_open_parens) {
3216 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3217 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3219 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3220 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3223 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3224 OP(nxt + 1) = OPTIMIZED; /* was count. */
3225 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3226 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
3229 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3230 regnode *nnxt = regnext(nxt1);
3233 if (reg_off_by_arg[OP(nxt1)])
3234 ARG_SET(nxt1, nxt2 - nxt1);
3235 else if (nxt2 - nxt1 < U16_MAX)
3236 NEXT_OFF(nxt1) = nxt2 - nxt1;
3238 OP(nxt) = NOTHING; /* Cannot beautify */
3243 /* Optimize again: */
3244 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3245 NULL, stopparen, recursed, NULL, 0,depth+1);
3250 else if ((OP(oscan) == CURLYX)
3251 && (flags & SCF_WHILEM_VISITED_POS)
3252 /* See the comment on a similar expression above.
3253 However, this time it not a subexpression
3254 we care about, but the expression itself. */
3255 && (maxcount == REG_INFTY)
3256 && data && ++data->whilem_c < 16) {
3257 /* This stays as CURLYX, we can put the count/of pair. */
3258 /* Find WHILEM (as in regexec.c) */
3259 regnode *nxt = oscan + NEXT_OFF(oscan);
3261 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3263 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3264 | (RExC_whilem_seen << 4)); /* On WHILEM */
3266 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3268 if (flags & SCF_DO_SUBSTR) {
3269 SV *last_str = NULL;
3270 int counted = mincount != 0;
3272 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3273 #if defined(SPARC64_GCC_WORKAROUND)
3276 const char *s = NULL;
3279 if (pos_before >= data->last_start_min)
3282 b = data->last_start_min;
3285 s = SvPV_const(data->last_found, l);
3286 old = b - data->last_start_min;
3289 I32 b = pos_before >= data->last_start_min
3290 ? pos_before : data->last_start_min;
3292 const char * const s = SvPV_const(data->last_found, l);
3293 I32 old = b - data->last_start_min;
3297 old = utf8_hop((U8*)s, old) - (U8*)s;
3300 /* Get the added string: */
3301 last_str = newSVpvn(s + old, l);
3303 SvUTF8_on(last_str);
3304 if (deltanext == 0 && pos_before == b) {
3305 /* What was added is a constant string */
3307 SvGROW(last_str, (mincount * l) + 1);
3308 repeatcpy(SvPVX(last_str) + l,
3309 SvPVX_const(last_str), l, mincount - 1);
3310 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3311 /* Add additional parts. */
3312 SvCUR_set(data->last_found,
3313 SvCUR(data->last_found) - l);
3314 sv_catsv(data->last_found, last_str);
3316 SV * sv = data->last_found;
3318 SvUTF8(sv) && SvMAGICAL(sv) ?
3319 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3320 if (mg && mg->mg_len >= 0)
3321 mg->mg_len += CHR_SVLEN(last_str) - l;
3323 data->last_end += l * (mincount - 1);
3326 /* start offset must point into the last copy */
3327 data->last_start_min += minnext * (mincount - 1);
3328 data->last_start_max += is_inf ? I32_MAX
3329 : (maxcount - 1) * (minnext + data->pos_delta);
3332 /* It is counted once already... */
3333 data->pos_min += minnext * (mincount - counted);
3334 data->pos_delta += - counted * deltanext +
3335 (minnext + deltanext) * maxcount - minnext * mincount;
3336 if (mincount != maxcount) {
3337 /* Cannot extend fixed substrings found inside
3339 SCAN_COMMIT(pRExC_state,data,minlenp);
3340 if (mincount && last_str) {
3341 SV * const sv = data->last_found;
3342 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3343 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3347 sv_setsv(sv, last_str);
3348 data->last_end = data->pos_min;
3349 data->last_start_min =
3350 data->pos_min - CHR_SVLEN(last_str);
3351 data->last_start_max = is_inf
3353 : data->pos_min + data->pos_delta
3354 - CHR_SVLEN(last_str);
3356 data->longest = &(data->longest_float);
3358 SvREFCNT_dec(last_str);
3360 if (data && (fl & SF_HAS_EVAL))
3361 data->flags |= SF_HAS_EVAL;
3362 optimize_curly_tail:
3363 if (OP(oscan) != CURLYX) {
3364 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3366 NEXT_OFF(oscan) += NEXT_OFF(next);
3369 default: /* REF and CLUMP only? */
3370 if (flags & SCF_DO_SUBSTR) {
3371 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3372 data->longest = &(data->longest_float);
3374 is_inf = is_inf_internal = 1;
3375 if (flags & SCF_DO_STCLASS_OR)
3376 cl_anything(pRExC_state, data->start_class);
3377 flags &= ~SCF_DO_STCLASS;
3381 else if (OP(scan) == LNBREAK) {
3382 if (flags & SCF_DO_STCLASS) {
3384 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3385 if (flags & SCF_DO_STCLASS_AND) {
3386 for (value = 0; value < 256; value++)
3387 if (!is_VERTWS_cp(value))
3388 ANYOF_BITMAP_CLEAR(data->start_class, value);
3391 for (value = 0; value < 256; value++)
3392 if (is_VERTWS_cp(value))
3393 ANYOF_BITMAP_SET(data->start_class, value);
3395 if (flags & SCF_DO_STCLASS_OR)
3396 cl_and(data->start_class, and_withp);
3397 flags &= ~SCF_DO_STCLASS;
3401 if (flags & SCF_DO_SUBSTR) {
3402 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3404 data->pos_delta += 1;
3405 data->longest = &(data->longest_float);
3409 else if (OP(scan) == FOLDCHAR) {
3410 int d = ARG(scan)==0xDF ? 1 : 2;
3411 flags &= ~SCF_DO_STCLASS;
3414 if (flags & SCF_DO_SUBSTR) {
3415 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3417 data->pos_delta += d;
3418 data->longest = &(data->longest_float);
3421 else if (strchr((const char*)PL_simple,OP(scan))) {
3424 if (flags & SCF_DO_SUBSTR) {
3425 SCAN_COMMIT(pRExC_state,data,minlenp);
3429 if (flags & SCF_DO_STCLASS) {
3430 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3432 /* Some of the logic below assumes that switching
3433 locale on will only add false positives. */
3434 switch (PL_regkind[OP(scan)]) {
3438 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3439 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3440 cl_anything(pRExC_state, data->start_class);
3443 if (OP(scan) == SANY)
3445 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3446 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3447 || (data->start_class->flags & ANYOF_CLASS));
3448 cl_anything(pRExC_state, data->start_class);
3450 if (flags & SCF_DO_STCLASS_AND || !value)
3451 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3454 if (flags & SCF_DO_STCLASS_AND)
3455 cl_and(data->start_class,
3456 (struct regnode_charclass_class*)scan);
3458 cl_or(pRExC_state, data->start_class,
3459 (struct regnode_charclass_class*)scan);
3462 if (flags & SCF_DO_STCLASS_AND) {
3463 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3464 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3465 for (value = 0; value < 256; value++)
3466 if (!isALNUM(value))
3467 ANYOF_BITMAP_CLEAR(data->start_class, value);
3471 if (data->start_class->flags & ANYOF_LOCALE)
3472 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3474 for (value = 0; value < 256; value++)
3476 ANYOF_BITMAP_SET(data->start_class, value);
3481 if (flags & SCF_DO_STCLASS_AND) {
3482 if (data->start_class->flags & ANYOF_LOCALE)
3483 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3486 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3487 data->start_class->flags |= ANYOF_LOCALE;
3491 if (flags & SCF_DO_STCLASS_AND) {
3492 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3493 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3494 for (value = 0; value < 256; value++)
3496 ANYOF_BITMAP_CLEAR(data->start_class, value);
3500 if (data->start_class->flags & ANYOF_LOCALE)
3501 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3503 for (value = 0; value < 256; value++)
3504 if (!isALNUM(value))
3505 ANYOF_BITMAP_SET(data->start_class, value);
3510 if (flags & SCF_DO_STCLASS_AND) {
3511 if (data->start_class->flags & ANYOF_LOCALE)
3512 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3515 data->start_class->flags |= ANYOF_LOCALE;
3516 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3520 if (flags & SCF_DO_STCLASS_AND) {
3521 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3522 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3523 for (value = 0; value < 256; value++)
3524 if (!isSPACE(value))
3525 ANYOF_BITMAP_CLEAR(data->start_class, value);
3529 if (data->start_class->flags & ANYOF_LOCALE)
3530 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3532 for (value = 0; value < 256; value++)
3534 ANYOF_BITMAP_SET(data->start_class, value);
3539 if (flags & SCF_DO_STCLASS_AND) {
3540 if (data->start_class->flags & ANYOF_LOCALE)
3541 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3544 data->start_class->flags |= ANYOF_LOCALE;
3545 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3549 if (flags & SCF_DO_STCLASS_AND) {
3550 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3551 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3552 for (value = 0; value < 256; value++)
3554 ANYOF_BITMAP_CLEAR(data->start_class, value);
3558 if (data->start_class->flags & ANYOF_LOCALE)
3559 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3561 for (value = 0; value < 256; value++)
3562 if (!isSPACE(value))
3563 ANYOF_BITMAP_SET(data->start_class, value);
3568 if (flags & SCF_DO_STCLASS_AND) {
3569 if (data->start_class->flags & ANYOF_LOCALE) {
3570 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3571 for (value = 0; value < 256; value++)
3572 if (!isSPACE(value))
3573 ANYOF_BITMAP_CLEAR(data->start_class, value);
3577 data->start_class->flags |= ANYOF_LOCALE;
3578 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3582 if (flags & SCF_DO_STCLASS_AND) {
3583 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3584 for (value = 0; value < 256; value++)
3585 if (!isDIGIT(value))
3586 ANYOF_BITMAP_CLEAR(data->start_class, value);
3589 if (data->start_class->flags & ANYOF_LOCALE)
3590 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3592 for (value = 0; value < 256; value++)
3594 ANYOF_BITMAP_SET(data->start_class, value);
3599 if (flags & SCF_DO_STCLASS_AND) {
3600 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3601 for (value = 0; value < 256; value++)
3603 ANYOF_BITMAP_CLEAR(data->start_class, value);
3606 if (data->start_class->flags & ANYOF_LOCALE)
3607 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3609 for (value = 0; value < 256; value++)
3610 if (!isDIGIT(value))
3611 ANYOF_BITMAP_SET(data->start_class, value);
3615 CASE_SYNST_FNC(VERTWS);
3616 CASE_SYNST_FNC(HORIZWS);
3619 if (flags & SCF_DO_STCLASS_OR)
3620 cl_and(data->start_class, and_withp);
3621 flags &= ~SCF_DO_STCLASS;
3624 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3625 data->flags |= (OP(scan) == MEOL
3629 else if ( PL_regkind[OP(scan)] == BRANCHJ
3630 /* Lookbehind, or need to calculate parens/evals/stclass: */
3631 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3632 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3633 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3634 || OP(scan) == UNLESSM )
3636 /* Negative Lookahead/lookbehind
3637 In this case we can't do fixed string optimisation.
3640 I32 deltanext, minnext, fake = 0;
3642 struct regnode_charclass_class intrnl;
3645 data_fake.flags = 0;
3647 data_fake.whilem_c = data->whilem_c;
3648 data_fake.last_closep = data->last_closep;
3651 data_fake.last_closep = &fake;
3652 data_fake.pos_delta = delta;
3653 if ( flags & SCF_DO_STCLASS && !scan->flags
3654 && OP(scan) == IFMATCH ) { /* Lookahead */
3655 cl_init(pRExC_state, &intrnl);
3656 data_fake.start_class = &intrnl;
3657 f |= SCF_DO_STCLASS_AND;
3659 if (flags & SCF_WHILEM_VISITED_POS)
3660 f |= SCF_WHILEM_VISITED_POS;
3661 next = regnext(scan);
3662 nscan = NEXTOPER(NEXTOPER(scan));
3663 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3664 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3667 FAIL("Variable length lookbehind not implemented");
3669 else if (minnext > (I32)U8_MAX) {
3670 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3672 scan->flags = (U8)minnext;
3675 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3677 if (data_fake.flags & SF_HAS_EVAL)
3678 data->flags |= SF_HAS_EVAL;
3679 data->whilem_c = data_fake.whilem_c;
3681 if (f & SCF_DO_STCLASS_AND) {
3682 const int was = (data->start_class->flags & ANYOF_EOS);
3684 cl_and(data->start_class, &intrnl);
3686 data->start_class->flags |= ANYOF_EOS;
3689 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3691 /* Positive Lookahead/lookbehind
3692 In this case we can do fixed string optimisation,
3693 but we must be careful about it. Note in the case of
3694 lookbehind the positions will be offset by the minimum
3695 length of the pattern, something we won't know about
3696 until after the recurse.
3698 I32 deltanext, fake = 0;
3700 struct regnode_charclass_class intrnl;
3702 /* We use SAVEFREEPV so that when the full compile
3703 is finished perl will clean up the allocated
3704 minlens when its all done. This was we don't
3705 have to worry about freeing them when we know
3706 they wont be used, which would be a pain.
3709 Newx( minnextp, 1, I32 );
3710 SAVEFREEPV(minnextp);
3713 StructCopy(data, &data_fake, scan_data_t);
3714 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3717 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
3718 data_fake.last_found=newSVsv(data->last_found);
3722 data_fake.last_closep = &fake;
3723 data_fake.flags = 0;
3724 data_fake.pos_delta = delta;
3726 data_fake.flags |= SF_IS_INF;
3727 if ( flags & SCF_DO_STCLASS && !scan->flags
3728 && OP(scan) == IFMATCH ) { /* Lookahead */
3729 cl_init(pRExC_state, &intrnl);
3730 data_fake.start_class = &intrnl;
3731 f |= SCF_DO_STCLASS_AND;
3733 if (flags & SCF_WHILEM_VISITED_POS)
3734 f |= SCF_WHILEM_VISITED_POS;
3735 next = regnext(scan);
3736 nscan = NEXTOPER(NEXTOPER(scan));
3738 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3739 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3742 FAIL("Variable length lookbehind not implemented");
3744 else if (*minnextp > (I32)U8_MAX) {
3745 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3747 scan->flags = (U8)*minnextp;
3752 if (f & SCF_DO_STCLASS_AND) {
3753 const int was = (data->start_class->flags & ANYOF_EOS);
3755 cl_and(data->start_class, &intrnl);
3757 data->start_class->flags |= ANYOF_EOS;
3760 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3762 if (data_fake.flags & SF_HAS_EVAL)
3763 data->flags |= SF_HAS_EVAL;
3764 data->whilem_c = data_fake.whilem_c;
3765 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3766 if (RExC_rx->minlen<*minnextp)
3767 RExC_rx->minlen=*minnextp;
3768 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
3769 SvREFCNT_dec(data_fake.last_found);
3771 if ( data_fake.minlen_fixed != minlenp )
3773 data->offset_fixed= data_fake.offset_fixed;
3774 data->minlen_fixed= data_fake.minlen_fixed;
3775 data->lookbehind_fixed+= scan->flags;
3777 if ( data_fake.minlen_float != minlenp )
3779 data->minlen_float= data_fake.minlen_float;
3780 data->offset_float_min=data_fake.offset_float_min;
3781 data->offset_float_max=data_fake.offset_float_max;
3782 data->lookbehind_float+= scan->flags;
3791 else if (OP(scan) == OPEN) {
3792 if (stopparen != (I32)ARG(scan))
3795 else if (OP(scan) == CLOSE) {
3796 if (stopparen == (I32)ARG(scan)) {
3799 if ((I32)ARG(scan) == is_par) {
3800 next = regnext(scan);
3802 if ( next && (OP(next) != WHILEM) && next < last)
3803 is_par = 0; /* Disable optimization */
3806 *(data->last_closep) = ARG(scan);
3808 else if (OP(scan) == EVAL) {
3810 data->flags |= SF_HAS_EVAL;
3812 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3813 if (flags & SCF_DO_SUBSTR) {
3814 SCAN_COMMIT(pRExC_state,data,minlenp);
3815 flags &= ~SCF_DO_SUBSTR;
3817 if (data && OP(scan)==ACCEPT) {
3818 data->flags |= SCF_SEEN_ACCEPT;
3823 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3825 if (flags & SCF_DO_SUBSTR) {
3826 SCAN_COMMIT(pRExC_state,data,minlenp);
3827 data->longest = &(data->longest_float);
3829 is_inf = is_inf_internal = 1;
3830 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3831 cl_anything(pRExC_state, data->start_class);
3832 flags &= ~SCF_DO_STCLASS;
3834 else if (OP(scan) == GPOS) {
3835 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
3836 !(delta || is_inf || (data && data->pos_delta)))
3838 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
3839 RExC_rx->extflags |= RXf_ANCH_GPOS;
3840 if (RExC_rx->gofs < (U32)min)
3841 RExC_rx->gofs = min;
3843 RExC_rx->extflags |= RXf_GPOS_FLOAT;
3847 #ifdef TRIE_STUDY_OPT
3848 #ifdef FULL_TRIE_STUDY
3849 else if (PL_regkind[OP(scan)] == TRIE) {
3850 /* NOTE - There is similar code to this block above for handling
3851 BRANCH nodes on the initial study. If you change stuff here
3853 regnode *trie_node= scan;
3854 regnode *tail= regnext(scan);
3855 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
3856 I32 max1 = 0, min1 = I32_MAX;
3857 struct regnode_charclass_class accum;
3859 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
3860 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
3861 if (flags & SCF_DO_STCLASS)
3862 cl_init_zero(pRExC_state, &accum);
3868 const regnode *nextbranch= NULL;
3871 for ( word=1 ; word <= trie->wordcount ; word++)
3873 I32 deltanext=0, minnext=0, f = 0, fake;
3874 struct regnode_charclass_class this_class;
3876 data_fake.flags = 0;
3878 data_fake.whilem_c = data->whilem_c;
3879 data_fake.last_closep = data->last_closep;
3882 data_fake.last_closep = &fake;
3883 data_fake.pos_delta = delta;
3884 if (flags & SCF_DO_STCLASS) {
3885 cl_init(pRExC_state, &this_class);
3886 data_fake.start_class = &this_class;
3887 f = SCF_DO_STCLASS_AND;
3889 if (flags & SCF_WHILEM_VISITED_POS)
3890 f |= SCF_WHILEM_VISITED_POS;
3892 if (trie->jump[word]) {
3894 nextbranch = trie_node + trie->jump[0];
3895 scan= trie_node + trie->jump[word];
3896 /* We go from the jump point to the branch that follows
3897 it. Note this means we need the vestigal unused branches
3898 even though they arent otherwise used.
3900 minnext = study_chunk(pRExC_state, &scan, minlenp,
3901 &deltanext, (regnode *)nextbranch, &data_fake,
3902 stopparen, recursed, NULL, f,depth+1);
3904 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3905 nextbranch= regnext((regnode*)nextbranch);
3907 if (min1 > (I32)(minnext + trie->minlen))
3908 min1 = minnext + trie->minlen;
3909 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3910 max1 = minnext + deltanext + trie->maxlen;
3911 if (deltanext == I32_MAX)
3912 is_inf = is_inf_internal = 1;
3914 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3916 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3917 if ( stopmin > min + min1)
3918 stopmin = min + min1;
3919 flags &= ~SCF_DO_SUBSTR;
3921 data->flags |= SCF_SEEN_ACCEPT;
3924 if (data_fake.flags & SF_HAS_EVAL)
3925 data->flags |= SF_HAS_EVAL;
3926 data->whilem_c = data_fake.whilem_c;
3928 if (flags & SCF_DO_STCLASS)
3929 cl_or(pRExC_state, &accum, &this_class);
3932 if (flags & SCF_DO_SUBSTR) {
3933 data->pos_min += min1;
3934 data->pos_delta += max1 - min1;
3935 if (max1 != min1 || is_inf)
3936 data->longest = &(data->longest_float);
3939 delta += max1 - min1;
3940 if (flags & SCF_DO_STCLASS_OR) {
3941 cl_or(pRExC_state, data->start_class, &accum);
3943 cl_and(data->start_class, and_withp);
3944 flags &= ~SCF_DO_STCLASS;
3947 else if (flags & SCF_DO_STCLASS_AND) {
3949 cl_and(data->start_class, &accum);
3950 flags &= ~SCF_DO_STCLASS;
3953 /* Switch to OR mode: cache the old value of
3954 * data->start_class */
3956 StructCopy(data->start_class, and_withp,
3957 struct regnode_charclass_class);
3958 flags &= ~SCF_DO_STCLASS_AND;
3959 StructCopy(&accum, data->start_class,
3960 struct regnode_charclass_class);
3961 flags |= SCF_DO_STCLASS_OR;
3962 data->start_class->flags |= ANYOF_EOS;
3969 else if (PL_regkind[OP(scan)] == TRIE) {
3970 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
3973 min += trie->minlen;
3974 delta += (trie->maxlen - trie->minlen);
3975 flags &= ~SCF_DO_STCLASS; /* xxx */
3976 if (flags & SCF_DO_SUBSTR) {
3977 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3978 data->pos_min += trie->minlen;
3979 data->pos_delta += (trie->maxlen - trie->minlen);
3980 if (trie->maxlen != trie->minlen)
3981 data->longest = &(data->longest_float);
3983 if (trie->jump) /* no more substrings -- for now /grr*/
3984 flags &= ~SCF_DO_SUBSTR;
3986 #endif /* old or new */
3987 #endif /* TRIE_STUDY_OPT */
3989 /* Else: zero-length, ignore. */
3990 scan = regnext(scan);
3995 stopparen = frame->stop;
3996 frame = frame->prev;
3997 goto fake_study_recurse;
4002 DEBUG_STUDYDATA("pre-fin:",data,depth);
4005 *deltap = is_inf_internal ? I32_MAX : delta;
4006 if (flags & SCF_DO_SUBSTR && is_inf)
4007 data->pos_delta = I32_MAX - data->pos_min;
4008 if (is_par > (I32)U8_MAX)
4010 if (is_par && pars==1 && data) {
4011 data->flags |= SF_IN_PAR;
4012 data->flags &= ~SF_HAS_PAR;
4014 else if (pars && data) {
4015 data->flags |= SF_HAS_PAR;
4016 data->flags &= ~SF_IN_PAR;
4018 if (flags & SCF_DO_STCLASS_OR)
4019 cl_and(data->start_class, and_withp);
4020 if (flags & SCF_TRIE_RESTUDY)
4021 data->flags |= SCF_TRIE_RESTUDY;
4023 DEBUG_STUDYDATA("post-fin:",data,depth);
4025 return min < stopmin ? min : stopmin;
4029 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4031 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4033 Renewc(RExC_rxi->data,
4034 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4035 char, struct reg_data);
4037 Renew(RExC_rxi->data->what, count + n, U8);
4039 Newx(RExC_rxi->data->what, n, U8);
4040 RExC_rxi->data->count = count + n;
4041 Copy(s, RExC_rxi->data->what + count, n, U8);
4045 /*XXX: todo make this not included in a non debugging perl */
4046 #ifndef PERL_IN_XSUB_RE
4048 Perl_reginitcolors(pTHX)
4051 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4053 char *t = savepv(s);
4057 t = strchr(t, '\t');
4063 PL_colors[i] = t = (char *)"";
4068 PL_colors[i++] = (char *)"";
4075 #ifdef TRIE_STUDY_OPT
4076 #define CHECK_RESTUDY_GOTO \
4078 (data.flags & SCF_TRIE_RESTUDY) \
4082 #define CHECK_RESTUDY_GOTO
4086 - pregcomp - compile a regular expression into internal code
4088 * We can't allocate space until we know how big the compiled form will be,
4089 * but we can't compile it (and thus know how big it is) until we've got a
4090 * place to put the code. So we cheat: we compile it twice, once with code
4091 * generation turned off and size counting turned on, and once "for real".
4092 * This also means that we don't allocate space until we are sure that the
4093 * thing really will compile successfully, and we never have to move the
4094 * code and thus invalidate pointers into it. (Note that it has to be in
4095 * one piece because free() must be able to free it all.) [NB: not true in perl]
4097 * Beware that the optimization-preparation code in here knows about some
4098 * of the structure of the compiled regexp. [I'll say.]
4103 #ifndef PERL_IN_XSUB_RE
4104 #define RE_ENGINE_PTR &PL_core_reg_engine
4106 extern const struct regexp_engine my_reg_engine;
4107 #define RE_ENGINE_PTR &my_reg_engine
4110 #ifndef PERL_IN_XSUB_RE
4112 Perl_pregcomp(pTHX_ const SV * const pattern, const U32 flags)
4115 HV * const table = GvHV(PL_hintgv);
4116 /* Dispatch a request to compile a regexp to correct
4119 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
4120 GET_RE_DEBUG_FLAGS_DECL;
4121 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
4122 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4124 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4127 return CALLREGCOMP_ENG(eng, pattern, flags);
4130 return Perl_re_compile(aTHX_ pattern, flags);
4135 Perl_re_compile(pTHX_ const SV * const pattern, const U32 pm_flags)
4139 register regexp_internal *ri;
4141 char* exp = SvPV((SV*)pattern, plen);
4142 char* xend = exp + plen;
4149 RExC_state_t RExC_state;
4150 RExC_state_t * const pRExC_state = &RExC_state;
4151 #ifdef TRIE_STUDY_OPT
4153 RExC_state_t copyRExC_state;
4155 GET_RE_DEBUG_FLAGS_DECL;
4156 DEBUG_r(if (!PL_colorset) reginitcolors());
4158 RExC_utf8 = RExC_orig_utf8 = pm_flags & RXf_UTF8;
4161 SV *dsv= sv_newmortal();
4162 RE_PV_QUOTED_DECL(s, RExC_utf8,
4163 dsv, exp, plen, 60);
4164 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4165 PL_colors[4],PL_colors[5],s);
4170 RExC_flags = pm_flags;
4174 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4175 RExC_seen_evals = 0;
4178 /* First pass: determine size, legality. */
4186 RExC_emit = &PL_regdummy;
4187 RExC_whilem_seen = 0;
4188 RExC_charnames = NULL;
4189 RExC_open_parens = NULL;
4190 RExC_close_parens = NULL;
4192 RExC_paren_names = NULL;
4194 RExC_paren_name_list = NULL;
4196 RExC_recurse = NULL;
4197 RExC_recurse_count = 0;
4199 #if 0 /* REGC() is (currently) a NOP at the first pass.
4200 * Clever compilers notice this and complain. --jhi */
4201 REGC((U8)REG_MAGIC, (char*)RExC_emit);
4203 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4204 if (reg(pRExC_state, 0, &flags,1) == NULL) {
4205 RExC_precomp = NULL;
4208 if (RExC_utf8 && !RExC_orig_utf8) {
4209 /* It's possible to write a regexp in ascii that represents Unicode
4210 codepoints outside of the byte range, such as via \x{100}. If we
4211 detect such a sequence we have to convert the entire pattern to utf8
4212 and then recompile, as our sizing calculation will have been based
4213 on 1 byte == 1 character, but we will need to use utf8 to encode
4214 at least some part of the pattern, and therefore must convert the whole
4216 XXX: somehow figure out how to make this less expensive...
4219 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4220 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4221 exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)exp, &len);
4223 RExC_orig_utf8 = RExC_utf8;
4225 goto redo_first_pass;
4228 PerlIO_printf(Perl_debug_log,
4229 "Required size %"IVdf" nodes\n"
4230 "Starting second pass (creation)\n",
4233 RExC_lastparse=NULL;
4235 /* Small enough for pointer-storage convention?
4236 If extralen==0, this means that we will not need long jumps. */
4237 if (RExC_size >= 0x10000L && RExC_extralen)
4238 RExC_size += RExC_extralen;
4241 if (RExC_whilem_seen > 15)
4242 RExC_whilem_seen = 15;
4244 /* Allocate space and zero-initialize. Note, the two step process
4245 of zeroing when in debug mode, thus anything assigned has to
4246 happen after that */
4247 Newxz(r, 1, regexp);
4248 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4249 char, regexp_internal);
4250 if ( r == NULL || ri == NULL )
4251 FAIL("Regexp out of space");
4253 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
4254 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
4256 /* bulk initialize base fields with 0. */
4257 Zero(ri, sizeof(regexp_internal), char);
4260 /* non-zero initialization begins here */
4262 r->engine= RE_ENGINE_PTR;
4265 r->extflags = pm_flags;
4267 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
4268 bool has_minus = ((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD);
4269 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
4270 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD) >> 12);
4271 const char *fptr = STD_PAT_MODS; /*"msix"*/
4273 r->wraplen = r->prelen + has_minus + has_p + has_runon
4274 + (sizeof(STD_PAT_MODS) - 1)
4275 + (sizeof("(?:)") - 1);
4277 Newx(r->wrapped, r->wraplen + 1, char );
4281 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
4283 char *r = p + (sizeof(STD_PAT_MODS) - 1) + has_minus - 1;
4284 char *colon = r + 1;
4287 while((ch = *fptr++)) {
4301 Copy(RExC_precomp, p, r->prelen, char);
4311 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4313 if (RExC_seen & REG_SEEN_RECURSE) {
4314 Newxz(RExC_open_parens, RExC_npar,regnode *);
4315 SAVEFREEPV(RExC_open_parens);
4316 Newxz(RExC_close_parens,RExC_npar,regnode *);
4317 SAVEFREEPV(RExC_close_parens);
4320 /* Useful during FAIL. */
4321 #ifdef RE_TRACK_PATTERN_OFFSETS
4322 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4323 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
4324 "%s %"UVuf" bytes for offset annotations.\n",
4325 ri->u.offsets ? "Got" : "Couldn't get",
4326 (UV)((2*RExC_size+1) * sizeof(U32))));
4328 SetProgLen(ri,RExC_size);
4332 /* Second pass: emit code. */
4333 RExC_flags = pm_flags; /* don't let top level (?i) bleed */
4338 RExC_emit_start = ri->program;
4339 RExC_emit = ri->program;
4340 RExC_emit_bound = ri->program + RExC_size + 1;
4342 /* Store the count of eval-groups for security checks: */
4343 RExC_rx->seen_evals = RExC_seen_evals;
4344 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
4345 if (reg(pRExC_state, 0, &flags,1) == NULL) {
4349 /* XXXX To minimize changes to RE engine we always allocate
4350 3-units-long substrs field. */
4351 Newx(r->substrs, 1, struct reg_substr_data);
4352 if (RExC_recurse_count) {
4353 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4354 SAVEFREEPV(RExC_recurse);
4358 r->minlen = minlen = sawplus = sawopen = 0;
4359 Zero(r->substrs, 1, struct reg_substr_data);
4361 #ifdef TRIE_STUDY_OPT
4364 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
4366 RExC_state = copyRExC_state;
4367 if (seen & REG_TOP_LEVEL_BRANCHES)
4368 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4370 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
4371 if (data.last_found) {
4372 SvREFCNT_dec(data.longest_fixed);
4373 SvREFCNT_dec(data.longest_float);
4374 SvREFCNT_dec(data.last_found);
4376 StructCopy(&zero_scan_data, &data, scan_data_t);
4378 StructCopy(&zero_scan_data, &data, scan_data_t);
4379 copyRExC_state = RExC_state;
4382 StructCopy(&zero_scan_data, &data, scan_data_t);
4385 /* Dig out information for optimizations. */
4386 r->extflags = RExC_flags; /* was pm_op */
4387 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
4390 r->extflags |= RXf_UTF8; /* Unicode in it? */
4391 ri->regstclass = NULL;
4392 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
4393 r->intflags |= PREGf_NAUGHTY;
4394 scan = ri->program + 1; /* First BRANCH. */
4396 /* testing for BRANCH here tells us whether there is "must appear"
4397 data in the pattern. If there is then we can use it for optimisations */
4398 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
4400 STRLEN longest_float_length, longest_fixed_length;
4401 struct regnode_charclass_class ch_class; /* pointed to by data */
4403 I32 last_close = 0; /* pointed to by data */
4404 regnode *first= scan;
4405 regnode *first_next= regnext(first);
4407 /* Skip introductions and multiplicators >= 1. */
4408 while ((OP(first) == OPEN && (sawopen = 1)) ||
4409 /* An OR of *one* alternative - should not happen now. */
4410 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
4411 /* for now we can't handle lookbehind IFMATCH*/
4412 (OP(first) == IFMATCH && !first->flags) ||
4413 (OP(first) == PLUS) ||
4414 (OP(first) == MINMOD) ||
4415 /* An {n,m} with n>0 */
4416 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
4417 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
4420 if (OP(first) == PLUS)
4423 first += regarglen[OP(first)];
4424 if (OP(first) == IFMATCH) {
4425 first = NEXTOPER(first);
4426 first += EXTRA_STEP_2ARGS;
4427 } else /* XXX possible optimisation for /(?=)/ */
4428 first = NEXTOPER(first);
4429 first_next= regnext(first);
4432 /* Starting-point info. */
4434 DEBUG_PEEP("first:",first,0);
4435 /* Ignore EXACT as we deal with it later. */
4436 if (PL_regkind[OP(first)] == EXACT) {
4437 if (OP(first) == EXACT)
4438 NOOP; /* Empty, get anchored substr later. */
4439 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
4440 ri->regstclass = first;
4443 else if (PL_regkind[OP(first)] == TRIE &&
4444 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
4447 /* this can happen only on restudy */
4448 if ( OP(first) == TRIE ) {
4449 struct regnode_1 *trieop = (struct regnode_1 *)
4450 PerlMemShared_calloc(1, sizeof(struct regnode_1));
4451 StructCopy(first,trieop,struct regnode_1);
4452 trie_op=(regnode *)trieop;
4454 struct regnode_charclass *trieop = (struct regnode_charclass *)
4455 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
4456 StructCopy(first,trieop,struct regnode_charclass);
4457 trie_op=(regnode *)trieop;
4460 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4461 ri->regstclass = trie_op;
4464 else if (strchr((const char*)PL_simple,OP(first)))
4465 ri->regstclass = first;
4466 else if (PL_regkind[OP(first)] == BOUND ||
4467 PL_regkind[OP(first)] == NBOUND)
4468 ri->regstclass = first;
4469 else if (PL_regkind[OP(first)] == BOL) {
4470 r->extflags |= (OP(first) == MBOL
4472 : (OP(first) == SBOL
4475 first = NEXTOPER(first);
4478 else if (OP(first) == GPOS) {
4479 r->extflags |= RXf_ANCH_GPOS;
4480 first = NEXTOPER(first);
4483 else if ((!sawopen || !RExC_sawback) &&
4484 (OP(first) == STAR &&
4485 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
4486 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
4488 /* turn .* into ^.* with an implied $*=1 */
4490 (OP(NEXTOPER(first)) == REG_ANY)
4493 r->extflags |= type;
4494 r->intflags |= PREGf_IMPLICIT;
4495 first = NEXTOPER(first);
4498 if (sawplus && (!sawopen || !RExC_sawback)
4499 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
4500 /* x+ must match at the 1st pos of run of x's */
4501 r->intflags |= PREGf_SKIP;
4503 /* Scan is after the zeroth branch, first is atomic matcher. */
4504 #ifdef TRIE_STUDY_OPT
4507 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4508 (IV)(first - scan + 1))
4512 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4513 (IV)(first - scan + 1))
4519 * If there's something expensive in the r.e., find the
4520 * longest literal string that must appear and make it the
4521 * regmust. Resolve ties in favor of later strings, since
4522 * the regstart check works with the beginning of the r.e.
4523 * and avoiding duplication strengthens checking. Not a
4524 * strong reason, but sufficient in the absence of others.
4525 * [Now we resolve ties in favor of the earlier string if
4526 * it happens that c_offset_min has been invalidated, since the
4527 * earlier string may buy us something the later one won't.]
4530 data.longest_fixed = newSVpvs("");
4531 data.longest_float = newSVpvs("");
4532 data.last_found = newSVpvs("");
4533 data.longest = &(data.longest_fixed);
4535 if (!ri->regstclass) {
4536 cl_init(pRExC_state, &ch_class);
4537 data.start_class = &ch_class;
4538 stclass_flag = SCF_DO_STCLASS_AND;
4539 } else /* XXXX Check for BOUND? */
4541 data.last_closep = &last_close;
4543 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
4544 &data, -1, NULL, NULL,
4545 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
4551 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
4552 && data.last_start_min == 0 && data.last_end > 0
4553 && !RExC_seen_zerolen
4554 && !(RExC_seen & REG_SEEN_VERBARG)
4555 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4556 r->extflags |= RXf_CHECK_ALL;
4557 scan_commit(pRExC_state, &data,&minlen,0);
4558 SvREFCNT_dec(data.last_found);
4560 /* Note that code very similar to this but for anchored string
4561 follows immediately below, changes may need to be made to both.
4564 longest_float_length = CHR_SVLEN(data.longest_float);
4565 if (longest_float_length
4566 || (data.flags & SF_FL_BEFORE_EOL
4567 && (!(data.flags & SF_FL_BEFORE_MEOL)
4568 || (RExC_flags & RXf_PMf_MULTILINE))))
4572 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
4573 && data.offset_fixed == data.offset_float_min
4574 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4575 goto remove_float; /* As in (a)+. */
4577 /* copy the information about the longest float from the reg_scan_data
4578 over to the program. */
4579 if (SvUTF8(data.longest_float)) {
4580 r->float_utf8 = data.longest_float;
4581 r->float_substr = NULL;
4583 r->float_substr = data.longest_float;
4584 r->float_utf8 = NULL;
4586 /* float_end_shift is how many chars that must be matched that
4587 follow this item. We calculate it ahead of time as once the
4588 lookbehind offset is added in we lose the ability to correctly
4590 ml = data.minlen_float ? *(data.minlen_float)
4591 : (I32)longest_float_length;
4592 r->float_end_shift = ml - data.offset_float_min
4593 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4594 + data.lookbehind_float;
4595 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
4596 r->float_max_offset = data.offset_float_max;
4597 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
4598 r->float_max_offset -= data.lookbehind_float;
4600 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4601 && (!(data.flags & SF_FL_BEFORE_MEOL)
4602 || (RExC_flags & RXf_PMf_MULTILINE)));
4603 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
4607 r->float_substr = r->float_utf8 = NULL;
4608 SvREFCNT_dec(data.longest_float);
4609 longest_float_length = 0;
4612 /* Note that code very similar to this but for floating string
4613 is immediately above, changes may need to be made to both.
4616 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
4617 if (longest_fixed_length
4618 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4619 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4620 || (RExC_flags & RXf_PMf_MULTILINE))))
4624 /* copy the information about the longest fixed
4625 from the reg_scan_data over to the program. */
4626 if (SvUTF8(data.longest_fixed)) {
4627 r->anchored_utf8 = data.longest_fixed;
4628 r->anchored_substr = NULL;
4630 r->anchored_substr = data.longest_fixed;
4631 r->anchored_utf8 = NULL;
4633 /* fixed_end_shift is how many chars that must be matched that
4634 follow this item. We calculate it ahead of time as once the
4635 lookbehind offset is added in we lose the ability to correctly
4637 ml = data.minlen_fixed ? *(data.minlen_fixed)
4638 : (I32)longest_fixed_length;
4639 r->anchored_end_shift = ml - data.offset_fixed
4640 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4641 + data.lookbehind_fixed;
4642 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4644 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4645 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4646 || (RExC_flags & RXf_PMf_MULTILINE)));
4647 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
4650 r->anchored_substr = r->anchored_utf8 = NULL;
4651 SvREFCNT_dec(data.longest_fixed);
4652 longest_fixed_length = 0;
4655 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4656 ri->regstclass = NULL;
4657 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4659 && !(data.start_class->flags & ANYOF_EOS)
4660 && !cl_is_anything(data.start_class))
4662 const U32 n = add_data(pRExC_state, 1, "f");
4664 Newx(RExC_rxi->data->data[n], 1,
4665 struct regnode_charclass_class);
4666 StructCopy(data.start_class,
4667 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4668 struct regnode_charclass_class);
4669 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
4670 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
4671 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
4672 regprop(r, sv, (regnode*)data.start_class);
4673 PerlIO_printf(Perl_debug_log,
4674 "synthetic stclass \"%s\".\n",
4675 SvPVX_const(sv));});
4678 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
4679 if (longest_fixed_length > longest_float_length) {
4680 r->check_end_shift = r->anchored_end_shift;
4681 r->check_substr = r->anchored_substr;
4682 r->check_utf8 = r->anchored_utf8;
4683 r->check_offset_min = r->check_offset_max = r->anchored_offset;
4684 if (r->extflags & RXf_ANCH_SINGLE)
4685 r->extflags |= RXf_NOSCAN;
4688 r->check_end_shift = r->float_end_shift;
4689 r->check_substr = r->float_substr;
4690 r->check_utf8 = r->float_utf8;
4691 r->check_offset_min = r->float_min_offset;
4692 r->check_offset_max = r->float_max_offset;
4694 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4695 This should be changed ASAP! */
4696 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
4697 r->extflags |= RXf_USE_INTUIT;
4698 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
4699 r->extflags |= RXf_INTUIT_TAIL;
4701 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4702 if ( (STRLEN)minlen < longest_float_length )
4703 minlen= longest_float_length;
4704 if ( (STRLEN)minlen < longest_fixed_length )
4705 minlen= longest_fixed_length;
4709 /* Several toplevels. Best we can is to set minlen. */
4711 struct regnode_charclass_class ch_class;
4714 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
4716 scan = ri->program + 1;
4717 cl_init(pRExC_state, &ch_class);
4718 data.start_class = &ch_class;
4719 data.last_closep = &last_close;
4722 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
4723 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
4727 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
4728 = r->float_substr = r->float_utf8 = NULL;
4729 if (!(data.start_class->flags & ANYOF_EOS)
4730 && !cl_is_anything(data.start_class))
4732 const U32 n = add_data(pRExC_state, 1, "f");
4734 Newx(RExC_rxi->data->data[n], 1,
4735 struct regnode_charclass_class);
4736 StructCopy(data.start_class,
4737 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4738 struct regnode_charclass_class);
4739 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
4740 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
4741 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
4742 regprop(r, sv, (regnode*)data.start_class);
4743 PerlIO_printf(Perl_debug_log,
4744 "synthetic stclass \"%s\".\n",
4745 SvPVX_const(sv));});
4749 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4750 the "real" pattern. */
4752 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
4753 (IV)minlen, (IV)r->minlen);
4755 r->minlenret = minlen;
4756 if (r->minlen < minlen)
4759 if (RExC_seen & REG_SEEN_GPOS)
4760 r->extflags |= RXf_GPOS_SEEN;
4761 if (RExC_seen & REG_SEEN_LOOKBEHIND)
4762 r->extflags |= RXf_LOOKBEHIND_SEEN;
4763 if (RExC_seen & REG_SEEN_EVAL)
4764 r->extflags |= RXf_EVAL_SEEN;
4765 if (RExC_seen & REG_SEEN_CANY)
4766 r->extflags |= RXf_CANY_SEEN;
4767 if (RExC_seen & REG_SEEN_VERBARG)
4768 r->intflags |= PREGf_VERBARG_SEEN;
4769 if (RExC_seen & REG_SEEN_CUTGROUP)
4770 r->intflags |= PREGf_CUTGROUP_SEEN;
4771 if (RExC_paren_names)
4772 r->paren_names = (HV*)SvREFCNT_inc(RExC_paren_names);
4774 r->paren_names = NULL;
4776 #ifdef STUPID_PATTERN_CHECKS
4778 r->extflags |= RXf_NULL;
4779 if (r->extflags & RXf_SPLIT && r->prelen == 1 && r->precomp[0] == ' ')
4780 /* XXX: this should happen BEFORE we compile */
4781 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
4782 else if (r->prelen == 3 && memEQ("\\s+", r->precomp, 3))
4783 r->extflags |= RXf_WHITE;
4784 else if (r->prelen == 1 && r->precomp[0] == '^')
4785 r->extflags |= RXf_START_ONLY;
4787 if (r->extflags & RXf_SPLIT && r->prelen == 1 && r->precomp[0] == ' ')
4788 /* XXX: this should happen BEFORE we compile */
4789 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
4791 regnode *first = ri->program + 1;
4793 U8 nop = OP(NEXTOPER(first));
4795 if (PL_regkind[fop] == NOTHING && nop == END)
4796 r->extflags |= RXf_NULL;
4797 else if (PL_regkind[fop] == BOL && nop == END)
4798 r->extflags |= RXf_START_ONLY;
4799 else if (fop == PLUS && nop ==SPACE && OP(regnext(first))==END)
4800 r->extflags |= RXf_WHITE;
4804 if (RExC_paren_names) {
4805 ri->name_list_idx = add_data( pRExC_state, 1, "p" );
4806 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
4809 ri->name_list_idx = 0;
4811 if (RExC_recurse_count) {
4812 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
4813 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
4814 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
4817 Newxz(r->offs, RExC_npar, regexp_paren_pair);
4818 /* assume we don't need to swap parens around before we match */
4821 PerlIO_printf(Perl_debug_log,"Final program:\n");
4824 #ifdef RE_TRACK_PATTERN_OFFSETS
4825 DEBUG_OFFSETS_r(if (ri->u.offsets) {
4826 const U32 len = ri->u.offsets[0];
4828 GET_RE_DEBUG_FLAGS_DECL;
4829 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
4830 for (i = 1; i <= len; i++) {
4831 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
4832 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
4833 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
4835 PerlIO_printf(Perl_debug_log, "\n");
4841 #undef RE_ENGINE_PTR
4845 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
4848 PERL_UNUSED_ARG(value);
4850 if (flags & RXapif_FETCH) {
4851 return reg_named_buff_fetch(rx, key, flags);
4852 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
4853 Perl_croak(aTHX_ PL_no_modify);
4855 } else if (flags & RXapif_EXISTS) {
4856 return reg_named_buff_exists(rx, key, flags)
4859 } else if (flags & RXapif_REGNAMES) {
4860 return reg_named_buff_all(rx, flags);
4861 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
4862 return reg_named_buff_scalar(rx, flags);
4864 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
4870 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
4873 PERL_UNUSED_ARG(lastkey);
4875 if (flags & RXapif_FIRSTKEY)
4876 return reg_named_buff_firstkey(rx, flags);
4877 else if (flags & RXapif_NEXTKEY)
4878 return reg_named_buff_nextkey(rx, flags);
4880 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
4886 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const rx, SV * const namesv, const U32 flags)
4888 AV *retarray = NULL;
4890 if (flags & RXapif_ALL)
4893 if (rx && rx->paren_names) {
4894 HE *he_str = hv_fetch_ent( rx->paren_names, namesv, 0, 0 );
4897 SV* sv_dat=HeVAL(he_str);
4898 I32 *nums=(I32*)SvPVX(sv_dat);
4899 for ( i=0; i<SvIVX(sv_dat); i++ ) {
4900 if ((I32)(rx->nparens) >= nums[i]
4901 && rx->offs[nums[i]].start != -1
4902 && rx->offs[nums[i]].end != -1)
4905 CALLREG_NUMBUF_FETCH(rx,nums[i],ret);
4909 ret = newSVsv(&PL_sv_undef);
4912 SvREFCNT_inc_simple_void(ret);
4913 av_push(retarray, ret);
4917 return newRV((SV*)retarray);
4924 Perl_reg_named_buff_exists(pTHX_ REGEXP * const rx, SV * const key,
4927 if (rx && rx->paren_names) {
4928 if (flags & RXapif_ALL) {
4929 return hv_exists_ent(rx->paren_names, key, 0);
4931 SV *sv = CALLREG_NAMED_BUFF_FETCH(rx, key, flags);
4945 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const rx, const U32 flags)
4947 if ( rx && rx->paren_names ) {
4948 (void)hv_iterinit(rx->paren_names);
4950 return CALLREG_NAMED_BUFF_NEXTKEY(rx, NULL, flags & ~RXapif_FIRSTKEY);
4957 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const rx, const U32 flags)
4959 if (rx && rx->paren_names) {
4960 HV *hv = rx->paren_names;
4962 while ( (temphe = hv_iternext_flags(hv,0)) ) {
4965 SV* sv_dat = HeVAL(temphe);
4966 I32 *nums = (I32*)SvPVX(sv_dat);
4967 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
4968 if ((I32)(rx->lastcloseparen) >= nums[i] &&
4969 rx->offs[nums[i]].start != -1 &&
4970 rx->offs[nums[i]].end != -1)
4976 if (parno || flags & RXapif_ALL) {
4978 char *pv = HePV(temphe, len);
4979 return newSVpvn(pv,len);
4987 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const rx, const U32 flags)
4993 if (rx && rx->paren_names) {
4994 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
4995 return newSViv(HvTOTALKEYS(rx->paren_names));
4996 } else if (flags & RXapif_ONE) {
4997 ret = CALLREG_NAMED_BUFF_ALL(rx, (flags | RXapif_REGNAMES));
4998 av = (AV*)SvRV(ret);
4999 length = av_len(av);
5000 return newSViv(length + 1);
5002 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5006 return &PL_sv_undef;
5010 Perl_reg_named_buff_all(pTHX_ REGEXP * const rx, const U32 flags)
5014 if (rx && rx->paren_names) {
5015 HV *hv= rx->paren_names;
5017 (void)hv_iterinit(hv);
5018 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5021 SV* sv_dat = HeVAL(temphe);
5022 I32 *nums = (I32*)SvPVX(sv_dat);
5023 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5024 if ((I32)(rx->lastcloseparen) >= nums[i] &&
5025 rx->offs[nums[i]].start != -1 &&
5026 rx->offs[nums[i]].end != -1)
5032 if (parno || flags & RXapif_ALL) {
5034 char *pv = HePV(temphe, len);
5035 av_push(av, newSVpvn(pv,len));
5040 return newRV((SV*)av);
5044 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const rx, const I32 paren, SV * const sv)
5051 sv_setsv(sv,&PL_sv_undef);
5055 if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
5057 i = rx->offs[0].start;
5061 if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
5063 s = rx->subbeg + rx->offs[0].end;
5064 i = rx->sublen - rx->offs[0].end;
5067 if ( 0 <= paren && paren <= (I32)rx->nparens &&
5068 (s1 = rx->offs[paren].start) != -1 &&
5069 (t1 = rx->offs[paren].end) != -1)
5073 s = rx->subbeg + s1;
5075 sv_setsv(sv,&PL_sv_undef);
5078 assert(rx->sublen >= (s - rx->subbeg) + i );
5080 const int oldtainted = PL_tainted;
5082 sv_setpvn(sv, s, i);
5083 PL_tainted = oldtainted;
5084 if ( (rx->extflags & RXf_CANY_SEEN)
5085 ? (RX_MATCH_UTF8(rx)
5086 && (!i || is_utf8_string((U8*)s, i)))
5087 : (RX_MATCH_UTF8(rx)) )
5094 if (RX_MATCH_TAINTED(rx)) {
5095 if (SvTYPE(sv) >= SVt_PVMG) {
5096 MAGIC* const mg = SvMAGIC(sv);
5099 SvMAGIC_set(sv, mg->mg_moremagic);
5101 if ((mgt = SvMAGIC(sv))) {
5102 mg->mg_moremagic = mgt;
5103 SvMAGIC_set(sv, mg);
5113 sv_setsv(sv,&PL_sv_undef);
5119 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5120 SV const * const value)
5122 PERL_UNUSED_ARG(rx);
5123 PERL_UNUSED_ARG(paren);
5124 PERL_UNUSED_ARG(value);
5127 Perl_croak(aTHX_ PL_no_modify);
5131 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const rx, const SV * const sv,
5137 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5139 /* $` / ${^PREMATCH} */
5140 case RX_BUFF_IDX_PREMATCH:
5141 if (rx->offs[0].start != -1) {
5142 i = rx->offs[0].start;
5150 /* $' / ${^POSTMATCH} */
5151 case RX_BUFF_IDX_POSTMATCH:
5152 if (rx->offs[0].end != -1) {
5153 i = rx->sublen - rx->offs[0].end;
5155 s1 = rx->offs[0].end;
5161 /* $& / ${^MATCH}, $1, $2, ... */
5163 if (paren <= (I32)rx->nparens &&
5164 (s1 = rx->offs[paren].start) != -1 &&
5165 (t1 = rx->offs[paren].end) != -1)
5170 if (ckWARN(WARN_UNINITIALIZED))
5171 report_uninit((SV*)sv);
5176 if (i > 0 && RX_MATCH_UTF8(rx)) {
5177 const char * const s = rx->subbeg + s1;
5182 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5189 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
5191 PERL_UNUSED_ARG(rx);
5192 return newSVpvs("Regexp");
5195 /* Scans the name of a named buffer from the pattern.
5196 * If flags is REG_RSN_RETURN_NULL returns null.
5197 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
5198 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
5199 * to the parsed name as looked up in the RExC_paren_names hash.
5200 * If there is an error throws a vFAIL().. type exception.
5203 #define REG_RSN_RETURN_NULL 0
5204 #define REG_RSN_RETURN_NAME 1
5205 #define REG_RSN_RETURN_DATA 2
5208 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) {
5209 char *name_start = RExC_parse;
5211 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
5212 /* skip IDFIRST by using do...while */
5215 RExC_parse += UTF8SKIP(RExC_parse);
5216 } while (isALNUM_utf8((U8*)RExC_parse));
5220 } while (isALNUM(*RExC_parse));
5224 SV* sv_name = sv_2mortal(Perl_newSVpvn(aTHX_ name_start,
5225 (int)(RExC_parse - name_start)));
5228 if ( flags == REG_RSN_RETURN_NAME)
5230 else if (flags==REG_RSN_RETURN_DATA) {
5233 if ( ! sv_name ) /* should not happen*/
5234 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
5235 if (RExC_paren_names)
5236 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
5238 sv_dat = HeVAL(he_str);
5240 vFAIL("Reference to nonexistent named group");
5244 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
5251 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
5252 int rem=(int)(RExC_end - RExC_parse); \
5261 if (RExC_lastparse!=RExC_parse) \
5262 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
5265 iscut ? "..." : "<" \
5268 PerlIO_printf(Perl_debug_log,"%16s",""); \
5271 num = RExC_size + 1; \
5273 num=REG_NODE_NUM(RExC_emit); \
5274 if (RExC_lastnum!=num) \
5275 PerlIO_printf(Perl_debug_log,"|%4d",num); \
5277 PerlIO_printf(Perl_debug_log,"|%4s",""); \
5278 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
5279 (int)((depth*2)), "", \
5283 RExC_lastparse=RExC_parse; \
5288 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
5289 DEBUG_PARSE_MSG((funcname)); \
5290 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
5292 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
5293 DEBUG_PARSE_MSG((funcname)); \
5294 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
5297 - reg - regular expression, i.e. main body or parenthesized thing
5299 * Caller must absorb opening parenthesis.
5301 * Combining parenthesis handling with the base level of regular expression
5302 * is a trifle forced, but the need to tie the tails of the branches to what
5303 * follows makes it hard to avoid.
5305 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
5307 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
5309 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
5313 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
5314 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
5317 register regnode *ret; /* Will be the head of the group. */
5318 register regnode *br;
5319 register regnode *lastbr;
5320 register regnode *ender = NULL;
5321 register I32 parno = 0;
5323 U32 oregflags = RExC_flags;
5324 bool have_branch = 0;
5326 I32 freeze_paren = 0;
5327 I32 after_freeze = 0;
5329 /* for (?g), (?gc), and (?o) warnings; warning
5330 about (?c) will warn about (?g) -- japhy */
5332 #define WASTED_O 0x01
5333 #define WASTED_G 0x02
5334 #define WASTED_C 0x04
5335 #define WASTED_GC (0x02|0x04)
5336 I32 wastedflags = 0x00;
5338 char * parse_start = RExC_parse; /* MJD */
5339 char * const oregcomp_parse = RExC_parse;
5341 GET_RE_DEBUG_FLAGS_DECL;
5342 DEBUG_PARSE("reg ");
5344 *flagp = 0; /* Tentatively. */
5347 /* Make an OPEN node, if parenthesized. */
5349 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
5350 char *start_verb = RExC_parse;
5351 STRLEN verb_len = 0;
5352 char *start_arg = NULL;
5353 unsigned char op = 0;
5355 int internal_argval = 0; /* internal_argval is only useful if !argok */
5356 while ( *RExC_parse && *RExC_parse != ')' ) {
5357 if ( *RExC_parse == ':' ) {
5358 start_arg = RExC_parse + 1;
5364 verb_len = RExC_parse - start_verb;
5367 while ( *RExC_parse && *RExC_parse != ')' )
5369 if ( *RExC_parse != ')' )
5370 vFAIL("Unterminated verb pattern argument");
5371 if ( RExC_parse == start_arg )
5374 if ( *RExC_parse != ')' )
5375 vFAIL("Unterminated verb pattern");
5378 switch ( *start_verb ) {
5379 case 'A': /* (*ACCEPT) */
5380 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
5382 internal_argval = RExC_nestroot;
5385 case 'C': /* (*COMMIT) */
5386 if ( memEQs(start_verb,verb_len,"COMMIT") )
5389 case 'F': /* (*FAIL) */
5390 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
5395 case ':': /* (*:NAME) */
5396 case 'M': /* (*MARK:NAME) */
5397 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
5402 case 'P': /* (*PRUNE) */
5403 if ( memEQs(start_verb,verb_len,"PRUNE") )
5406 case 'S': /* (*SKIP) */
5407 if ( memEQs(start_verb,verb_len,"SKIP") )
5410 case 'T': /* (*THEN) */
5411 /* [19:06] <TimToady> :: is then */
5412 if ( memEQs(start_verb,verb_len,"THEN") ) {
5414 RExC_seen |= REG_SEEN_CUTGROUP;
5420 vFAIL3("Unknown verb pattern '%.*s'",
5421 verb_len, start_verb);
5424 if ( start_arg && internal_argval ) {
5425 vFAIL3("Verb pattern '%.*s' may not have an argument",
5426 verb_len, start_verb);
5427 } else if ( argok < 0 && !start_arg ) {
5428 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
5429 verb_len, start_verb);
5431 ret = reganode(pRExC_state, op, internal_argval);
5432 if ( ! internal_argval && ! SIZE_ONLY ) {
5434 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
5435 ARG(ret) = add_data( pRExC_state, 1, "S" );
5436 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
5443 if (!internal_argval)
5444 RExC_seen |= REG_SEEN_VERBARG;
5445 } else if ( start_arg ) {
5446 vFAIL3("Verb pattern '%.*s' may not have an argument",
5447 verb_len, start_verb);
5449 ret = reg_node(pRExC_state, op);
5451 nextchar(pRExC_state);
5454 if (*RExC_parse == '?') { /* (?...) */
5455 bool is_logical = 0;
5456 const char * const seqstart = RExC_parse;
5459 paren = *RExC_parse++;
5460 ret = NULL; /* For look-ahead/behind. */
5463 case 'P': /* (?P...) variants for those used to PCRE/Python */
5464 paren = *RExC_parse++;
5465 if ( paren == '<') /* (?P<...>) named capture */
5467 else if (paren == '>') { /* (?P>name) named recursion */
5468 goto named_recursion;
5470 else if (paren == '=') { /* (?P=...) named backref */
5471 /* this pretty much dupes the code for \k<NAME> in regatom(), if
5472 you change this make sure you change that */
5473 char* name_start = RExC_parse;
5475 SV *sv_dat = reg_scan_name(pRExC_state,
5476 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5477 if (RExC_parse == name_start || *RExC_parse != ')')
5478 vFAIL2("Sequence %.3s... not terminated",parse_start);
5481 num = add_data( pRExC_state, 1, "S" );
5482 RExC_rxi->data->data[num]=(void*)sv_dat;
5483 SvREFCNT_inc_simple_void(sv_dat);
5486 ret = reganode(pRExC_state,
5487 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
5491 Set_Node_Offset(ret, parse_start+1);
5492 Set_Node_Cur_Length(ret); /* MJD */
5494 nextchar(pRExC_state);
5498 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5500 case '<': /* (?<...) */
5501 if (*RExC_parse == '!')
5503 else if (*RExC_parse != '=')
5509 case '\'': /* (?'...') */
5510 name_start= RExC_parse;
5511 svname = reg_scan_name(pRExC_state,
5512 SIZE_ONLY ? /* reverse test from the others */
5513 REG_RSN_RETURN_NAME :
5514 REG_RSN_RETURN_NULL);
5515 if (RExC_parse == name_start) {
5517 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5520 if (*RExC_parse != paren)
5521 vFAIL2("Sequence (?%c... not terminated",
5522 paren=='>' ? '<' : paren);
5526 if (!svname) /* shouldnt happen */
5528 "panic: reg_scan_name returned NULL");
5529 if (!RExC_paren_names) {
5530 RExC_paren_names= newHV();
5531 sv_2mortal((SV*)RExC_paren_names);
5533 RExC_paren_name_list= newAV();
5534 sv_2mortal((SV*)RExC_paren_name_list);
5537 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
5539 sv_dat = HeVAL(he_str);
5541 /* croak baby croak */
5543 "panic: paren_name hash element allocation failed");
5544 } else if ( SvPOK(sv_dat) ) {
5545 /* (?|...) can mean we have dupes so scan to check
5546 its already been stored. Maybe a flag indicating
5547 we are inside such a construct would be useful,
5548 but the arrays are likely to be quite small, so
5549 for now we punt -- dmq */
5550 IV count = SvIV(sv_dat);
5551 I32 *pv = (I32*)SvPVX(sv_dat);
5553 for ( i = 0 ; i < count ; i++ ) {
5554 if ( pv[i] == RExC_npar ) {
5560 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
5561 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
5562 pv[count] = RExC_npar;
5566 (void)SvUPGRADE(sv_dat,SVt_PVNV);
5567 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
5572 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
5573 SvREFCNT_dec(svname);
5576 /*sv_dump(sv_dat);*/
5578 nextchar(pRExC_state);
5580 goto capturing_parens;
5582 RExC_seen |= REG_SEEN_LOOKBEHIND;
5584 case '=': /* (?=...) */
5585 case '!': /* (?!...) */
5586 RExC_seen_zerolen++;
5587 if (*RExC_parse == ')') {
5588 ret=reg_node(pRExC_state, OPFAIL);
5589 nextchar(pRExC_state);
5593 case '|': /* (?|...) */
5594 /* branch reset, behave like a (?:...) except that
5595 buffers in alternations share the same numbers */
5597 after_freeze = freeze_paren = RExC_npar;
5599 case ':': /* (?:...) */
5600 case '>': /* (?>...) */
5602 case '$': /* (?$...) */
5603 case '@': /* (?@...) */
5604 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
5606 case '#': /* (?#...) */
5607 while (*RExC_parse && *RExC_parse != ')')
5609 if (*RExC_parse != ')')
5610 FAIL("Sequence (?#... not terminated");
5611 nextchar(pRExC_state);
5614 case '0' : /* (?0) */
5615 case 'R' : /* (?R) */
5616 if (*RExC_parse != ')')
5617 FAIL("Sequence (?R) not terminated");
5618 ret = reg_node(pRExC_state, GOSTART);
5619 *flagp |= POSTPONED;
5620 nextchar(pRExC_state);
5623 { /* named and numeric backreferences */
5625 case '&': /* (?&NAME) */
5626 parse_start = RExC_parse - 1;
5629 SV *sv_dat = reg_scan_name(pRExC_state,
5630 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5631 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5633 goto gen_recurse_regop;
5636 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5638 vFAIL("Illegal pattern");
5640 goto parse_recursion;
5642 case '-': /* (?-1) */
5643 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5644 RExC_parse--; /* rewind to let it be handled later */
5648 case '1': case '2': case '3': case '4': /* (?1) */
5649 case '5': case '6': case '7': case '8': case '9':
5652 num = atoi(RExC_parse);
5653 parse_start = RExC_parse - 1; /* MJD */
5654 if (*RExC_parse == '-')
5656 while (isDIGIT(*RExC_parse))
5658 if (*RExC_parse!=')')
5659 vFAIL("Expecting close bracket");
5662 if ( paren == '-' ) {
5664 Diagram of capture buffer numbering.
5665 Top line is the normal capture buffer numbers
5666 Botton line is the negative indexing as from
5670 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
5674 num = RExC_npar + num;
5677 vFAIL("Reference to nonexistent group");
5679 } else if ( paren == '+' ) {
5680 num = RExC_npar + num - 1;
5683 ret = reganode(pRExC_state, GOSUB, num);
5685 if (num > (I32)RExC_rx->nparens) {
5687 vFAIL("Reference to nonexistent group");
5689 ARG2L_SET( ret, RExC_recurse_count++);
5691 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5692 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
5696 RExC_seen |= REG_SEEN_RECURSE;
5697 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
5698 Set_Node_Offset(ret, parse_start); /* MJD */
5700 *flagp |= POSTPONED;
5701 nextchar(pRExC_state);
5703 } /* named and numeric backreferences */
5706 case '?': /* (??...) */
5708 if (*RExC_parse != '{') {
5710 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5713 *flagp |= POSTPONED;
5714 paren = *RExC_parse++;
5716 case '{': /* (?{...}) */
5721 char *s = RExC_parse;
5723 RExC_seen_zerolen++;
5724 RExC_seen |= REG_SEEN_EVAL;
5725 while (count && (c = *RExC_parse)) {
5736 if (*RExC_parse != ')') {
5738 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
5742 OP_4tree *sop, *rop;
5743 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
5746 Perl_save_re_context(aTHX);
5747 rop = sv_compile_2op(sv, &sop, "re", &pad);
5748 sop->op_private |= OPpREFCOUNTED;
5749 /* re_dup will OpREFCNT_inc */
5750 OpREFCNT_set(sop, 1);
5753 n = add_data(pRExC_state, 3, "nop");
5754 RExC_rxi->data->data[n] = (void*)rop;
5755 RExC_rxi->data->data[n+1] = (void*)sop;
5756 RExC_rxi->data->data[n+2] = (void*)pad;
5759 else { /* First pass */
5760 if (PL_reginterp_cnt < ++RExC_seen_evals
5762 /* No compiled RE interpolated, has runtime
5763 components ===> unsafe. */
5764 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5765 if (PL_tainting && PL_tainted)
5766 FAIL("Eval-group in insecure regular expression");
5767 #if PERL_VERSION > 8
5768 if (IN_PERL_COMPILETIME)
5773 nextchar(pRExC_state);
5775 ret = reg_node(pRExC_state, LOGICAL);
5778 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
5779 /* deal with the length of this later - MJD */
5782 ret = reganode(pRExC_state, EVAL, n);
5783 Set_Node_Length(ret, RExC_parse - parse_start + 1);
5784 Set_Node_Offset(ret, parse_start);
5787 case '(': /* (?(?{...})...) and (?(?=...)...) */
5790 if (RExC_parse[0] == '?') { /* (?(?...)) */
5791 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
5792 || RExC_parse[1] == '<'
5793 || RExC_parse[1] == '{') { /* Lookahead or eval. */
5796 ret = reg_node(pRExC_state, LOGICAL);
5799 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
5803 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
5804 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
5806 char ch = RExC_parse[0] == '<' ? '>' : '\'';
5807 char *name_start= RExC_parse++;
5809 SV *sv_dat=reg_scan_name(pRExC_state,
5810 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5811 if (RExC_parse == name_start || *RExC_parse != ch)
5812 vFAIL2("Sequence (?(%c... not terminated",
5813 (ch == '>' ? '<' : ch));
5816 num = add_data( pRExC_state, 1, "S" );
5817 RExC_rxi->data->data[num]=(void*)sv_dat;
5818 SvREFCNT_inc_simple_void(sv_dat);
5820 ret = reganode(pRExC_state,NGROUPP,num);
5821 goto insert_if_check_paren;
5823 else if (RExC_parse[0] == 'D' &&
5824 RExC_parse[1] == 'E' &&
5825 RExC_parse[2] == 'F' &&
5826 RExC_parse[3] == 'I' &&
5827 RExC_parse[4] == 'N' &&
5828 RExC_parse[5] == 'E')
5830 ret = reganode(pRExC_state,DEFINEP,0);
5833 goto insert_if_check_paren;
5835 else if (RExC_parse[0] == 'R') {
5838 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5839 parno = atoi(RExC_parse++);
5840 while (isDIGIT(*RExC_parse))
5842 } else if (RExC_parse[0] == '&') {
5845 sv_dat = reg_scan_name(pRExC_state,
5846 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5847 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5849 ret = reganode(pRExC_state,INSUBP,parno);
5850 goto insert_if_check_paren;
5852 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5855 parno = atoi(RExC_parse++);
5857 while (isDIGIT(*RExC_parse))
5859 ret = reganode(pRExC_state, GROUPP, parno);
5861 insert_if_check_paren:
5862 if ((c = *nextchar(pRExC_state)) != ')')
5863 vFAIL("Switch condition not recognized");
5865 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
5866 br = regbranch(pRExC_state, &flags, 1,depth+1);
5868 br = reganode(pRExC_state, LONGJMP, 0);
5870 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
5871 c = *nextchar(pRExC_state);
5876 vFAIL("(?(DEFINE)....) does not allow branches");
5877 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
5878 regbranch(pRExC_state, &flags, 1,depth+1);
5879 REGTAIL(pRExC_state, ret, lastbr);
5882 c = *nextchar(pRExC_state);
5887 vFAIL("Switch (?(condition)... contains too many branches");
5888 ender = reg_node(pRExC_state, TAIL);
5889 REGTAIL(pRExC_state, br, ender);
5891 REGTAIL(pRExC_state, lastbr, ender);
5892 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
5895 REGTAIL(pRExC_state, ret, ender);
5896 RExC_size++; /* XXX WHY do we need this?!!
5897 For large programs it seems to be required
5898 but I can't figure out why. -- dmq*/
5902 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
5906 RExC_parse--; /* for vFAIL to print correctly */
5907 vFAIL("Sequence (? incomplete");
5911 parse_flags: /* (?i) */
5913 U32 posflags = 0, negflags = 0;
5914 U32 *flagsp = &posflags;
5916 while (*RExC_parse) {
5917 /* && strchr("iogcmsx", *RExC_parse) */
5918 /* (?g), (?gc) and (?o) are useless here
5919 and must be globally applied -- japhy */
5920 switch (*RExC_parse) {
5921 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
5922 case ONCE_PAT_MOD: /* 'o' */
5923 case GLOBAL_PAT_MOD: /* 'g' */
5924 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5925 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
5926 if (! (wastedflags & wflagbit) ) {
5927 wastedflags |= wflagbit;
5930 "Useless (%s%c) - %suse /%c modifier",
5931 flagsp == &negflags ? "?-" : "?",
5933 flagsp == &negflags ? "don't " : "",
5940 case CONTINUE_PAT_MOD: /* 'c' */
5941 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5942 if (! (wastedflags & WASTED_C) ) {
5943 wastedflags |= WASTED_GC;
5946 "Useless (%sc) - %suse /gc modifier",
5947 flagsp == &negflags ? "?-" : "?",
5948 flagsp == &negflags ? "don't " : ""
5953 case KEEPCOPY_PAT_MOD: /* 'p' */
5954 if (flagsp == &negflags) {
5955 if (SIZE_ONLY && ckWARN(WARN_REGEXP))
5956 vWARN(RExC_parse + 1,"Useless use of (?-p)");
5958 *flagsp |= RXf_PMf_KEEPCOPY;
5962 if (flagsp == &negflags) {
5964 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5968 wastedflags = 0; /* reset so (?g-c) warns twice */
5974 RExC_flags |= posflags;
5975 RExC_flags &= ~negflags;
5977 oregflags |= posflags;
5978 oregflags &= ~negflags;
5980 nextchar(pRExC_state);
5991 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5996 }} /* one for the default block, one for the switch */
6003 ret = reganode(pRExC_state, OPEN, parno);
6006 RExC_nestroot = parno;
6007 if (RExC_seen & REG_SEEN_RECURSE
6008 && !RExC_open_parens[parno-1])
6010 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6011 "Setting open paren #%"IVdf" to %d\n",
6012 (IV)parno, REG_NODE_NUM(ret)));
6013 RExC_open_parens[parno-1]= ret;
6016 Set_Node_Length(ret, 1); /* MJD */
6017 Set_Node_Offset(ret, RExC_parse); /* MJD */
6025 /* Pick up the branches, linking them together. */
6026 parse_start = RExC_parse; /* MJD */
6027 br = regbranch(pRExC_state, &flags, 1,depth+1);
6028 /* branch_len = (paren != 0); */
6032 if (*RExC_parse == '|') {
6033 if (!SIZE_ONLY && RExC_extralen) {
6034 reginsert(pRExC_state, BRANCHJ, br, depth+1);
6037 reginsert(pRExC_state, BRANCH, br, depth+1);
6038 Set_Node_Length(br, paren != 0);
6039 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
6043 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
6045 else if (paren == ':') {
6046 *flagp |= flags&SIMPLE;
6048 if (is_open) { /* Starts with OPEN. */
6049 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
6051 else if (paren != '?') /* Not Conditional */
6053 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6055 while (*RExC_parse == '|') {
6056 if (!SIZE_ONLY && RExC_extralen) {
6057 ender = reganode(pRExC_state, LONGJMP,0);
6058 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
6061 RExC_extralen += 2; /* Account for LONGJMP. */
6062 nextchar(pRExC_state);
6064 if (RExC_npar > after_freeze)
6065 after_freeze = RExC_npar;
6066 RExC_npar = freeze_paren;
6068 br = regbranch(pRExC_state, &flags, 0, depth+1);
6072 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
6074 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6077 if (have_branch || paren != ':') {
6078 /* Make a closing node, and hook it on the end. */
6081 ender = reg_node(pRExC_state, TAIL);
6084 ender = reganode(pRExC_state, CLOSE, parno);
6085 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
6086 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6087 "Setting close paren #%"IVdf" to %d\n",
6088 (IV)parno, REG_NODE_NUM(ender)));
6089 RExC_close_parens[parno-1]= ender;
6090 if (RExC_nestroot == parno)
6093 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
6094 Set_Node_Length(ender,1); /* MJD */
6100 *flagp &= ~HASWIDTH;
6103 ender = reg_node(pRExC_state, SUCCEED);
6106 ender = reg_node(pRExC_state, END);
6108 assert(!RExC_opend); /* there can only be one! */
6113 REGTAIL(pRExC_state, lastbr, ender);
6115 if (have_branch && !SIZE_ONLY) {
6117 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6119 /* Hook the tails of the branches to the closing node. */
6120 for (br = ret; br; br = regnext(br)) {
6121 const U8 op = PL_regkind[OP(br)];
6123 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
6125 else if (op == BRANCHJ) {
6126 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
6134 static const char parens[] = "=!<,>";
6136 if (paren && (p = strchr(parens, paren))) {
6137 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
6138 int flag = (p - parens) > 1;
6141 node = SUSPEND, flag = 0;
6142 reginsert(pRExC_state, node,ret, depth+1);
6143 Set_Node_Cur_Length(ret);
6144 Set_Node_Offset(ret, parse_start + 1);
6146 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
6150 /* Check for proper termination. */
6152 RExC_flags = oregflags;
6153 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
6154 RExC_parse = oregcomp_parse;
6155 vFAIL("Unmatched (");
6158 else if (!paren && RExC_parse < RExC_end) {
6159 if (*RExC_parse == ')') {
6161 vFAIL("Unmatched )");
6164 FAIL("Junk on end of regexp"); /* "Can't happen". */
6168 RExC_npar = after_freeze;
6173 - regbranch - one alternative of an | operator
6175 * Implements the concatenation operator.
6178 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
6181 register regnode *ret;
6182 register regnode *chain = NULL;
6183 register regnode *latest;
6184 I32 flags = 0, c = 0;
6185 GET_RE_DEBUG_FLAGS_DECL;
6186 DEBUG_PARSE("brnc");
6191 if (!SIZE_ONLY && RExC_extralen)
6192 ret = reganode(pRExC_state, BRANCHJ,0);
6194 ret = reg_node(pRExC_state, BRANCH);
6195 Set_Node_Length(ret, 1);
6199 if (!first && SIZE_ONLY)
6200 RExC_extralen += 1; /* BRANCHJ */
6202 *flagp = WORST; /* Tentatively. */
6205 nextchar(pRExC_state);
6206 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
6208 latest = regpiece(pRExC_state, &flags,depth+1);
6209 if (latest == NULL) {
6210 if (flags & TRYAGAIN)
6214 else if (ret == NULL)
6216 *flagp |= flags&(HASWIDTH|POSTPONED);
6217 if (chain == NULL) /* First piece. */
6218 *flagp |= flags&SPSTART;
6221 REGTAIL(pRExC_state, chain, latest);
6226 if (chain == NULL) { /* Loop ran zero times. */
6227 chain = reg_node(pRExC_state, NOTHING);
6232 *flagp |= flags&SIMPLE;
6239 - regpiece - something followed by possible [*+?]
6241 * Note that the branching code sequences used for ? and the general cases
6242 * of * and + are somewhat optimized: they use the same NOTHING node as
6243 * both the endmarker for their branch list and the body of the last branch.
6244 * It might seem that this node could be dispensed with entirely, but the
6245 * endmarker role is not redundant.
6248 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6251 register regnode *ret;
6253 register char *next;
6255 const char * const origparse = RExC_parse;
6257 I32 max = REG_INFTY;
6259 const char *maxpos = NULL;
6260 GET_RE_DEBUG_FLAGS_DECL;
6261 DEBUG_PARSE("piec");
6263 ret = regatom(pRExC_state, &flags,depth+1);
6265 if (flags & TRYAGAIN)
6272 if (op == '{' && regcurly(RExC_parse)) {
6274 parse_start = RExC_parse; /* MJD */
6275 next = RExC_parse + 1;
6276 while (isDIGIT(*next) || *next == ',') {
6285 if (*next == '}') { /* got one */
6289 min = atoi(RExC_parse);
6293 maxpos = RExC_parse;
6295 if (!max && *maxpos != '0')
6296 max = REG_INFTY; /* meaning "infinity" */
6297 else if (max >= REG_INFTY)
6298 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
6300 nextchar(pRExC_state);
6303 if ((flags&SIMPLE)) {
6304 RExC_naughty += 2 + RExC_naughty / 2;
6305 reginsert(pRExC_state, CURLY, ret, depth+1);
6306 Set_Node_Offset(ret, parse_start+1); /* MJD */
6307 Set_Node_Cur_Length(ret);
6310 regnode * const w = reg_node(pRExC_state, WHILEM);
6313 REGTAIL(pRExC_state, ret, w);
6314 if (!SIZE_ONLY && RExC_extralen) {
6315 reginsert(pRExC_state, LONGJMP,ret, depth+1);
6316 reginsert(pRExC_state, NOTHING,ret, depth+1);
6317 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
6319 reginsert(pRExC_state, CURLYX,ret, depth+1);
6321 Set_Node_Offset(ret, parse_start+1);
6322 Set_Node_Length(ret,
6323 op == '{' ? (RExC_parse - parse_start) : 1);
6325 if (!SIZE_ONLY && RExC_extralen)
6326 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
6327 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
6329 RExC_whilem_seen++, RExC_extralen += 3;
6330 RExC_naughty += 4 + RExC_naughty; /* compound interest */
6338 if (max && max < min)
6339 vFAIL("Can't do {n,m} with n > m");
6341 ARG1_SET(ret, (U16)min);
6342 ARG2_SET(ret, (U16)max);
6354 #if 0 /* Now runtime fix should be reliable. */
6356 /* if this is reinstated, don't forget to put this back into perldiag:
6358 =item Regexp *+ operand could be empty at {#} in regex m/%s/
6360 (F) The part of the regexp subject to either the * or + quantifier
6361 could match an empty string. The {#} shows in the regular
6362 expression about where the problem was discovered.
6366 if (!(flags&HASWIDTH) && op != '?')
6367 vFAIL("Regexp *+ operand could be empty");
6370 parse_start = RExC_parse;
6371 nextchar(pRExC_state);
6373 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
6375 if (op == '*' && (flags&SIMPLE)) {
6376 reginsert(pRExC_state, STAR, ret, depth+1);
6380 else if (op == '*') {
6384 else if (op == '+' && (flags&SIMPLE)) {
6385 reginsert(pRExC_state, PLUS, ret, depth+1);
6389 else if (op == '+') {
6393 else if (op == '?') {
6398 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
6400 "%.*s matches null string many times",
6401 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
6405 if (RExC_parse < RExC_end && *RExC_parse == '?') {
6406 nextchar(pRExC_state);
6407 reginsert(pRExC_state, MINMOD, ret, depth+1);
6408 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
6410 #ifndef REG_ALLOW_MINMOD_SUSPEND
6413 if (RExC_parse < RExC_end && *RExC_parse == '+') {
6415 nextchar(pRExC_state);
6416 ender = reg_node(pRExC_state, SUCCEED);
6417 REGTAIL(pRExC_state, ret, ender);
6418 reginsert(pRExC_state, SUSPEND, ret, depth+1);
6420 ender = reg_node(pRExC_state, TAIL);
6421 REGTAIL(pRExC_state, ret, ender);
6425 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
6427 vFAIL("Nested quantifiers");
6434 /* reg_namedseq(pRExC_state,UVp)
6436 This is expected to be called by a parser routine that has
6437 recognized'\N' and needs to handle the rest. RExC_parse is
6438 expected to point at the first char following the N at the time
6441 If valuep is non-null then it is assumed that we are parsing inside
6442 of a charclass definition and the first codepoint in the resolved
6443 string is returned via *valuep and the routine will return NULL.
6444 In this mode if a multichar string is returned from the charnames
6445 handler a warning will be issued, and only the first char in the
6446 sequence will be examined. If the string returned is zero length
6447 then the value of *valuep is undefined and NON-NULL will
6448 be returned to indicate failure. (This will NOT be a valid pointer
6451 If value is null then it is assumed that we are parsing normal text
6452 and inserts a new EXACT node into the program containing the resolved
6453 string and returns a pointer to the new node. If the string is
6454 zerolength a NOTHING node is emitted.
6456 On success RExC_parse is set to the char following the endbrace.
6457 Parsing failures will generate a fatal errorvia vFAIL(...)
6459 NOTE: We cache all results from the charnames handler locally in
6460 the RExC_charnames hash (created on first use) to prevent a charnames
6461 handler from playing silly-buggers and returning a short string and
6462 then a long string for a given pattern. Since the regexp program
6463 size is calculated during an initial parse this would result
6464 in a buffer overrun so we cache to prevent the charname result from
6465 changing during the course of the parse.
6469 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep)
6471 char * name; /* start of the content of the name */
6472 char * endbrace; /* endbrace following the name */
6475 STRLEN len; /* this has various purposes throughout the code */
6476 bool cached = 0; /* if this is true then we shouldn't refcount dev sv_str */
6477 regnode *ret = NULL;
6479 if (*RExC_parse != '{') {
6480 vFAIL("Missing braces on \\N{}");
6482 name = RExC_parse+1;
6483 endbrace = strchr(RExC_parse, '}');
6486 vFAIL("Missing right brace on \\N{}");
6488 RExC_parse = endbrace + 1;
6491 /* RExC_parse points at the beginning brace,
6492 endbrace points at the last */
6493 if ( name[0]=='U' && name[1]=='+' ) {
6494 /* its a "Unicode hex" notation {U+89AB} */
6495 I32 fl = PERL_SCAN_ALLOW_UNDERSCORES
6496 | PERL_SCAN_DISALLOW_PREFIX
6497 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
6500 len = (STRLEN)(endbrace - name - 2);
6501 cp = grok_hex(name + 2, &len, &fl, NULL);
6502 if ( len != (STRLEN)(endbrace - name - 2) ) {
6512 sv_str= newSVpvn(&string, 1);
6514 /* fetch the charnames handler for this scope */
6515 HV * const table = GvHV(PL_hintgv);
6517 hv_fetchs(table, "charnames", FALSE) :
6519 SV *cv= cvp ? *cvp : NULL;
6522 /* create an SV with the name as argument */
6523 sv_name = newSVpvn(name, endbrace - name);
6525 if (!table || !(PL_hints & HINT_LOCALIZE_HH)) {
6526 vFAIL2("Constant(\\N{%s}) unknown: "
6527 "(possibly a missing \"use charnames ...\")",
6530 if (!cvp || !SvOK(*cvp)) { /* when $^H{charnames} = undef; */
6531 vFAIL2("Constant(\\N{%s}): "
6532 "$^H{charnames} is not defined",SvPVX(sv_name));
6537 if (!RExC_charnames) {
6538 /* make sure our cache is allocated */
6539 RExC_charnames = newHV();
6540 sv_2mortal((SV*)RExC_charnames);
6542 /* see if we have looked this one up before */
6543 he_str = hv_fetch_ent( RExC_charnames, sv_name, 0, 0 );
6545 sv_str = HeVAL(he_str);
6558 count= call_sv(cv, G_SCALAR);
6560 if (count == 1) { /* XXXX is this right? dmq */
6562 SvREFCNT_inc_simple_void(sv_str);
6570 if ( !sv_str || !SvOK(sv_str) ) {
6571 vFAIL2("Constant(\\N{%s}): Call to &{$^H{charnames}} "
6572 "did not return a defined value",SvPVX(sv_name));
6574 if (hv_store_ent( RExC_charnames, sv_name, sv_str, 0))
6579 char *p = SvPV(sv_str, len);
6582 if ( SvUTF8(sv_str) ) {
6583 *valuep = utf8_to_uvchr((U8*)p, &numlen);
6587 We have to turn on utf8 for high bit chars otherwise
6588 we get failures with
6590 "ss" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
6591 "SS" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
6593 This is different from what \x{} would do with the same
6594 codepoint, where the condition is > 0xFF.
6601 /* warn if we havent used the whole string? */
6603 if (numlen<len && SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6605 "Ignoring excess chars from \\N{%s} in character class",
6609 } else if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6611 "Ignoring zero length \\N{%s} in character class",
6616 SvREFCNT_dec(sv_name);
6618 SvREFCNT_dec(sv_str);
6619 return len ? NULL : (regnode *)&len;
6620 } else if(SvCUR(sv_str)) {
6626 char * parse_start = name-3; /* needed for the offsets */
6628 GET_RE_DEBUG_FLAGS_DECL; /* needed for the offsets */
6630 ret = reg_node(pRExC_state,
6631 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
6634 if ( RExC_utf8 && !SvUTF8(sv_str) ) {
6635 sv_utf8_upgrade(sv_str);
6636 } else if ( !RExC_utf8 && SvUTF8(sv_str) ) {
6640 p = SvPV(sv_str, len);
6642 /* len is the length written, charlen is the size the char read */
6643 for ( len = 0; p < pend; p += charlen ) {
6645 UV uvc = utf8_to_uvchr((U8*)p, &charlen);
6647 STRLEN foldlen,numlen;
6648 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
6649 uvc = toFOLD_uni(uvc, tmpbuf, &foldlen);
6650 /* Emit all the Unicode characters. */
6652 for (foldbuf = tmpbuf;
6656 uvc = utf8_to_uvchr(foldbuf, &numlen);
6658 const STRLEN unilen = reguni(pRExC_state, uvc, s);
6661 /* In EBCDIC the numlen
6662 * and unilen can differ. */
6664 if (numlen >= foldlen)
6668 break; /* "Can't happen." */
6671 const STRLEN unilen = reguni(pRExC_state, uvc, s);
6683 RExC_size += STR_SZ(len);
6686 RExC_emit += STR_SZ(len);
6688 Set_Node_Cur_Length(ret); /* MJD */
6690 nextchar(pRExC_state);
6692 ret = reg_node(pRExC_state,NOTHING);
6695 SvREFCNT_dec(sv_str);
6698 SvREFCNT_dec(sv_name);
6708 * It returns the code point in utf8 for the value in *encp.
6709 * value: a code value in the source encoding
6710 * encp: a pointer to an Encode object
6712 * If the result from Encode is not a single character,
6713 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
6716 S_reg_recode(pTHX_ const char value, SV **encp)
6719 SV * const sv = sv_2mortal(newSVpvn(&value, numlen));
6720 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
6721 const STRLEN newlen = SvCUR(sv);
6722 UV uv = UNICODE_REPLACEMENT;
6726 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
6729 if (!newlen || numlen != newlen) {
6730 uv = UNICODE_REPLACEMENT;
6738 - regatom - the lowest level
6740 Try to identify anything special at the start of the pattern. If there
6741 is, then handle it as required. This may involve generating a single regop,
6742 such as for an assertion; or it may involve recursing, such as to
6743 handle a () structure.
6745 If the string doesn't start with something special then we gobble up
6746 as much literal text as we can.
6748 Once we have been able to handle whatever type of thing started the
6749 sequence, we return.
6751 Note: we have to be careful with escapes, as they can be both literal
6752 and special, and in the case of \10 and friends can either, depending
6753 on context. Specifically there are two seperate switches for handling
6754 escape sequences, with the one for handling literal escapes requiring
6755 a dummy entry for all of the special escapes that are actually handled
6760 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6763 register regnode *ret = NULL;
6765 char *parse_start = RExC_parse;
6766 GET_RE_DEBUG_FLAGS_DECL;
6767 DEBUG_PARSE("atom");
6768 *flagp = WORST; /* Tentatively. */
6772 switch ((U8)*RExC_parse) {
6774 RExC_seen_zerolen++;
6775 nextchar(pRExC_state);
6776 if (RExC_flags & RXf_PMf_MULTILINE)
6777 ret = reg_node(pRExC_state, MBOL);
6778 else if (RExC_flags & RXf_PMf_SINGLELINE)
6779 ret = reg_node(pRExC_state, SBOL);
6781 ret = reg_node(pRExC_state, BOL);
6782 Set_Node_Length(ret, 1); /* MJD */
6785 nextchar(pRExC_state);
6787 RExC_seen_zerolen++;
6788 if (RExC_flags & RXf_PMf_MULTILINE)
6789 ret = reg_node(pRExC_state, MEOL);
6790 else if (RExC_flags & RXf_PMf_SINGLELINE)
6791 ret = reg_node(pRExC_state, SEOL);
6793 ret = reg_node(pRExC_state, EOL);
6794 Set_Node_Length(ret, 1); /* MJD */
6797 nextchar(pRExC_state);
6798 if (RExC_flags & RXf_PMf_SINGLELINE)
6799 ret = reg_node(pRExC_state, SANY);
6801 ret = reg_node(pRExC_state, REG_ANY);
6802 *flagp |= HASWIDTH|SIMPLE;
6804 Set_Node_Length(ret, 1); /* MJD */
6808 char * const oregcomp_parse = ++RExC_parse;
6809 ret = regclass(pRExC_state,depth+1);
6810 if (*RExC_parse != ']') {
6811 RExC_parse = oregcomp_parse;
6812 vFAIL("Unmatched [");
6814 nextchar(pRExC_state);
6815 *flagp |= HASWIDTH|SIMPLE;
6816 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
6820 nextchar(pRExC_state);
6821 ret = reg(pRExC_state, 1, &flags,depth+1);
6823 if (flags & TRYAGAIN) {
6824 if (RExC_parse == RExC_end) {
6825 /* Make parent create an empty node if needed. */
6833 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
6837 if (flags & TRYAGAIN) {
6841 vFAIL("Internal urp");
6842 /* Supposed to be caught earlier. */
6845 if (!regcurly(RExC_parse)) {
6854 vFAIL("Quantifier follows nothing");
6861 len=0; /* silence a spurious compiler warning */
6862 if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) {
6863 *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */
6864 RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */
6865 ret = reganode(pRExC_state, FOLDCHAR, cp);
6866 Set_Node_Length(ret, 1); /* MJD */
6867 nextchar(pRExC_state); /* kill whitespace under /x */
6875 This switch handles escape sequences that resolve to some kind
6876 of special regop and not to literal text. Escape sequnces that
6877 resolve to literal text are handled below in the switch marked
6880 Every entry in this switch *must* have a corresponding entry
6881 in the literal escape switch. However, the opposite is not
6882 required, as the default for this switch is to jump to the
6883 literal text handling code.
6885 switch (*++RExC_parse) {
6886 /* Special Escapes */
6888 RExC_seen_zerolen++;
6889 ret = reg_node(pRExC_state, SBOL);
6891 goto finish_meta_pat;
6893 ret = reg_node(pRExC_state, GPOS);
6894 RExC_seen |= REG_SEEN_GPOS;
6896 goto finish_meta_pat;
6898 RExC_seen_zerolen++;
6899 ret = reg_node(pRExC_state, KEEPS);
6901 /* XXX:dmq : disabling in-place substitution seems to
6902 * be necessary here to avoid cases of memory corruption, as
6903 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
6905 RExC_seen |= REG_SEEN_LOOKBEHIND;
6906 goto finish_meta_pat;
6908 ret = reg_node(pRExC_state, SEOL);
6910 RExC_seen_zerolen++; /* Do not optimize RE away */
6911 goto finish_meta_pat;
6913 ret = reg_node(pRExC_state, EOS);
6915 RExC_seen_zerolen++; /* Do not optimize RE away */
6916 goto finish_meta_pat;
6918 ret = reg_node(pRExC_state, CANY);
6919 RExC_seen |= REG_SEEN_CANY;
6920 *flagp |= HASWIDTH|SIMPLE;
6921 goto finish_meta_pat;
6923 ret = reg_node(pRExC_state, CLUMP);
6925 goto finish_meta_pat;
6927 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
6928 *flagp |= HASWIDTH|SIMPLE;
6929 goto finish_meta_pat;
6931 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
6932 *flagp |= HASWIDTH|SIMPLE;
6933 goto finish_meta_pat;
6935 RExC_seen_zerolen++;
6936 RExC_seen |= REG_SEEN_LOOKBEHIND;
6937 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
6939 goto finish_meta_pat;
6941 RExC_seen_zerolen++;
6942 RExC_seen |= REG_SEEN_LOOKBEHIND;
6943 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
6945 goto finish_meta_pat;
6947 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
6948 *flagp |= HASWIDTH|SIMPLE;
6949 goto finish_meta_pat;
6951 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
6952 *flagp |= HASWIDTH|SIMPLE;
6953 goto finish_meta_pat;
6955 ret = reg_node(pRExC_state, DIGIT);
6956 *flagp |= HASWIDTH|SIMPLE;
6957 goto finish_meta_pat;
6959 ret = reg_node(pRExC_state, NDIGIT);
6960 *flagp |= HASWIDTH|SIMPLE;
6961 goto finish_meta_pat;
6963 ret = reg_node(pRExC_state, LNBREAK);
6964 *flagp |= HASWIDTH|SIMPLE;
6965 goto finish_meta_pat;
6967 ret = reg_node(pRExC_state, HORIZWS);
6968 *flagp |= HASWIDTH|SIMPLE;
6969 goto finish_meta_pat;
6971 ret = reg_node(pRExC_state, NHORIZWS);
6972 *flagp |= HASWIDTH|SIMPLE;
6973 goto finish_meta_pat;
6975 ret = reg_node(pRExC_state, VERTWS);
6976 *flagp |= HASWIDTH|SIMPLE;
6977 goto finish_meta_pat;
6979 ret = reg_node(pRExC_state, NVERTWS);
6980 *flagp |= HASWIDTH|SIMPLE;
6982 nextchar(pRExC_state);
6983 Set_Node_Length(ret, 2); /* MJD */
6988 char* const oldregxend = RExC_end;
6990 char* parse_start = RExC_parse - 2;
6993 if (RExC_parse[1] == '{') {
6994 /* a lovely hack--pretend we saw [\pX] instead */
6995 RExC_end = strchr(RExC_parse, '}');
6997 const U8 c = (U8)*RExC_parse;
6999 RExC_end = oldregxend;
7000 vFAIL2("Missing right brace on \\%c{}", c);
7005 RExC_end = RExC_parse + 2;
7006 if (RExC_end > oldregxend)
7007 RExC_end = oldregxend;
7011 ret = regclass(pRExC_state,depth+1);
7013 RExC_end = oldregxend;
7016 Set_Node_Offset(ret, parse_start + 2);
7017 Set_Node_Cur_Length(ret);
7018 nextchar(pRExC_state);
7019 *flagp |= HASWIDTH|SIMPLE;
7023 /* Handle \N{NAME} here and not below because it can be
7024 multicharacter. join_exact() will join them up later on.
7025 Also this makes sure that things like /\N{BLAH}+/ and
7026 \N{BLAH} being multi char Just Happen. dmq*/
7028 ret= reg_namedseq(pRExC_state, NULL);
7030 case 'k': /* Handle \k<NAME> and \k'NAME' */
7033 char ch= RExC_parse[1];
7034 if (ch != '<' && ch != '\'' && ch != '{') {
7036 vFAIL2("Sequence %.2s... not terminated",parse_start);
7038 /* this pretty much dupes the code for (?P=...) in reg(), if
7039 you change this make sure you change that */
7040 char* name_start = (RExC_parse += 2);
7042 SV *sv_dat = reg_scan_name(pRExC_state,
7043 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7044 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
7045 if (RExC_parse == name_start || *RExC_parse != ch)
7046 vFAIL2("Sequence %.3s... not terminated",parse_start);
7049 num = add_data( pRExC_state, 1, "S" );
7050 RExC_rxi->data->data[num]=(void*)sv_dat;
7051 SvREFCNT_inc_simple_void(sv_dat);
7055 ret = reganode(pRExC_state,
7056 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
7060 /* override incorrect value set in reganode MJD */
7061 Set_Node_Offset(ret, parse_start+1);
7062 Set_Node_Cur_Length(ret); /* MJD */
7063 nextchar(pRExC_state);
7069 case '1': case '2': case '3': case '4':
7070 case '5': case '6': case '7': case '8': case '9':
7073 bool isg = *RExC_parse == 'g';
7078 if (*RExC_parse == '{') {
7082 if (*RExC_parse == '-') {
7086 if (hasbrace && !isDIGIT(*RExC_parse)) {
7087 if (isrel) RExC_parse--;
7089 goto parse_named_seq;
7091 num = atoi(RExC_parse);
7092 if (isg && num == 0)
7093 vFAIL("Reference to invalid group 0");
7095 num = RExC_npar - num;
7097 vFAIL("Reference to nonexistent or unclosed group");
7099 if (!isg && num > 9 && num >= RExC_npar)
7102 char * const parse_start = RExC_parse - 1; /* MJD */
7103 while (isDIGIT(*RExC_parse))
7105 if (parse_start == RExC_parse - 1)
7106 vFAIL("Unterminated \\g... pattern");
7108 if (*RExC_parse != '}')
7109 vFAIL("Unterminated \\g{...} pattern");
7113 if (num > (I32)RExC_rx->nparens)
7114 vFAIL("Reference to nonexistent group");
7117 ret = reganode(pRExC_state,
7118 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
7122 /* override incorrect value set in reganode MJD */
7123 Set_Node_Offset(ret, parse_start+1);
7124 Set_Node_Cur_Length(ret); /* MJD */
7126 nextchar(pRExC_state);
7131 if (RExC_parse >= RExC_end)
7132 FAIL("Trailing \\");
7135 /* Do not generate "unrecognized" warnings here, we fall
7136 back into the quick-grab loop below */
7143 if (RExC_flags & RXf_PMf_EXTENDED) {
7144 if ( reg_skipcomment( pRExC_state ) )
7151 register STRLEN len;
7156 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7158 parse_start = RExC_parse - 1;
7164 ret = reg_node(pRExC_state,
7165 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
7167 for (len = 0, p = RExC_parse - 1;
7168 len < 127 && p < RExC_end;
7171 char * const oldp = p;
7173 if (RExC_flags & RXf_PMf_EXTENDED)
7174 p = regwhite( pRExC_state, p );
7179 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7180 goto normal_default;
7190 /* Literal Escapes Switch
7192 This switch is meant to handle escape sequences that
7193 resolve to a literal character.
7195 Every escape sequence that represents something
7196 else, like an assertion or a char class, is handled
7197 in the switch marked 'Special Escapes' above in this
7198 routine, but also has an entry here as anything that
7199 isn't explicitly mentioned here will be treated as
7200 an unescaped equivalent literal.
7204 /* These are all the special escapes. */
7205 case 'A': /* Start assertion */
7206 case 'b': case 'B': /* Word-boundary assertion*/
7207 case 'C': /* Single char !DANGEROUS! */
7208 case 'd': case 'D': /* digit class */
7209 case 'g': case 'G': /* generic-backref, pos assertion */
7210 case 'h': case 'H': /* HORIZWS */
7211 case 'k': case 'K': /* named backref, keep marker */
7212 case 'N': /* named char sequence */
7213 case 'p': case 'P': /* Unicode property */
7214 case 'R': /* LNBREAK */
7215 case 's': case 'S': /* space class */
7216 case 'v': case 'V': /* VERTWS */
7217 case 'w': case 'W': /* word class */
7218 case 'X': /* eXtended Unicode "combining character sequence" */
7219 case 'z': case 'Z': /* End of line/string assertion */
7223 /* Anything after here is an escape that resolves to a
7224 literal. (Except digits, which may or may not)
7243 ender = ASCII_TO_NATIVE('\033');
7247 ender = ASCII_TO_NATIVE('\007');
7252 char* const e = strchr(p, '}');
7256 vFAIL("Missing right brace on \\x{}");
7259 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7260 | PERL_SCAN_DISALLOW_PREFIX;
7261 STRLEN numlen = e - p - 1;
7262 ender = grok_hex(p + 1, &numlen, &flags, NULL);
7269 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
7271 ender = grok_hex(p, &numlen, &flags, NULL);
7274 if (PL_encoding && ender < 0x100)
7275 goto recode_encoding;
7279 ender = UCHARAT(p++);
7280 ender = toCTRL(ender);
7282 case '0': case '1': case '2': case '3':case '4':
7283 case '5': case '6': case '7': case '8':case '9':
7285 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
7288 ender = grok_oct(p, &numlen, &flags, NULL);
7295 if (PL_encoding && ender < 0x100)
7296 goto recode_encoding;
7300 SV* enc = PL_encoding;
7301 ender = reg_recode((const char)(U8)ender, &enc);
7302 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
7303 vWARN(p, "Invalid escape in the specified encoding");
7309 FAIL("Trailing \\");
7312 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
7313 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
7314 goto normal_default;
7319 if (UTF8_IS_START(*p) && UTF) {
7321 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
7322 &numlen, UTF8_ALLOW_DEFAULT);
7329 if ( RExC_flags & RXf_PMf_EXTENDED)
7330 p = regwhite( pRExC_state, p );
7332 /* Prime the casefolded buffer. */
7333 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
7335 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
7340 /* Emit all the Unicode characters. */
7342 for (foldbuf = tmpbuf;
7344 foldlen -= numlen) {
7345 ender = utf8_to_uvchr(foldbuf, &numlen);
7347 const STRLEN unilen = reguni(pRExC_state, ender, s);
7350 /* In EBCDIC the numlen
7351 * and unilen can differ. */
7353 if (numlen >= foldlen)
7357 break; /* "Can't happen." */
7361 const STRLEN unilen = reguni(pRExC_state, ender, s);
7370 REGC((char)ender, s++);
7376 /* Emit all the Unicode characters. */
7378 for (foldbuf = tmpbuf;
7380 foldlen -= numlen) {
7381 ender = utf8_to_uvchr(foldbuf, &numlen);
7383 const STRLEN unilen = reguni(pRExC_state, ender, s);
7386 /* In EBCDIC the numlen
7387 * and unilen can differ. */
7389 if (numlen >= foldlen)
7397 const STRLEN unilen = reguni(pRExC_state, ender, s);
7406 REGC((char)ender, s++);
7410 Set_Node_Cur_Length(ret); /* MJD */
7411 nextchar(pRExC_state);
7413 /* len is STRLEN which is unsigned, need to copy to signed */
7416 vFAIL("Internal disaster");
7420 if (len == 1 && UNI_IS_INVARIANT(ender))
7424 RExC_size += STR_SZ(len);
7427 RExC_emit += STR_SZ(len);
7437 S_regwhite( RExC_state_t *pRExC_state, char *p )
7439 const char *e = RExC_end;
7443 else if (*p == '#') {
7452 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
7460 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
7461 Character classes ([:foo:]) can also be negated ([:^foo:]).
7462 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
7463 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
7464 but trigger failures because they are currently unimplemented. */
7466 #define POSIXCC_DONE(c) ((c) == ':')
7467 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
7468 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
7471 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
7474 I32 namedclass = OOB_NAMEDCLASS;
7476 if (value == '[' && RExC_parse + 1 < RExC_end &&
7477 /* I smell either [: or [= or [. -- POSIX has been here, right? */
7478 POSIXCC(UCHARAT(RExC_parse))) {
7479 const char c = UCHARAT(RExC_parse);
7480 char* const s = RExC_parse++;
7482 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
7484 if (RExC_parse == RExC_end)
7485 /* Grandfather lone [:, [=, [. */
7488 const char* const t = RExC_parse++; /* skip over the c */
7491 if (UCHARAT(RExC_parse) == ']') {
7492 const char *posixcc = s + 1;
7493 RExC_parse++; /* skip over the ending ] */
7496 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
7497 const I32 skip = t - posixcc;
7499 /* Initially switch on the length of the name. */
7502 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
7503 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
7506 /* Names all of length 5. */
7507 /* alnum alpha ascii blank cntrl digit graph lower
7508 print punct space upper */
7509 /* Offset 4 gives the best switch position. */
7510 switch (posixcc[4]) {
7512 if (memEQ(posixcc, "alph", 4)) /* alpha */
7513 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
7516 if (memEQ(posixcc, "spac", 4)) /* space */
7517 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
7520 if (memEQ(posixcc, "grap", 4)) /* graph */
7521 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
7524 if (memEQ(posixcc, "asci", 4)) /* ascii */
7525 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
7528 if (memEQ(posixcc, "blan", 4)) /* blank */
7529 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
7532 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
7533 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
7536 if (memEQ(posixcc, "alnu", 4)) /* alnum */
7537 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
7540 if (memEQ(posixcc, "lowe", 4)) /* lower */
7541 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
7542 else if (memEQ(posixcc, "uppe", 4)) /* upper */
7543 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
7546 if (memEQ(posixcc, "digi", 4)) /* digit */
7547 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
7548 else if (memEQ(posixcc, "prin", 4)) /* print */
7549 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
7550 else if (memEQ(posixcc, "punc", 4)) /* punct */
7551 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
7556 if (memEQ(posixcc, "xdigit", 6))
7557 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
7561 if (namedclass == OOB_NAMEDCLASS)
7562 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
7564 assert (posixcc[skip] == ':');
7565 assert (posixcc[skip+1] == ']');
7566 } else if (!SIZE_ONLY) {
7567 /* [[=foo=]] and [[.foo.]] are still future. */
7569 /* adjust RExC_parse so the warning shows after
7571 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
7573 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7576 /* Maternal grandfather:
7577 * "[:" ending in ":" but not in ":]" */
7587 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
7590 if (POSIXCC(UCHARAT(RExC_parse))) {
7591 const char *s = RExC_parse;
7592 const char c = *s++;
7596 if (*s && c == *s && s[1] == ']') {
7597 if (ckWARN(WARN_REGEXP))
7599 "POSIX syntax [%c %c] belongs inside character classes",
7602 /* [[=foo=]] and [[.foo.]] are still future. */
7603 if (POSIXCC_NOTYET(c)) {
7604 /* adjust RExC_parse so the error shows after
7606 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
7608 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7615 #define _C_C_T_(NAME,TEST,WORD) \
7618 ANYOF_CLASS_SET(ret, ANYOF_##NAME); \
7620 for (value = 0; value < 256; value++) \
7622 ANYOF_BITMAP_SET(ret, value); \
7627 case ANYOF_N##NAME: \
7629 ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \
7631 for (value = 0; value < 256; value++) \
7633 ANYOF_BITMAP_SET(ret, value); \
7639 #define _C_C_T_NOLOC_(NAME,TEST,WORD) \
7641 for (value = 0; value < 256; value++) \
7643 ANYOF_BITMAP_SET(ret, value); \
7647 case ANYOF_N##NAME: \
7648 for (value = 0; value < 256; value++) \
7650 ANYOF_BITMAP_SET(ret, value); \
7656 parse a class specification and produce either an ANYOF node that
7657 matches the pattern or if the pattern matches a single char only and
7658 that char is < 256 and we are case insensitive then we produce an
7663 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
7666 register UV nextvalue;
7667 register IV prevvalue = OOB_UNICODE;
7668 register IV range = 0;
7669 UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
7670 register regnode *ret;
7673 char *rangebegin = NULL;
7674 bool need_class = 0;
7677 bool optimize_invert = TRUE;
7678 AV* unicode_alternate = NULL;
7680 UV literal_endpoint = 0;
7682 UV stored = 0; /* number of chars stored in the class */
7684 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7685 case we need to change the emitted regop to an EXACT. */
7686 const char * orig_parse = RExC_parse;
7687 GET_RE_DEBUG_FLAGS_DECL;
7689 PERL_UNUSED_ARG(depth);
7692 DEBUG_PARSE("clas");
7694 /* Assume we are going to generate an ANYOF node. */
7695 ret = reganode(pRExC_state, ANYOF, 0);
7698 ANYOF_FLAGS(ret) = 0;
7700 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
7704 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
7708 RExC_size += ANYOF_SKIP;
7709 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
7712 RExC_emit += ANYOF_SKIP;
7714 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
7716 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
7717 ANYOF_BITMAP_ZERO(ret);
7718 listsv = newSVpvs("# comment\n");
7721 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
7723 if (!SIZE_ONLY && POSIXCC(nextvalue))
7724 checkposixcc(pRExC_state);
7726 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
7727 if (UCHARAT(RExC_parse) == ']')
7731 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
7735 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
7738 rangebegin = RExC_parse;
7740 value = utf8n_to_uvchr((U8*)RExC_parse,
7741 RExC_end - RExC_parse,
7742 &numlen, UTF8_ALLOW_DEFAULT);
7743 RExC_parse += numlen;
7746 value = UCHARAT(RExC_parse++);
7748 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
7749 if (value == '[' && POSIXCC(nextvalue))
7750 namedclass = regpposixcc(pRExC_state, value);
7751 else if (value == '\\') {
7753 value = utf8n_to_uvchr((U8*)RExC_parse,
7754 RExC_end - RExC_parse,
7755 &numlen, UTF8_ALLOW_DEFAULT);
7756 RExC_parse += numlen;
7759 value = UCHARAT(RExC_parse++);
7760 /* Some compilers cannot handle switching on 64-bit integer
7761 * values, therefore value cannot be an UV. Yes, this will
7762 * be a problem later if we want switch on Unicode.
7763 * A similar issue a little bit later when switching on
7764 * namedclass. --jhi */
7765 switch ((I32)value) {
7766 case 'w': namedclass = ANYOF_ALNUM; break;
7767 case 'W': namedclass = ANYOF_NALNUM; break;
7768 case 's': namedclass = ANYOF_SPACE; break;
7769 case 'S': namedclass = ANYOF_NSPACE; break;
7770 case 'd': namedclass = ANYOF_DIGIT; break;
7771 case 'D': namedclass = ANYOF_NDIGIT; break;
7772 case 'v': namedclass = ANYOF_VERTWS; break;
7773 case 'V': namedclass = ANYOF_NVERTWS; break;
7774 case 'h': namedclass = ANYOF_HORIZWS; break;
7775 case 'H': namedclass = ANYOF_NHORIZWS; break;
7776 case 'N': /* Handle \N{NAME} in class */
7778 /* We only pay attention to the first char of
7779 multichar strings being returned. I kinda wonder
7780 if this makes sense as it does change the behaviour
7781 from earlier versions, OTOH that behaviour was broken
7783 UV v; /* value is register so we cant & it /grrr */
7784 if (reg_namedseq(pRExC_state, &v)) {
7794 if (RExC_parse >= RExC_end)
7795 vFAIL2("Empty \\%c{}", (U8)value);
7796 if (*RExC_parse == '{') {
7797 const U8 c = (U8)value;
7798 e = strchr(RExC_parse++, '}');
7800 vFAIL2("Missing right brace on \\%c{}", c);
7801 while (isSPACE(UCHARAT(RExC_parse)))
7803 if (e == RExC_parse)
7804 vFAIL2("Empty \\%c{}", c);
7806 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
7814 if (UCHARAT(RExC_parse) == '^') {
7817 value = value == 'p' ? 'P' : 'p'; /* toggle */
7818 while (isSPACE(UCHARAT(RExC_parse))) {
7823 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
7824 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
7827 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
7828 namedclass = ANYOF_MAX; /* no official name, but it's named */
7831 case 'n': value = '\n'; break;
7832 case 'r': value = '\r'; break;
7833 case 't': value = '\t'; break;
7834 case 'f': value = '\f'; break;
7835 case 'b': value = '\b'; break;
7836 case 'e': value = ASCII_TO_NATIVE('\033');break;
7837 case 'a': value = ASCII_TO_NATIVE('\007');break;
7839 if (*RExC_parse == '{') {
7840 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7841 | PERL_SCAN_DISALLOW_PREFIX;
7842 char * const e = strchr(RExC_parse++, '}');
7844 vFAIL("Missing right brace on \\x{}");
7846 numlen = e - RExC_parse;
7847 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
7851 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
7853 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
7854 RExC_parse += numlen;
7856 if (PL_encoding && value < 0x100)
7857 goto recode_encoding;
7860 value = UCHARAT(RExC_parse++);
7861 value = toCTRL(value);
7863 case '0': case '1': case '2': case '3': case '4':
7864 case '5': case '6': case '7': case '8': case '9':
7868 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
7869 RExC_parse += numlen;
7870 if (PL_encoding && value < 0x100)
7871 goto recode_encoding;
7876 SV* enc = PL_encoding;
7877 value = reg_recode((const char)(U8)value, &enc);
7878 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
7880 "Invalid escape in the specified encoding");
7884 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
7886 "Unrecognized escape \\%c in character class passed through",
7890 } /* end of \blah */
7896 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
7898 if (!SIZE_ONLY && !need_class)
7899 ANYOF_CLASS_ZERO(ret);
7903 /* a bad range like a-\d, a-[:digit:] ? */
7906 if (ckWARN(WARN_REGEXP)) {
7908 RExC_parse >= rangebegin ?
7909 RExC_parse - rangebegin : 0;
7911 "False [] range \"%*.*s\"",
7914 if (prevvalue < 256) {
7915 ANYOF_BITMAP_SET(ret, prevvalue);
7916 ANYOF_BITMAP_SET(ret, '-');
7919 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
7920 Perl_sv_catpvf(aTHX_ listsv,
7921 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
7925 range = 0; /* this was not a true range */
7931 const char *what = NULL;
7934 if (namedclass > OOB_NAMEDCLASS)
7935 optimize_invert = FALSE;
7936 /* Possible truncation here but in some 64-bit environments
7937 * the compiler gets heartburn about switch on 64-bit values.
7938 * A similar issue a little earlier when switching on value.
7940 switch ((I32)namedclass) {
7941 case _C_C_T_(ALNUM, isALNUM(value), "Word");
7942 case _C_C_T_(ALNUMC, isALNUMC(value), "Alnum");
7943 case _C_C_T_(ALPHA, isALPHA(value), "Alpha");
7944 case _C_C_T_(BLANK, isBLANK(value), "Blank");
7945 case _C_C_T_(CNTRL, isCNTRL(value), "Cntrl");
7946 case _C_C_T_(GRAPH, isGRAPH(value), "Graph");
7947 case _C_C_T_(LOWER, isLOWER(value), "Lower");
7948 case _C_C_T_(PRINT, isPRINT(value), "Print");
7949 case _C_C_T_(PSXSPC, isPSXSPC(value), "Space");
7950 case _C_C_T_(PUNCT, isPUNCT(value), "Punct");
7951 case _C_C_T_(SPACE, isSPACE(value), "SpacePerl");
7952 case _C_C_T_(UPPER, isUPPER(value), "Upper");
7953 case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit");
7954 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
7955 case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
7958 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
7961 for (value = 0; value < 128; value++)
7962 ANYOF_BITMAP_SET(ret, value);
7964 for (value = 0; value < 256; value++) {
7966 ANYOF_BITMAP_SET(ret, value);
7975 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
7978 for (value = 128; value < 256; value++)
7979 ANYOF_BITMAP_SET(ret, value);
7981 for (value = 0; value < 256; value++) {
7982 if (!isASCII(value))
7983 ANYOF_BITMAP_SET(ret, value);
7992 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
7994 /* consecutive digits assumed */
7995 for (value = '0'; value <= '9'; value++)
7996 ANYOF_BITMAP_SET(ret, value);
8003 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
8005 /* consecutive digits assumed */
8006 for (value = 0; value < '0'; value++)
8007 ANYOF_BITMAP_SET(ret, value);
8008 for (value = '9' + 1; value < 256; value++)
8009 ANYOF_BITMAP_SET(ret, value);
8015 /* this is to handle \p and \P */
8018 vFAIL("Invalid [::] class");
8022 /* Strings such as "+utf8::isWord\n" */
8023 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
8026 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
8029 } /* end of namedclass \blah */
8032 if (prevvalue > (IV)value) /* b-a */ {
8033 const int w = RExC_parse - rangebegin;
8034 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
8035 range = 0; /* not a valid range */
8039 prevvalue = value; /* save the beginning of the range */
8040 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
8041 RExC_parse[1] != ']') {
8044 /* a bad range like \w-, [:word:]- ? */
8045 if (namedclass > OOB_NAMEDCLASS) {
8046 if (ckWARN(WARN_REGEXP)) {
8048 RExC_parse >= rangebegin ?
8049 RExC_parse - rangebegin : 0;
8051 "False [] range \"%*.*s\"",
8055 ANYOF_BITMAP_SET(ret, '-');
8057 range = 1; /* yeah, it's a range! */
8058 continue; /* but do it the next time */
8062 /* now is the next time */
8063 /*stored += (value - prevvalue + 1);*/
8065 if (prevvalue < 256) {
8066 const IV ceilvalue = value < 256 ? value : 255;
8069 /* In EBCDIC [\x89-\x91] should include
8070 * the \x8e but [i-j] should not. */
8071 if (literal_endpoint == 2 &&
8072 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
8073 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
8075 if (isLOWER(prevvalue)) {
8076 for (i = prevvalue; i <= ceilvalue; i++)
8077 if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8079 ANYOF_BITMAP_SET(ret, i);
8082 for (i = prevvalue; i <= ceilvalue; i++)
8083 if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8085 ANYOF_BITMAP_SET(ret, i);
8091 for (i = prevvalue; i <= ceilvalue; i++) {
8092 if (!ANYOF_BITMAP_TEST(ret,i)) {
8094 ANYOF_BITMAP_SET(ret, i);
8098 if (value > 255 || UTF) {
8099 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
8100 const UV natvalue = NATIVE_TO_UNI(value);
8101 stored+=2; /* can't optimize this class */
8102 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8103 if (prevnatvalue < natvalue) { /* what about > ? */
8104 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
8105 prevnatvalue, natvalue);
8107 else if (prevnatvalue == natvalue) {
8108 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
8110 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
8112 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
8114 #ifdef EBCDIC /* RD t/uni/fold ff and 6b */
8115 if (RExC_precomp[0] == ':' &&
8116 RExC_precomp[1] == '[' &&
8117 (f == 0xDF || f == 0x92)) {
8118 f = NATIVE_TO_UNI(f);
8121 /* If folding and foldable and a single
8122 * character, insert also the folded version
8123 * to the charclass. */
8125 #ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
8126 if ((RExC_precomp[0] == ':' &&
8127 RExC_precomp[1] == '[' &&
8129 (value == 0xFB05 || value == 0xFB06))) ?
8130 foldlen == ((STRLEN)UNISKIP(f) - 1) :
8131 foldlen == (STRLEN)UNISKIP(f) )
8133 if (foldlen == (STRLEN)UNISKIP(f))
8135 Perl_sv_catpvf(aTHX_ listsv,
8138 /* Any multicharacter foldings
8139 * require the following transform:
8140 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
8141 * where E folds into "pq" and F folds
8142 * into "rst", all other characters
8143 * fold to single characters. We save
8144 * away these multicharacter foldings,
8145 * to be later saved as part of the
8146 * additional "s" data. */
8149 if (!unicode_alternate)
8150 unicode_alternate = newAV();
8151 sv = newSVpvn((char*)foldbuf, foldlen);
8153 av_push(unicode_alternate, sv);
8157 /* If folding and the value is one of the Greek
8158 * sigmas insert a few more sigmas to make the
8159 * folding rules of the sigmas to work right.
8160 * Note that not all the possible combinations
8161 * are handled here: some of them are handled
8162 * by the standard folding rules, and some of
8163 * them (literal or EXACTF cases) are handled
8164 * during runtime in regexec.c:S_find_byclass(). */
8165 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
8166 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8167 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
8168 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8169 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8171 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
8172 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8173 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8178 literal_endpoint = 0;
8182 range = 0; /* this range (if it was one) is done now */
8186 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
8188 RExC_size += ANYOF_CLASS_ADD_SKIP;
8190 RExC_emit += ANYOF_CLASS_ADD_SKIP;
8196 /****** !SIZE_ONLY AFTER HERE *********/
8198 if( stored == 1 && (value < 128 || (value < 256 && !UTF))
8199 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
8201 /* optimize single char class to an EXACT node
8202 but *only* when its not a UTF/high char */
8203 const char * cur_parse= RExC_parse;
8204 RExC_emit = (regnode *)orig_emit;
8205 RExC_parse = (char *)orig_parse;
8206 ret = reg_node(pRExC_state,
8207 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
8208 RExC_parse = (char *)cur_parse;
8209 *STRING(ret)= (char)value;
8211 RExC_emit += STR_SZ(1);
8214 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
8215 if ( /* If the only flag is folding (plus possibly inversion). */
8216 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
8218 for (value = 0; value < 256; ++value) {
8219 if (ANYOF_BITMAP_TEST(ret, value)) {
8220 UV fold = PL_fold[value];
8223 ANYOF_BITMAP_SET(ret, fold);
8226 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
8229 /* optimize inverted simple patterns (e.g. [^a-z]) */
8230 if (optimize_invert &&
8231 /* If the only flag is inversion. */
8232 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
8233 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
8234 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
8235 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
8238 AV * const av = newAV();
8240 /* The 0th element stores the character class description
8241 * in its textual form: used later (regexec.c:Perl_regclass_swash())
8242 * to initialize the appropriate swash (which gets stored in
8243 * the 1st element), and also useful for dumping the regnode.
8244 * The 2nd element stores the multicharacter foldings,
8245 * used later (regexec.c:S_reginclass()). */
8246 av_store(av, 0, listsv);
8247 av_store(av, 1, NULL);
8248 av_store(av, 2, (SV*)unicode_alternate);
8249 rv = newRV_noinc((SV*)av);
8250 n = add_data(pRExC_state, 1, "s");
8251 RExC_rxi->data->data[n] = (void*)rv;
8259 /* reg_skipcomment()
8261 Absorbs an /x style # comments from the input stream.
8262 Returns true if there is more text remaining in the stream.
8263 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
8264 terminates the pattern without including a newline.
8266 Note its the callers responsibility to ensure that we are
8272 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
8275 while (RExC_parse < RExC_end)
8276 if (*RExC_parse++ == '\n') {
8281 /* we ran off the end of the pattern without ending
8282 the comment, so we have to add an \n when wrapping */
8283 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
8291 Advance that parse position, and optionall absorbs
8292 "whitespace" from the inputstream.
8294 Without /x "whitespace" means (?#...) style comments only,
8295 with /x this means (?#...) and # comments and whitespace proper.
8297 Returns the RExC_parse point from BEFORE the scan occurs.
8299 This is the /x friendly way of saying RExC_parse++.
8303 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
8305 char* const retval = RExC_parse++;
8308 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
8309 RExC_parse[2] == '#') {
8310 while (*RExC_parse != ')') {
8311 if (RExC_parse == RExC_end)
8312 FAIL("Sequence (?#... not terminated");
8318 if (RExC_flags & RXf_PMf_EXTENDED) {
8319 if (isSPACE(*RExC_parse)) {
8323 else if (*RExC_parse == '#') {
8324 if ( reg_skipcomment( pRExC_state ) )
8333 - reg_node - emit a node
8335 STATIC regnode * /* Location. */
8336 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
8339 register regnode *ptr;
8340 regnode * const ret = RExC_emit;
8341 GET_RE_DEBUG_FLAGS_DECL;
8344 SIZE_ALIGN(RExC_size);
8348 if (RExC_emit >= RExC_emit_bound)
8349 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8351 NODE_ALIGN_FILL(ret);
8353 FILL_ADVANCE_NODE(ptr, op);
8354 #ifdef RE_TRACK_PATTERN_OFFSETS
8355 if (RExC_offsets) { /* MJD */
8356 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
8357 "reg_node", __LINE__,
8359 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
8360 ? "Overwriting end of array!\n" : "OK",
8361 (UV)(RExC_emit - RExC_emit_start),
8362 (UV)(RExC_parse - RExC_start),
8363 (UV)RExC_offsets[0]));
8364 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
8372 - reganode - emit a node with an argument
8374 STATIC regnode * /* Location. */
8375 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
8378 register regnode *ptr;
8379 regnode * const ret = RExC_emit;
8380 GET_RE_DEBUG_FLAGS_DECL;
8383 SIZE_ALIGN(RExC_size);
8388 assert(2==regarglen[op]+1);
8390 Anything larger than this has to allocate the extra amount.
8391 If we changed this to be:
8393 RExC_size += (1 + regarglen[op]);
8395 then it wouldn't matter. Its not clear what side effect
8396 might come from that so its not done so far.
8401 if (RExC_emit >= RExC_emit_bound)
8402 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8404 NODE_ALIGN_FILL(ret);
8406 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
8407 #ifdef RE_TRACK_PATTERN_OFFSETS
8408 if (RExC_offsets) { /* MJD */
8409 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
8413 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
8414 "Overwriting end of array!\n" : "OK",
8415 (UV)(RExC_emit - RExC_emit_start),
8416 (UV)(RExC_parse - RExC_start),
8417 (UV)RExC_offsets[0]));
8418 Set_Cur_Node_Offset;
8426 - reguni - emit (if appropriate) a Unicode character
8429 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
8432 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
8436 - reginsert - insert an operator in front of already-emitted operand
8438 * Means relocating the operand.
8441 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
8444 register regnode *src;
8445 register regnode *dst;
8446 register regnode *place;
8447 const int offset = regarglen[(U8)op];
8448 const int size = NODE_STEP_REGNODE + offset;
8449 GET_RE_DEBUG_FLAGS_DECL;
8450 PERL_UNUSED_ARG(depth);
8451 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
8452 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
8461 if (RExC_open_parens) {
8463 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
8464 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
8465 if ( RExC_open_parens[paren] >= opnd ) {
8466 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
8467 RExC_open_parens[paren] += size;
8469 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
8471 if ( RExC_close_parens[paren] >= opnd ) {
8472 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
8473 RExC_close_parens[paren] += size;
8475 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
8480 while (src > opnd) {
8481 StructCopy(--src, --dst, regnode);
8482 #ifdef RE_TRACK_PATTERN_OFFSETS
8483 if (RExC_offsets) { /* MJD 20010112 */
8484 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
8488 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
8489 ? "Overwriting end of array!\n" : "OK",
8490 (UV)(src - RExC_emit_start),
8491 (UV)(dst - RExC_emit_start),
8492 (UV)RExC_offsets[0]));
8493 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
8494 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
8500 place = opnd; /* Op node, where operand used to be. */
8501 #ifdef RE_TRACK_PATTERN_OFFSETS
8502 if (RExC_offsets) { /* MJD */
8503 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
8507 (UV)(place - RExC_emit_start) > RExC_offsets[0]
8508 ? "Overwriting end of array!\n" : "OK",
8509 (UV)(place - RExC_emit_start),
8510 (UV)(RExC_parse - RExC_start),
8511 (UV)RExC_offsets[0]));
8512 Set_Node_Offset(place, RExC_parse);
8513 Set_Node_Length(place, 1);
8516 src = NEXTOPER(place);
8517 FILL_ADVANCE_NODE(place, op);
8518 Zero(src, offset, regnode);
8522 - regtail - set the next-pointer at the end of a node chain of p to val.
8523 - SEE ALSO: regtail_study
8525 /* TODO: All three parms should be const */
8527 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
8530 register regnode *scan;
8531 GET_RE_DEBUG_FLAGS_DECL;
8533 PERL_UNUSED_ARG(depth);
8539 /* Find last node. */
8542 regnode * const temp = regnext(scan);
8544 SV * const mysv=sv_newmortal();
8545 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
8546 regprop(RExC_rx, mysv, scan);
8547 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
8548 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
8549 (temp == NULL ? "->" : ""),
8550 (temp == NULL ? PL_reg_name[OP(val)] : "")
8558 if (reg_off_by_arg[OP(scan)]) {
8559 ARG_SET(scan, val - scan);
8562 NEXT_OFF(scan) = val - scan;
8568 - regtail_study - set the next-pointer at the end of a node chain of p to val.
8569 - Look for optimizable sequences at the same time.
8570 - currently only looks for EXACT chains.
8572 This is expermental code. The idea is to use this routine to perform
8573 in place optimizations on branches and groups as they are constructed,
8574 with the long term intention of removing optimization from study_chunk so
8575 that it is purely analytical.
8577 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
8578 to control which is which.
8581 /* TODO: All four parms should be const */
8584 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
8587 register regnode *scan;
8589 #ifdef EXPERIMENTAL_INPLACESCAN
8593 GET_RE_DEBUG_FLAGS_DECL;
8599 /* Find last node. */
8603 regnode * const temp = regnext(scan);
8604 #ifdef EXPERIMENTAL_INPLACESCAN
8605 if (PL_regkind[OP(scan)] == EXACT)
8606 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
8614 if( exact == PSEUDO )
8616 else if ( exact != OP(scan) )
8625 SV * const mysv=sv_newmortal();
8626 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
8627 regprop(RExC_rx, mysv, scan);
8628 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
8629 SvPV_nolen_const(mysv),
8631 PL_reg_name[exact]);
8638 SV * const mysv_val=sv_newmortal();
8639 DEBUG_PARSE_MSG("");
8640 regprop(RExC_rx, mysv_val, val);
8641 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
8642 SvPV_nolen_const(mysv_val),
8643 (IV)REG_NODE_NUM(val),
8647 if (reg_off_by_arg[OP(scan)]) {
8648 ARG_SET(scan, val - scan);
8651 NEXT_OFF(scan) = val - scan;
8659 - regcurly - a little FSA that accepts {\d+,?\d*}
8662 S_regcurly(register const char *s)
8681 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
8685 S_regdump_extflags(pTHX_ const char *lead, const U32 flags) {
8688 for (bit=0; bit<32; bit++) {
8689 if (flags & (1<<bit)) {
8691 PerlIO_printf(Perl_debug_log, "%s",lead);
8692 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
8697 PerlIO_printf(Perl_debug_log, "\n");
8699 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
8705 Perl_regdump(pTHX_ const regexp *r)
8709 SV * const sv = sv_newmortal();
8710 SV *dsv= sv_newmortal();
8712 GET_RE_DEBUG_FLAGS_DECL;
8714 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
8716 /* Header fields of interest. */
8717 if (r->anchored_substr) {
8718 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
8719 RE_SV_DUMPLEN(r->anchored_substr), 30);
8720 PerlIO_printf(Perl_debug_log,
8721 "anchored %s%s at %"IVdf" ",
8722 s, RE_SV_TAIL(r->anchored_substr),
8723 (IV)r->anchored_offset);
8724 } else if (r->anchored_utf8) {
8725 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
8726 RE_SV_DUMPLEN(r->anchored_utf8), 30);
8727 PerlIO_printf(Perl_debug_log,
8728 "anchored utf8 %s%s at %"IVdf" ",
8729 s, RE_SV_TAIL(r->anchored_utf8),
8730 (IV)r->anchored_offset);
8732 if (r->float_substr) {
8733 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
8734 RE_SV_DUMPLEN(r->float_substr), 30);
8735 PerlIO_printf(Perl_debug_log,
8736 "floating %s%s at %"IVdf"..%"UVuf" ",
8737 s, RE_SV_TAIL(r->float_substr),
8738 (IV)r->float_min_offset, (UV)r->float_max_offset);
8739 } else if (r->float_utf8) {
8740 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
8741 RE_SV_DUMPLEN(r->float_utf8), 30);
8742 PerlIO_printf(Perl_debug_log,
8743 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
8744 s, RE_SV_TAIL(r->float_utf8),
8745 (IV)r->float_min_offset, (UV)r->float_max_offset);
8747 if (r->check_substr || r->check_utf8)
8748 PerlIO_printf(Perl_debug_log,
8750 (r->check_substr == r->float_substr
8751 && r->check_utf8 == r->float_utf8
8752 ? "(checking floating" : "(checking anchored"));
8753 if (r->extflags & RXf_NOSCAN)
8754 PerlIO_printf(Perl_debug_log, " noscan");
8755 if (r->extflags & RXf_CHECK_ALL)
8756 PerlIO_printf(Perl_debug_log, " isall");
8757 if (r->check_substr || r->check_utf8)
8758 PerlIO_printf(Perl_debug_log, ") ");
8760 if (ri->regstclass) {
8761 regprop(r, sv, ri->regstclass);
8762 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
8764 if (r->extflags & RXf_ANCH) {
8765 PerlIO_printf(Perl_debug_log, "anchored");
8766 if (r->extflags & RXf_ANCH_BOL)
8767 PerlIO_printf(Perl_debug_log, "(BOL)");
8768 if (r->extflags & RXf_ANCH_MBOL)
8769 PerlIO_printf(Perl_debug_log, "(MBOL)");
8770 if (r->extflags & RXf_ANCH_SBOL)
8771 PerlIO_printf(Perl_debug_log, "(SBOL)");
8772 if (r->extflags & RXf_ANCH_GPOS)
8773 PerlIO_printf(Perl_debug_log, "(GPOS)");
8774 PerlIO_putc(Perl_debug_log, ' ');
8776 if (r->extflags & RXf_GPOS_SEEN)
8777 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
8778 if (r->intflags & PREGf_SKIP)
8779 PerlIO_printf(Perl_debug_log, "plus ");
8780 if (r->intflags & PREGf_IMPLICIT)
8781 PerlIO_printf(Perl_debug_log, "implicit ");
8782 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
8783 if (r->extflags & RXf_EVAL_SEEN)
8784 PerlIO_printf(Perl_debug_log, "with eval ");
8785 PerlIO_printf(Perl_debug_log, "\n");
8786 DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
8788 PERL_UNUSED_CONTEXT;
8790 #endif /* DEBUGGING */
8794 - regprop - printable representation of opcode
8797 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
8802 RXi_GET_DECL(prog,progi);
8803 GET_RE_DEBUG_FLAGS_DECL;
8806 sv_setpvn(sv, "", 0);
8808 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
8809 /* It would be nice to FAIL() here, but this may be called from
8810 regexec.c, and it would be hard to supply pRExC_state. */
8811 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
8812 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
8814 k = PL_regkind[OP(o)];
8818 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
8819 * is a crude hack but it may be the best for now since
8820 * we have no flag "this EXACTish node was UTF-8"
8822 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
8823 PERL_PV_ESCAPE_UNI_DETECT |
8824 PERL_PV_PRETTY_ELLIPSES |
8825 PERL_PV_PRETTY_LTGT |
8826 PERL_PV_PRETTY_NOCLEAR
8828 } else if (k == TRIE) {
8829 /* print the details of the trie in dumpuntil instead, as
8830 * progi->data isn't available here */
8831 const char op = OP(o);
8832 const U32 n = ARG(o);
8833 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
8834 (reg_ac_data *)progi->data->data[n] :
8836 const reg_trie_data * const trie
8837 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
8839 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
8840 DEBUG_TRIE_COMPILE_r(
8841 Perl_sv_catpvf(aTHX_ sv,
8842 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
8843 (UV)trie->startstate,
8844 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
8845 (UV)trie->wordcount,
8848 (UV)TRIE_CHARCOUNT(trie),
8849 (UV)trie->uniquecharcount
8852 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
8854 int rangestart = -1;
8855 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
8857 for (i = 0; i <= 256; i++) {
8858 if (i < 256 && BITMAP_TEST(bitmap,i)) {
8859 if (rangestart == -1)
8861 } else if (rangestart != -1) {
8862 if (i <= rangestart + 3)
8863 for (; rangestart < i; rangestart++)
8864 put_byte(sv, rangestart);
8866 put_byte(sv, rangestart);
8868 put_byte(sv, i - 1);
8876 } else if (k == CURLY) {
8877 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
8878 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
8879 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
8881 else if (k == WHILEM && o->flags) /* Ordinal/of */
8882 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
8883 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
8884 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
8885 if ( prog->paren_names ) {
8886 if ( k != REF || OP(o) < NREF) {
8887 AV *list= (AV *)progi->data->data[progi->name_list_idx];
8888 SV **name= av_fetch(list, ARG(o), 0 );
8890 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
8893 AV *list= (AV *)progi->data->data[ progi->name_list_idx ];
8894 SV *sv_dat=(SV*)progi->data->data[ ARG( o ) ];
8895 I32 *nums=(I32*)SvPVX(sv_dat);
8896 SV **name= av_fetch(list, nums[0], 0 );
8899 for ( n=0; n<SvIVX(sv_dat); n++ ) {
8900 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
8901 (n ? "," : ""), (IV)nums[n]);
8903 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
8907 } else if (k == GOSUB)
8908 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
8909 else if (k == VERB) {
8911 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
8912 SVfARG((SV*)progi->data->data[ ARG( o ) ]));
8913 } else if (k == LOGICAL)
8914 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
8915 else if (k == FOLDCHAR)
8916 Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
8917 else if (k == ANYOF) {
8918 int i, rangestart = -1;
8919 const U8 flags = ANYOF_FLAGS(o);
8921 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
8922 static const char * const anyofs[] = {
8955 if (flags & ANYOF_LOCALE)
8956 sv_catpvs(sv, "{loc}");
8957 if (flags & ANYOF_FOLD)
8958 sv_catpvs(sv, "{i}");
8959 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
8960 if (flags & ANYOF_INVERT)
8962 for (i = 0; i <= 256; i++) {
8963 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
8964 if (rangestart == -1)
8966 } else if (rangestart != -1) {
8967 if (i <= rangestart + 3)
8968 for (; rangestart < i; rangestart++)
8969 put_byte(sv, rangestart);
8971 put_byte(sv, rangestart);
8973 put_byte(sv, i - 1);
8979 if (o->flags & ANYOF_CLASS)
8980 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
8981 if (ANYOF_CLASS_TEST(o,i))
8982 sv_catpv(sv, anyofs[i]);
8984 if (flags & ANYOF_UNICODE)
8985 sv_catpvs(sv, "{unicode}");
8986 else if (flags & ANYOF_UNICODE_ALL)
8987 sv_catpvs(sv, "{unicode_all}");
8991 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
8995 U8 s[UTF8_MAXBYTES_CASE+1];
8997 for (i = 0; i <= 256; i++) { /* just the first 256 */
8998 uvchr_to_utf8(s, i);
9000 if (i < 256 && swash_fetch(sw, s, TRUE)) {
9001 if (rangestart == -1)
9003 } else if (rangestart != -1) {
9004 if (i <= rangestart + 3)
9005 for (; rangestart < i; rangestart++) {
9006 const U8 * const e = uvchr_to_utf8(s,rangestart);
9008 for(p = s; p < e; p++)
9012 const U8 *e = uvchr_to_utf8(s,rangestart);
9014 for (p = s; p < e; p++)
9017 e = uvchr_to_utf8(s, i-1);
9018 for (p = s; p < e; p++)
9025 sv_catpvs(sv, "..."); /* et cetera */
9029 char *s = savesvpv(lv);
9030 char * const origs = s;
9032 while (*s && *s != '\n')
9036 const char * const t = ++s;
9054 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
9056 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
9057 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
9059 PERL_UNUSED_CONTEXT;
9060 PERL_UNUSED_ARG(sv);
9062 PERL_UNUSED_ARG(prog);
9063 #endif /* DEBUGGING */
9067 Perl_re_intuit_string(pTHX_ REGEXP * const prog)
9068 { /* Assume that RE_INTUIT is set */
9070 GET_RE_DEBUG_FLAGS_DECL;
9071 PERL_UNUSED_CONTEXT;
9075 const char * const s = SvPV_nolen_const(prog->check_substr
9076 ? prog->check_substr : prog->check_utf8);
9078 if (!PL_colorset) reginitcolors();
9079 PerlIO_printf(Perl_debug_log,
9080 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
9082 prog->check_substr ? "" : "utf8 ",
9083 PL_colors[5],PL_colors[0],
9086 (strlen(s) > 60 ? "..." : ""));
9089 return prog->check_substr ? prog->check_substr : prog->check_utf8;
9095 handles refcounting and freeing the perl core regexp structure. When
9096 it is necessary to actually free the structure the first thing it
9097 does is call the 'free' method of the regexp_engine associated to to
9098 the regexp, allowing the handling of the void *pprivate; member
9099 first. (This routine is not overridable by extensions, which is why
9100 the extensions free is called first.)
9102 See regdupe and regdupe_internal if you change anything here.
9104 #ifndef PERL_IN_XSUB_RE
9106 Perl_pregfree(pTHX_ struct regexp *r)
9109 GET_RE_DEBUG_FLAGS_DECL;
9111 if (!r || (--r->refcnt > 0))
9114 ReREFCNT_dec(r->mother_re);
9116 CALLREGFREE_PVT(r); /* free the private data */
9118 SvREFCNT_dec(r->paren_names);
9119 Safefree(r->wrapped);
9122 if (r->anchored_substr)
9123 SvREFCNT_dec(r->anchored_substr);
9124 if (r->anchored_utf8)
9125 SvREFCNT_dec(r->anchored_utf8);
9126 if (r->float_substr)
9127 SvREFCNT_dec(r->float_substr);
9129 SvREFCNT_dec(r->float_utf8);
9130 Safefree(r->substrs);
9132 RX_MATCH_COPY_FREE(r);
9133 #ifdef PERL_OLD_COPY_ON_WRITE
9135 SvREFCNT_dec(r->saved_copy);
9144 This is a hacky workaround to the structural issue of match results
9145 being stored in the regexp structure which is in turn stored in
9146 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
9147 could be PL_curpm in multiple contexts, and could require multiple
9148 result sets being associated with the pattern simultaneously, such
9149 as when doing a recursive match with (??{$qr})
9151 The solution is to make a lightweight copy of the regexp structure
9152 when a qr// is returned from the code executed by (??{$qr}) this
9153 lightweight copy doesnt actually own any of its data except for
9154 the starp/end and the actual regexp structure itself.
9160 Perl_reg_temp_copy (pTHX_ struct regexp *r) {
9162 register const I32 npar = r->nparens+1;
9163 (void)ReREFCNT_inc(r);
9164 Newx(ret, 1, regexp);
9165 StructCopy(r, ret, regexp);
9166 Newx(ret->offs, npar, regexp_paren_pair);
9167 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
9170 Newx(ret->substrs, 1, struct reg_substr_data);
9171 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9173 SvREFCNT_inc_void(ret->anchored_substr);
9174 SvREFCNT_inc_void(ret->anchored_utf8);
9175 SvREFCNT_inc_void(ret->float_substr);
9176 SvREFCNT_inc_void(ret->float_utf8);
9178 /* check_substr and check_utf8, if non-NULL, point to either their
9179 anchored or float namesakes, and don't hold a second reference. */
9181 RX_MATCH_COPIED_off(ret);
9182 #ifdef PERL_OLD_COPY_ON_WRITE
9183 ret->saved_copy = NULL;
9192 /* regfree_internal()
9194 Free the private data in a regexp. This is overloadable by
9195 extensions. Perl takes care of the regexp structure in pregfree(),
9196 this covers the *pprivate pointer which technically perldoesnt
9197 know about, however of course we have to handle the
9198 regexp_internal structure when no extension is in use.
9200 Note this is called before freeing anything in the regexp
9205 Perl_regfree_internal(pTHX_ REGEXP * const r)
9209 GET_RE_DEBUG_FLAGS_DECL;
9215 SV *dsv= sv_newmortal();
9216 RE_PV_QUOTED_DECL(s, (r->extflags & RXf_UTF8),
9217 dsv, r->precomp, r->prelen, 60);
9218 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
9219 PL_colors[4],PL_colors[5],s);
9222 #ifdef RE_TRACK_PATTERN_OFFSETS
9224 Safefree(ri->u.offsets); /* 20010421 MJD */
9227 int n = ri->data->count;
9228 PAD* new_comppad = NULL;
9233 /* If you add a ->what type here, update the comment in regcomp.h */
9234 switch (ri->data->what[n]) {
9238 SvREFCNT_dec((SV*)ri->data->data[n]);
9241 Safefree(ri->data->data[n]);
9244 new_comppad = (AV*)ri->data->data[n];
9247 if (new_comppad == NULL)
9248 Perl_croak(aTHX_ "panic: pregfree comppad");
9249 PAD_SAVE_LOCAL(old_comppad,
9250 /* Watch out for global destruction's random ordering. */
9251 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
9254 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
9257 op_free((OP_4tree*)ri->data->data[n]);
9259 PAD_RESTORE_LOCAL(old_comppad);
9260 SvREFCNT_dec((SV*)new_comppad);
9266 { /* Aho Corasick add-on structure for a trie node.
9267 Used in stclass optimization only */
9269 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
9271 refcount = --aho->refcount;
9274 PerlMemShared_free(aho->states);
9275 PerlMemShared_free(aho->fail);
9276 /* do this last!!!! */
9277 PerlMemShared_free(ri->data->data[n]);
9278 PerlMemShared_free(ri->regstclass);
9284 /* trie structure. */
9286 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
9288 refcount = --trie->refcount;
9291 PerlMemShared_free(trie->charmap);
9292 PerlMemShared_free(trie->states);
9293 PerlMemShared_free(trie->trans);
9295 PerlMemShared_free(trie->bitmap);
9297 PerlMemShared_free(trie->wordlen);
9299 PerlMemShared_free(trie->jump);
9301 PerlMemShared_free(trie->nextword);
9302 /* do this last!!!! */
9303 PerlMemShared_free(ri->data->data[n]);
9308 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
9311 Safefree(ri->data->what);
9318 #define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t))
9319 #define av_dup_inc(s,t) (AV*)SvREFCNT_inc(sv_dup((SV*)s,t))
9320 #define hv_dup_inc(s,t) (HV*)SvREFCNT_inc(sv_dup((SV*)s,t))
9321 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
9324 re_dup - duplicate a regexp.
9326 This routine is expected to clone a given regexp structure. It is not
9327 compiler under USE_ITHREADS.
9329 After all of the core data stored in struct regexp is duplicated
9330 the regexp_engine.dupe method is used to copy any private data
9331 stored in the *pprivate pointer. This allows extensions to handle
9332 any duplication it needs to do.
9334 See pregfree() and regfree_internal() if you change anything here.
9336 #if defined(USE_ITHREADS)
9337 #ifndef PERL_IN_XSUB_RE
9339 Perl_re_dup(pTHX_ const regexp *r, CLONE_PARAMS *param)
9346 return (REGEXP *)NULL;
9348 if ((ret = (REGEXP *)ptr_table_fetch(PL_ptr_table, r)))
9352 npar = r->nparens+1;
9353 Newx(ret, 1, regexp);
9354 StructCopy(r, ret, regexp);
9355 Newx(ret->offs, npar, regexp_paren_pair);
9356 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
9358 /* no need to copy these */
9359 Newx(ret->swap, npar, regexp_paren_pair);
9363 /* Do it this way to avoid reading from *r after the StructCopy().
9364 That way, if any of the sv_dup_inc()s dislodge *r from the L1
9365 cache, it doesn't matter. */
9366 const bool anchored = r->check_substr == r->anchored_substr;
9367 Newx(ret->substrs, 1, struct reg_substr_data);
9368 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9370 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
9371 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
9372 ret->float_substr = sv_dup_inc(ret->float_substr, param);
9373 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
9375 /* check_substr and check_utf8, if non-NULL, point to either their
9376 anchored or float namesakes, and don't hold a second reference. */
9378 if (ret->check_substr) {
9380 assert(r->check_utf8 == r->anchored_utf8);
9381 ret->check_substr = ret->anchored_substr;
9382 ret->check_utf8 = ret->anchored_utf8;
9384 assert(r->check_substr == r->float_substr);
9385 assert(r->check_utf8 == r->float_utf8);
9386 ret->check_substr = ret->float_substr;
9387 ret->check_utf8 = ret->float_utf8;
9392 ret->wrapped = SAVEPVN(ret->wrapped, ret->wraplen+1);
9393 ret->precomp = ret->wrapped + (ret->precomp - ret->wrapped);
9394 ret->paren_names = hv_dup_inc(ret->paren_names, param);
9397 RXi_SET(ret,CALLREGDUPE_PVT(ret,param));
9399 if (RX_MATCH_COPIED(ret))
9400 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
9403 #ifdef PERL_OLD_COPY_ON_WRITE
9404 ret->saved_copy = NULL;
9407 ret->mother_re = NULL;
9409 ret->seen_evals = 0;
9411 ptr_table_store(PL_ptr_table, r, ret);
9414 #endif /* PERL_IN_XSUB_RE */
9419 This is the internal complement to regdupe() which is used to copy
9420 the structure pointed to by the *pprivate pointer in the regexp.
9421 This is the core version of the extension overridable cloning hook.
9422 The regexp structure being duplicated will be copied by perl prior
9423 to this and will be provided as the regexp *r argument, however
9424 with the /old/ structures pprivate pointer value. Thus this routine
9425 may override any copying normally done by perl.
9427 It returns a pointer to the new regexp_internal structure.
9431 Perl_regdupe_internal(pTHX_ REGEXP * const r, CLONE_PARAMS *param)
9434 regexp_internal *reti;
9438 npar = r->nparens+1;
9441 Newxc(reti, sizeof(regexp_internal) + (len+1)*sizeof(regnode), char, regexp_internal);
9442 Copy(ri->program, reti->program, len+1, regnode);
9445 reti->regstclass = NULL;
9449 const int count = ri->data->count;
9452 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
9453 char, struct reg_data);
9454 Newx(d->what, count, U8);
9457 for (i = 0; i < count; i++) {
9458 d->what[i] = ri->data->what[i];
9459 switch (d->what[i]) {
9460 /* legal options are one of: sSfpontTu
9461 see also regcomp.h and pregfree() */
9464 case 'p': /* actually an AV, but the dup function is identical. */
9465 case 'u': /* actually an HV, but the dup function is identical. */
9466 d->data[i] = sv_dup_inc((SV *)ri->data->data[i], param);
9469 /* This is cheating. */
9470 Newx(d->data[i], 1, struct regnode_charclass_class);
9471 StructCopy(ri->data->data[i], d->data[i],
9472 struct regnode_charclass_class);
9473 reti->regstclass = (regnode*)d->data[i];
9476 /* Compiled op trees are readonly and in shared memory,
9477 and can thus be shared without duplication. */
9479 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
9483 /* Trie stclasses are readonly and can thus be shared
9484 * without duplication. We free the stclass in pregfree
9485 * when the corresponding reg_ac_data struct is freed.
9487 reti->regstclass= ri->regstclass;
9491 ((reg_trie_data*)ri->data->data[i])->refcount++;
9495 d->data[i] = ri->data->data[i];
9498 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
9507 reti->name_list_idx = ri->name_list_idx;
9509 #ifdef RE_TRACK_PATTERN_OFFSETS
9510 if (ri->u.offsets) {
9511 Newx(reti->u.offsets, 2*len+1, U32);
9512 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
9515 SetProgLen(reti,len);
9521 #endif /* USE_ITHREADS */
9526 converts a regexp embedded in a MAGIC struct to its stringified form,
9527 caching the converted form in the struct and returns the cached
9530 If lp is nonnull then it is used to return the length of the
9533 If flags is nonnull and the returned string contains UTF8 then
9534 (*flags & 1) will be true.
9536 If haseval is nonnull then it is used to return whether the pattern
9539 Normally called via macro:
9541 CALLREG_STRINGIFY(mg,&len,&utf8);
9545 CALLREG_AS_STR(mg,&lp,&flags,&haseval)
9547 See sv_2pv_flags() in sv.c for an example of internal usage.
9550 #ifndef PERL_IN_XSUB_RE
9553 Perl_reg_stringify(pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval ) {
9555 const regexp * const re = (regexp *)mg->mg_obj;
9557 *haseval = re->seen_evals;
9559 *flags = ((re->extflags & RXf_UTF8) ? 1 : 0);
9566 - regnext - dig the "next" pointer out of a node
9569 Perl_regnext(pTHX_ register regnode *p)
9572 register I32 offset;
9577 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
9586 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
9589 STRLEN l1 = strlen(pat1);
9590 STRLEN l2 = strlen(pat2);
9593 const char *message;
9599 Copy(pat1, buf, l1 , char);
9600 Copy(pat2, buf + l1, l2 , char);
9601 buf[l1 + l2] = '\n';
9602 buf[l1 + l2 + 1] = '\0';
9604 /* ANSI variant takes additional second argument */
9605 va_start(args, pat2);
9609 msv = vmess(buf, &args);
9611 message = SvPV_const(msv,l1);
9614 Copy(message, buf, l1 , char);
9615 buf[l1-1] = '\0'; /* Overwrite \n */
9616 Perl_croak(aTHX_ "%s", buf);
9619 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
9621 #ifndef PERL_IN_XSUB_RE
9623 Perl_save_re_context(pTHX)
9627 struct re_save_state *state;
9629 SAVEVPTR(PL_curcop);
9630 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
9632 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
9633 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
9634 SSPUSHINT(SAVEt_RE_STATE);
9636 Copy(&PL_reg_state, state, 1, struct re_save_state);
9638 PL_reg_start_tmp = 0;
9639 PL_reg_start_tmpl = 0;
9640 PL_reg_oldsaved = NULL;
9641 PL_reg_oldsavedlen = 0;
9643 PL_reg_leftiter = 0;
9644 PL_reg_poscache = NULL;
9645 PL_reg_poscache_size = 0;
9646 #ifdef PERL_OLD_COPY_ON_WRITE
9650 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
9652 const REGEXP * const rx = PM_GETRE(PL_curpm);
9655 for (i = 1; i <= rx->nparens; i++) {
9656 char digits[TYPE_CHARS(long)];
9657 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
9658 GV *const *const gvp
9659 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
9662 GV * const gv = *gvp;
9663 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
9673 clear_re(pTHX_ void *r)
9676 ReREFCNT_dec((regexp *)r);
9682 S_put_byte(pTHX_ SV *sv, int c)
9684 /* Our definition of isPRINT() ignores locales, so only bytes that are
9685 not part of UTF-8 are considered printable. I assume that the same
9686 holds for UTF-EBCDIC.
9687 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
9688 which Wikipedia says:
9690 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
9691 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
9692 identical, to the ASCII delete (DEL) or rubout control character.
9693 ) So the old condition can be simplified to !isPRINT(c) */
9695 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
9697 const char string = c;
9698 if (c == '-' || c == ']' || c == '\\' || c == '^')
9699 sv_catpvs(sv, "\\");
9700 sv_catpvn(sv, &string, 1);
9705 #define CLEAR_OPTSTART \
9706 if (optstart) STMT_START { \
9707 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
9711 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
9713 STATIC const regnode *
9714 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
9715 const regnode *last, const regnode *plast,
9716 SV* sv, I32 indent, U32 depth)
9719 register U8 op = PSEUDO; /* Arbitrary non-END op. */
9720 register const regnode *next;
9721 const regnode *optstart= NULL;
9724 GET_RE_DEBUG_FLAGS_DECL;
9726 #ifdef DEBUG_DUMPUNTIL
9727 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
9728 last ? last-start : 0,plast ? plast-start : 0);
9731 if (plast && plast < last)
9734 while (PL_regkind[op] != END && (!last || node < last)) {
9735 /* While that wasn't END last time... */
9738 if (op == CLOSE || op == WHILEM)
9740 next = regnext((regnode *)node);
9743 if (OP(node) == OPTIMIZED) {
9744 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
9751 regprop(r, sv, node);
9752 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
9753 (int)(2*indent + 1), "", SvPVX_const(sv));
9755 if (OP(node) != OPTIMIZED) {
9756 if (next == NULL) /* Next ptr. */
9757 PerlIO_printf(Perl_debug_log, " (0)");
9758 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
9759 PerlIO_printf(Perl_debug_log, " (FAIL)");
9761 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
9762 (void)PerlIO_putc(Perl_debug_log, '\n');
9766 if (PL_regkind[(U8)op] == BRANCHJ) {
9769 register const regnode *nnode = (OP(next) == LONGJMP
9770 ? regnext((regnode *)next)
9772 if (last && nnode > last)
9774 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
9777 else if (PL_regkind[(U8)op] == BRANCH) {
9779 DUMPUNTIL(NEXTOPER(node), next);
9781 else if ( PL_regkind[(U8)op] == TRIE ) {
9782 const regnode *this_trie = node;
9783 const char op = OP(node);
9784 const U32 n = ARG(node);
9785 const reg_ac_data * const ac = op>=AHOCORASICK ?
9786 (reg_ac_data *)ri->data->data[n] :
9788 const reg_trie_data * const trie =
9789 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
9791 AV *const trie_words = (AV *) ri->data->data[n + TRIE_WORDS_OFFSET];
9793 const regnode *nextbranch= NULL;
9795 sv_setpvn(sv, "", 0);
9796 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
9797 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
9799 PerlIO_printf(Perl_debug_log, "%*s%s ",
9800 (int)(2*(indent+3)), "",
9801 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
9802 PL_colors[0], PL_colors[1],
9803 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
9804 PERL_PV_PRETTY_ELLIPSES |
9810 U16 dist= trie->jump[word_idx+1];
9811 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
9812 (UV)((dist ? this_trie + dist : next) - start));
9815 nextbranch= this_trie + trie->jump[0];
9816 DUMPUNTIL(this_trie + dist, nextbranch);
9818 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
9819 nextbranch= regnext((regnode *)nextbranch);
9821 PerlIO_printf(Perl_debug_log, "\n");
9824 if (last && next > last)
9829 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
9830 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
9831 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
9833 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
9835 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
9837 else if ( op == PLUS || op == STAR) {
9838 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
9840 else if (op == ANYOF) {
9841 /* arglen 1 + class block */
9842 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
9843 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
9844 node = NEXTOPER(node);
9846 else if (PL_regkind[(U8)op] == EXACT) {
9847 /* Literal string, where present. */
9848 node += NODE_SZ_STR(node) - 1;
9849 node = NEXTOPER(node);
9852 node = NEXTOPER(node);
9853 node += regarglen[(U8)op];
9855 if (op == CURLYX || op == OPEN)
9859 #ifdef DEBUG_DUMPUNTIL
9860 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
9865 #endif /* DEBUGGING */
9869 * c-indentation-style: bsd
9871 * indent-tabs-mode: t
9874 * ex: set ts=8 sts=4 sw=4 noet: