5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
8 /* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
37 * pregcomp and pregexec -- regsub and regerror are not used in perl
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
57 **** Alterations to Henry's code are...
59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007 by Larry Wall and others
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
71 #define PERL_IN_REGCOMP_C
74 #ifndef PERL_IN_XSUB_RE
79 #ifdef PERL_IN_XSUB_RE
90 # if defined(BUGGY_MSC6)
91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
92 # pragma optimize("a",off)
93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
94 # pragma optimize("w",on )
95 # endif /* BUGGY_MSC6 */
102 typedef struct RExC_state_t {
103 U32 flags; /* are we folding, multilining? */
104 char *precomp; /* uncompiled string. */
105 regexp *rx; /* perl core regexp structure */
106 regexp_internal *rxi; /* internal data for regexp object pprivate field */
107 char *start; /* Start of input for compile */
108 char *end; /* End of input for compile */
109 char *parse; /* Input-scan pointer. */
110 I32 whilem_seen; /* number of WHILEM in this expr */
111 regnode *emit_start; /* Start of emitted-code area */
112 regnode *emit_bound; /* First regnode outside of the allocated space */
113 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
114 I32 naughty; /* How bad is this pattern? */
115 I32 sawback; /* Did we see \1, ...? */
117 I32 size; /* Code size. */
118 I32 npar; /* Capture buffer count, (OPEN). */
119 I32 cpar; /* Capture buffer count, (CLOSE). */
120 I32 nestroot; /* root parens we are in - used by accept */
124 regnode **open_parens; /* pointers to open parens */
125 regnode **close_parens; /* pointers to close parens */
126 regnode *opend; /* END node in program */
127 I32 utf8; /* whether the pattern is utf8 or not */
128 I32 orig_utf8; /* whether the pattern was originally in utf8 */
129 /* XXX use this for future optimisation of case
130 * where pattern must be upgraded to utf8. */
131 HV *charnames; /* cache of named sequences */
132 HV *paren_names; /* Paren names */
134 regnode **recurse; /* Recurse regops */
135 I32 recurse_count; /* Number of recurse regops */
137 char *starttry; /* -Dr: where regtry was called. */
138 #define RExC_starttry (pRExC_state->starttry)
141 const char *lastparse;
143 AV *paren_name_list; /* idx -> name */
144 #define RExC_lastparse (pRExC_state->lastparse)
145 #define RExC_lastnum (pRExC_state->lastnum)
146 #define RExC_paren_name_list (pRExC_state->paren_name_list)
150 #define RExC_flags (pRExC_state->flags)
151 #define RExC_precomp (pRExC_state->precomp)
152 #define RExC_rx (pRExC_state->rx)
153 #define RExC_rxi (pRExC_state->rxi)
154 #define RExC_start (pRExC_state->start)
155 #define RExC_end (pRExC_state->end)
156 #define RExC_parse (pRExC_state->parse)
157 #define RExC_whilem_seen (pRExC_state->whilem_seen)
158 #ifdef RE_TRACK_PATTERN_OFFSETS
159 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
161 #define RExC_emit (pRExC_state->emit)
162 #define RExC_emit_start (pRExC_state->emit_start)
163 #define RExC_emit_bound (pRExC_state->emit_bound)
164 #define RExC_naughty (pRExC_state->naughty)
165 #define RExC_sawback (pRExC_state->sawback)
166 #define RExC_seen (pRExC_state->seen)
167 #define RExC_size (pRExC_state->size)
168 #define RExC_npar (pRExC_state->npar)
169 #define RExC_nestroot (pRExC_state->nestroot)
170 #define RExC_extralen (pRExC_state->extralen)
171 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
172 #define RExC_seen_evals (pRExC_state->seen_evals)
173 #define RExC_utf8 (pRExC_state->utf8)
174 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
175 #define RExC_charnames (pRExC_state->charnames)
176 #define RExC_open_parens (pRExC_state->open_parens)
177 #define RExC_close_parens (pRExC_state->close_parens)
178 #define RExC_opend (pRExC_state->opend)
179 #define RExC_paren_names (pRExC_state->paren_names)
180 #define RExC_recurse (pRExC_state->recurse)
181 #define RExC_recurse_count (pRExC_state->recurse_count)
184 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
185 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
186 ((*s) == '{' && regcurly(s)))
189 #undef SPSTART /* dratted cpp namespace... */
192 * Flags to be passed up and down.
194 #define WORST 0 /* Worst case. */
195 #define HASWIDTH 0x01 /* Known to match non-null strings. */
196 #define SIMPLE 0x02 /* Simple enough to be STAR/PLUS operand. */
197 #define SPSTART 0x04 /* Starts with * or +. */
198 #define TRYAGAIN 0x08 /* Weeded out a declaration. */
199 #define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
201 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
203 /* whether trie related optimizations are enabled */
204 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
205 #define TRIE_STUDY_OPT
206 #define FULL_TRIE_STUDY
212 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
213 #define PBITVAL(paren) (1 << ((paren) & 7))
214 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
215 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
216 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
219 /* About scan_data_t.
221 During optimisation we recurse through the regexp program performing
222 various inplace (keyhole style) optimisations. In addition study_chunk
223 and scan_commit populate this data structure with information about
224 what strings MUST appear in the pattern. We look for the longest
225 string that must appear for at a fixed location, and we look for the
226 longest string that may appear at a floating location. So for instance
231 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
232 strings (because they follow a .* construct). study_chunk will identify
233 both FOO and BAR as being the longest fixed and floating strings respectively.
235 The strings can be composites, for instance
239 will result in a composite fixed substring 'foo'.
241 For each string some basic information is maintained:
243 - offset or min_offset
244 This is the position the string must appear at, or not before.
245 It also implicitly (when combined with minlenp) tells us how many
246 character must match before the string we are searching.
247 Likewise when combined with minlenp and the length of the string
248 tells us how many characters must appear after the string we have
252 Only used for floating strings. This is the rightmost point that
253 the string can appear at. Ifset to I32 max it indicates that the
254 string can occur infinitely far to the right.
257 A pointer to the minimum length of the pattern that the string
258 was found inside. This is important as in the case of positive
259 lookahead or positive lookbehind we can have multiple patterns
264 The minimum length of the pattern overall is 3, the minimum length
265 of the lookahead part is 3, but the minimum length of the part that
266 will actually match is 1. So 'FOO's minimum length is 3, but the
267 minimum length for the F is 1. This is important as the minimum length
268 is used to determine offsets in front of and behind the string being
269 looked for. Since strings can be composites this is the length of the
270 pattern at the time it was commited with a scan_commit. Note that
271 the length is calculated by study_chunk, so that the minimum lengths
272 are not known until the full pattern has been compiled, thus the
273 pointer to the value.
277 In the case of lookbehind the string being searched for can be
278 offset past the start point of the final matching string.
279 If this value was just blithely removed from the min_offset it would
280 invalidate some of the calculations for how many chars must match
281 before or after (as they are derived from min_offset and minlen and
282 the length of the string being searched for).
283 When the final pattern is compiled and the data is moved from the
284 scan_data_t structure into the regexp structure the information
285 about lookbehind is factored in, with the information that would
286 have been lost precalculated in the end_shift field for the
289 The fields pos_min and pos_delta are used to store the minimum offset
290 and the delta to the maximum offset at the current point in the pattern.
294 typedef struct scan_data_t {
295 /*I32 len_min; unused */
296 /*I32 len_delta; unused */
300 I32 last_end; /* min value, <0 unless valid. */
303 SV **longest; /* Either &l_fixed, or &l_float. */
304 SV *longest_fixed; /* longest fixed string found in pattern */
305 I32 offset_fixed; /* offset where it starts */
306 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
307 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
308 SV *longest_float; /* longest floating string found in pattern */
309 I32 offset_float_min; /* earliest point in string it can appear */
310 I32 offset_float_max; /* latest point in string it can appear */
311 I32 *minlen_float; /* pointer to the minlen relevent to the string */
312 I32 lookbehind_float; /* is the position of the string modified by LB */
316 struct regnode_charclass_class *start_class;
320 * Forward declarations for pregcomp()'s friends.
323 static const scan_data_t zero_scan_data =
324 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
326 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
327 #define SF_BEFORE_SEOL 0x0001
328 #define SF_BEFORE_MEOL 0x0002
329 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
330 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
333 # define SF_FIX_SHIFT_EOL (0+2)
334 # define SF_FL_SHIFT_EOL (0+4)
336 # define SF_FIX_SHIFT_EOL (+2)
337 # define SF_FL_SHIFT_EOL (+4)
340 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
341 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
343 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
344 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
345 #define SF_IS_INF 0x0040
346 #define SF_HAS_PAR 0x0080
347 #define SF_IN_PAR 0x0100
348 #define SF_HAS_EVAL 0x0200
349 #define SCF_DO_SUBSTR 0x0400
350 #define SCF_DO_STCLASS_AND 0x0800
351 #define SCF_DO_STCLASS_OR 0x1000
352 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
353 #define SCF_WHILEM_VISITED_POS 0x2000
355 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
356 #define SCF_SEEN_ACCEPT 0x8000
358 #define UTF (RExC_utf8 != 0)
359 #define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0)
360 #define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0)
362 #define OOB_UNICODE 12345678
363 #define OOB_NAMEDCLASS -1
365 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
366 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
369 /* length of regex to show in messages that don't mark a position within */
370 #define RegexLengthToShowInErrorMessages 127
373 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
374 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
375 * op/pragma/warn/regcomp.
377 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
378 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
380 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
383 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
384 * arg. Show regex, up to a maximum length. If it's too long, chop and add
387 #define _FAIL(code) STMT_START { \
388 const char *ellipses = ""; \
389 IV len = RExC_end - RExC_precomp; \
392 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
393 if (len > RegexLengthToShowInErrorMessages) { \
394 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
395 len = RegexLengthToShowInErrorMessages - 10; \
401 #define FAIL(msg) _FAIL( \
402 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
403 msg, (int)len, RExC_precomp, ellipses))
405 #define FAIL2(msg,arg) _FAIL( \
406 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
407 arg, (int)len, RExC_precomp, ellipses))
410 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
412 #define Simple_vFAIL(m) STMT_START { \
413 const IV offset = RExC_parse - RExC_precomp; \
414 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
415 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
419 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
421 #define vFAIL(m) STMT_START { \
423 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
428 * Like Simple_vFAIL(), but accepts two arguments.
430 #define Simple_vFAIL2(m,a1) STMT_START { \
431 const IV offset = RExC_parse - RExC_precomp; \
432 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
433 (int)offset, RExC_precomp, RExC_precomp + offset); \
437 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
439 #define vFAIL2(m,a1) STMT_START { \
441 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
442 Simple_vFAIL2(m, a1); \
447 * Like Simple_vFAIL(), but accepts three arguments.
449 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
450 const IV offset = RExC_parse - RExC_precomp; \
451 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
452 (int)offset, RExC_precomp, RExC_precomp + offset); \
456 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
458 #define vFAIL3(m,a1,a2) STMT_START { \
460 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
461 Simple_vFAIL3(m, a1, a2); \
465 * Like Simple_vFAIL(), but accepts four arguments.
467 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
468 const IV offset = RExC_parse - RExC_precomp; \
469 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
470 (int)offset, RExC_precomp, RExC_precomp + offset); \
473 #define vWARN(loc,m) STMT_START { \
474 const IV offset = loc - RExC_precomp; \
475 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
476 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
479 #define vWARNdep(loc,m) STMT_START { \
480 const IV offset = loc - RExC_precomp; \
481 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
482 "%s" REPORT_LOCATION, \
483 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
487 #define vWARN2(loc, m, a1) STMT_START { \
488 const IV offset = loc - RExC_precomp; \
489 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
490 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
493 #define vWARN3(loc, m, a1, a2) STMT_START { \
494 const IV offset = loc - RExC_precomp; \
495 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
496 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
499 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
500 const IV offset = loc - RExC_precomp; \
501 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
502 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
505 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
506 const IV offset = loc - RExC_precomp; \
507 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
508 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
512 /* Allow for side effects in s */
513 #define REGC(c,s) STMT_START { \
514 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
517 /* Macros for recording node offsets. 20001227 mjd@plover.com
518 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
519 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
520 * Element 0 holds the number n.
521 * Position is 1 indexed.
523 #ifndef RE_TRACK_PATTERN_OFFSETS
524 #define Set_Node_Offset_To_R(node,byte)
525 #define Set_Node_Offset(node,byte)
526 #define Set_Cur_Node_Offset
527 #define Set_Node_Length_To_R(node,len)
528 #define Set_Node_Length(node,len)
529 #define Set_Node_Cur_Length(node)
530 #define Node_Offset(n)
531 #define Node_Length(n)
532 #define Set_Node_Offset_Length(node,offset,len)
533 #define ProgLen(ri) ri->u.proglen
534 #define SetProgLen(ri,x) ri->u.proglen = x
536 #define ProgLen(ri) ri->u.offsets[0]
537 #define SetProgLen(ri,x) ri->u.offsets[0] = x
538 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
540 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
541 __LINE__, (int)(node), (int)(byte))); \
543 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
545 RExC_offsets[2*(node)-1] = (byte); \
550 #define Set_Node_Offset(node,byte) \
551 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
552 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
554 #define Set_Node_Length_To_R(node,len) STMT_START { \
556 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
557 __LINE__, (int)(node), (int)(len))); \
559 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
561 RExC_offsets[2*(node)] = (len); \
566 #define Set_Node_Length(node,len) \
567 Set_Node_Length_To_R((node)-RExC_emit_start, len)
568 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
569 #define Set_Node_Cur_Length(node) \
570 Set_Node_Length(node, RExC_parse - parse_start)
572 /* Get offsets and lengths */
573 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
574 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
576 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
577 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
578 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
582 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
583 #define EXPERIMENTAL_INPLACESCAN
584 #endif /*RE_TRACK_PATTERN_OFFSETS*/
586 #define DEBUG_STUDYDATA(str,data,depth) \
587 DEBUG_OPTIMISE_MORE_r(if(data){ \
588 PerlIO_printf(Perl_debug_log, \
589 "%*s" str "Pos:%"IVdf"/%"IVdf \
590 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
591 (int)(depth)*2, "", \
592 (IV)((data)->pos_min), \
593 (IV)((data)->pos_delta), \
594 (UV)((data)->flags), \
595 (IV)((data)->whilem_c), \
596 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
597 is_inf ? "INF " : "" \
599 if ((data)->last_found) \
600 PerlIO_printf(Perl_debug_log, \
601 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
602 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
603 SvPVX_const((data)->last_found), \
604 (IV)((data)->last_end), \
605 (IV)((data)->last_start_min), \
606 (IV)((data)->last_start_max), \
607 ((data)->longest && \
608 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
609 SvPVX_const((data)->longest_fixed), \
610 (IV)((data)->offset_fixed), \
611 ((data)->longest && \
612 (data)->longest==&((data)->longest_float)) ? "*" : "", \
613 SvPVX_const((data)->longest_float), \
614 (IV)((data)->offset_float_min), \
615 (IV)((data)->offset_float_max) \
617 PerlIO_printf(Perl_debug_log,"\n"); \
620 static void clear_re(pTHX_ void *r);
622 /* Mark that we cannot extend a found fixed substring at this point.
623 Update the longest found anchored substring and the longest found
624 floating substrings if needed. */
627 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
629 const STRLEN l = CHR_SVLEN(data->last_found);
630 const STRLEN old_l = CHR_SVLEN(*data->longest);
631 GET_RE_DEBUG_FLAGS_DECL;
633 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
634 SvSetMagicSV(*data->longest, data->last_found);
635 if (*data->longest == data->longest_fixed) {
636 data->offset_fixed = l ? data->last_start_min : data->pos_min;
637 if (data->flags & SF_BEFORE_EOL)
639 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
641 data->flags &= ~SF_FIX_BEFORE_EOL;
642 data->minlen_fixed=minlenp;
643 data->lookbehind_fixed=0;
645 else { /* *data->longest == data->longest_float */
646 data->offset_float_min = l ? data->last_start_min : data->pos_min;
647 data->offset_float_max = (l
648 ? data->last_start_max
649 : data->pos_min + data->pos_delta);
650 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
651 data->offset_float_max = I32_MAX;
652 if (data->flags & SF_BEFORE_EOL)
654 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
656 data->flags &= ~SF_FL_BEFORE_EOL;
657 data->minlen_float=minlenp;
658 data->lookbehind_float=0;
661 SvCUR_set(data->last_found, 0);
663 SV * const sv = data->last_found;
664 if (SvUTF8(sv) && SvMAGICAL(sv)) {
665 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
671 data->flags &= ~SF_BEFORE_EOL;
672 DEBUG_STUDYDATA("commit: ",data,0);
675 /* Can match anything (initialization) */
677 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
679 ANYOF_CLASS_ZERO(cl);
680 ANYOF_BITMAP_SETALL(cl);
681 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
683 cl->flags |= ANYOF_LOCALE;
686 /* Can match anything (initialization) */
688 S_cl_is_anything(const struct regnode_charclass_class *cl)
692 for (value = 0; value <= ANYOF_MAX; value += 2)
693 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
695 if (!(cl->flags & ANYOF_UNICODE_ALL))
697 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
702 /* Can match anything (initialization) */
704 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
706 Zero(cl, 1, struct regnode_charclass_class);
708 cl_anything(pRExC_state, cl);
712 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
714 Zero(cl, 1, struct regnode_charclass_class);
716 cl_anything(pRExC_state, cl);
718 cl->flags |= ANYOF_LOCALE;
721 /* 'And' a given class with another one. Can create false positives */
722 /* We assume that cl is not inverted */
724 S_cl_and(struct regnode_charclass_class *cl,
725 const struct regnode_charclass_class *and_with)
728 assert(and_with->type == ANYOF);
729 if (!(and_with->flags & ANYOF_CLASS)
730 && !(cl->flags & ANYOF_CLASS)
731 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
732 && !(and_with->flags & ANYOF_FOLD)
733 && !(cl->flags & ANYOF_FOLD)) {
736 if (and_with->flags & ANYOF_INVERT)
737 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
738 cl->bitmap[i] &= ~and_with->bitmap[i];
740 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
741 cl->bitmap[i] &= and_with->bitmap[i];
742 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
743 if (!(and_with->flags & ANYOF_EOS))
744 cl->flags &= ~ANYOF_EOS;
746 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
747 !(and_with->flags & ANYOF_INVERT)) {
748 cl->flags &= ~ANYOF_UNICODE_ALL;
749 cl->flags |= ANYOF_UNICODE;
750 ARG_SET(cl, ARG(and_with));
752 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
753 !(and_with->flags & ANYOF_INVERT))
754 cl->flags &= ~ANYOF_UNICODE_ALL;
755 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
756 !(and_with->flags & ANYOF_INVERT))
757 cl->flags &= ~ANYOF_UNICODE;
760 /* 'OR' a given class with another one. Can create false positives */
761 /* We assume that cl is not inverted */
763 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
765 if (or_with->flags & ANYOF_INVERT) {
767 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
768 * <= (B1 | !B2) | (CL1 | !CL2)
769 * which is wasteful if CL2 is small, but we ignore CL2:
770 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
771 * XXXX Can we handle case-fold? Unclear:
772 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
773 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
775 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
776 && !(or_with->flags & ANYOF_FOLD)
777 && !(cl->flags & ANYOF_FOLD) ) {
780 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
781 cl->bitmap[i] |= ~or_with->bitmap[i];
782 } /* XXXX: logic is complicated otherwise */
784 cl_anything(pRExC_state, cl);
787 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
788 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
789 && (!(or_with->flags & ANYOF_FOLD)
790 || (cl->flags & ANYOF_FOLD)) ) {
793 /* OR char bitmap and class bitmap separately */
794 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
795 cl->bitmap[i] |= or_with->bitmap[i];
796 if (or_with->flags & ANYOF_CLASS) {
797 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
798 cl->classflags[i] |= or_with->classflags[i];
799 cl->flags |= ANYOF_CLASS;
802 else { /* XXXX: logic is complicated, leave it along for a moment. */
803 cl_anything(pRExC_state, cl);
806 if (or_with->flags & ANYOF_EOS)
807 cl->flags |= ANYOF_EOS;
809 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
810 ARG(cl) != ARG(or_with)) {
811 cl->flags |= ANYOF_UNICODE_ALL;
812 cl->flags &= ~ANYOF_UNICODE;
814 if (or_with->flags & ANYOF_UNICODE_ALL) {
815 cl->flags |= ANYOF_UNICODE_ALL;
816 cl->flags &= ~ANYOF_UNICODE;
820 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
821 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
822 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
823 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
828 dump_trie(trie,widecharmap,revcharmap)
829 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
830 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
832 These routines dump out a trie in a somewhat readable format.
833 The _interim_ variants are used for debugging the interim
834 tables that are used to generate the final compressed
835 representation which is what dump_trie expects.
837 Part of the reason for their existance is to provide a form
838 of documentation as to how the different representations function.
843 Dumps the final compressed table form of the trie to Perl_debug_log.
844 Used for debugging make_trie().
848 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
849 AV *revcharmap, U32 depth)
852 SV *sv=sv_newmortal();
853 int colwidth= widecharmap ? 6 : 4;
854 GET_RE_DEBUG_FLAGS_DECL;
857 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
858 (int)depth * 2 + 2,"",
859 "Match","Base","Ofs" );
861 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
862 SV ** const tmp = av_fetch( revcharmap, state, 0);
864 PerlIO_printf( Perl_debug_log, "%*s",
866 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
867 PL_colors[0], PL_colors[1],
868 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
869 PERL_PV_ESCAPE_FIRSTCHAR
874 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
875 (int)depth * 2 + 2,"");
877 for( state = 0 ; state < trie->uniquecharcount ; state++ )
878 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
879 PerlIO_printf( Perl_debug_log, "\n");
881 for( state = 1 ; state < trie->statecount ; state++ ) {
882 const U32 base = trie->states[ state ].trans.base;
884 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
886 if ( trie->states[ state ].wordnum ) {
887 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
889 PerlIO_printf( Perl_debug_log, "%6s", "" );
892 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
897 while( ( base + ofs < trie->uniquecharcount ) ||
898 ( base + ofs - trie->uniquecharcount < trie->lasttrans
899 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
902 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
904 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
905 if ( ( base + ofs >= trie->uniquecharcount ) &&
906 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
907 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
909 PerlIO_printf( Perl_debug_log, "%*"UVXf,
911 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
913 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
917 PerlIO_printf( Perl_debug_log, "]");
920 PerlIO_printf( Perl_debug_log, "\n" );
924 Dumps a fully constructed but uncompressed trie in list form.
925 List tries normally only are used for construction when the number of
926 possible chars (trie->uniquecharcount) is very high.
927 Used for debugging make_trie().
930 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
931 HV *widecharmap, AV *revcharmap, U32 next_alloc,
935 SV *sv=sv_newmortal();
936 int colwidth= widecharmap ? 6 : 4;
937 GET_RE_DEBUG_FLAGS_DECL;
938 /* print out the table precompression. */
939 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
940 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
941 "------:-----+-----------------\n" );
943 for( state=1 ; state < next_alloc ; state ++ ) {
946 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
947 (int)depth * 2 + 2,"", (UV)state );
948 if ( ! trie->states[ state ].wordnum ) {
949 PerlIO_printf( Perl_debug_log, "%5s| ","");
951 PerlIO_printf( Perl_debug_log, "W%4x| ",
952 trie->states[ state ].wordnum
955 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
956 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
958 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
960 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
961 PL_colors[0], PL_colors[1],
962 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
963 PERL_PV_ESCAPE_FIRSTCHAR
965 TRIE_LIST_ITEM(state,charid).forid,
966 (UV)TRIE_LIST_ITEM(state,charid).newstate
969 PerlIO_printf(Perl_debug_log, "\n%*s| ",
970 (int)((depth * 2) + 14), "");
973 PerlIO_printf( Perl_debug_log, "\n");
978 Dumps a fully constructed but uncompressed trie in table form.
979 This is the normal DFA style state transition table, with a few
980 twists to facilitate compression later.
981 Used for debugging make_trie().
984 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
985 HV *widecharmap, AV *revcharmap, U32 next_alloc,
990 SV *sv=sv_newmortal();
991 int colwidth= widecharmap ? 6 : 4;
992 GET_RE_DEBUG_FLAGS_DECL;
995 print out the table precompression so that we can do a visual check
996 that they are identical.
999 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1001 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1002 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1004 PerlIO_printf( Perl_debug_log, "%*s",
1006 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1007 PL_colors[0], PL_colors[1],
1008 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1009 PERL_PV_ESCAPE_FIRSTCHAR
1015 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1017 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1018 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1021 PerlIO_printf( Perl_debug_log, "\n" );
1023 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1025 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1026 (int)depth * 2 + 2,"",
1027 (UV)TRIE_NODENUM( state ) );
1029 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1030 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1032 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1034 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1036 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1037 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1039 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1040 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1047 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1048 startbranch: the first branch in the whole branch sequence
1049 first : start branch of sequence of branch-exact nodes.
1050 May be the same as startbranch
1051 last : Thing following the last branch.
1052 May be the same as tail.
1053 tail : item following the branch sequence
1054 count : words in the sequence
1055 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1056 depth : indent depth
1058 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1060 A trie is an N'ary tree where the branches are determined by digital
1061 decomposition of the key. IE, at the root node you look up the 1st character and
1062 follow that branch repeat until you find the end of the branches. Nodes can be
1063 marked as "accepting" meaning they represent a complete word. Eg:
1067 would convert into the following structure. Numbers represent states, letters
1068 following numbers represent valid transitions on the letter from that state, if
1069 the number is in square brackets it represents an accepting state, otherwise it
1070 will be in parenthesis.
1072 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1076 (1) +-i->(6)-+-s->[7]
1078 +-s->(3)-+-h->(4)-+-e->[5]
1080 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1082 This shows that when matching against the string 'hers' we will begin at state 1
1083 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1084 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1085 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1086 single traverse. We store a mapping from accepting to state to which word was
1087 matched, and then when we have multiple possibilities we try to complete the
1088 rest of the regex in the order in which they occured in the alternation.
1090 The only prior NFA like behaviour that would be changed by the TRIE support is
1091 the silent ignoring of duplicate alternations which are of the form:
1093 / (DUPE|DUPE) X? (?{ ... }) Y /x
1095 Thus EVAL blocks follwing a trie may be called a different number of times with
1096 and without the optimisation. With the optimisations dupes will be silently
1097 ignored. This inconsistant behaviour of EVAL type nodes is well established as
1098 the following demonstrates:
1100 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1102 which prints out 'word' three times, but
1104 'words'=~/(word|word|word)(?{ print $1 })S/
1106 which doesnt print it out at all. This is due to other optimisations kicking in.
1108 Example of what happens on a structural level:
1110 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1112 1: CURLYM[1] {1,32767}(18)
1123 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1124 and should turn into:
1126 1: CURLYM[1] {1,32767}(18)
1128 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1136 Cases where tail != last would be like /(?foo|bar)baz/:
1146 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1147 and would end up looking like:
1150 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1157 d = uvuni_to_utf8_flags(d, uv, 0);
1159 is the recommended Unicode-aware way of saying
1164 #define TRIE_STORE_REVCHAR \
1167 SV *zlopp = newSV(2); \
1168 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1169 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
1170 SvCUR_set(zlopp, kapow - flrbbbbb); \
1173 av_push(revcharmap, zlopp); \
1175 char ooooff = (char)uvc; \
1176 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1180 #define TRIE_READ_CHAR STMT_START { \
1184 if ( foldlen > 0 ) { \
1185 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1190 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1191 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1192 foldlen -= UNISKIP( uvc ); \
1193 scan = foldbuf + UNISKIP( uvc ); \
1196 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1206 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1207 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1208 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1209 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1211 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1212 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1213 TRIE_LIST_CUR( state )++; \
1216 #define TRIE_LIST_NEW(state) STMT_START { \
1217 Newxz( trie->states[ state ].trans.list, \
1218 4, reg_trie_trans_le ); \
1219 TRIE_LIST_CUR( state ) = 1; \
1220 TRIE_LIST_LEN( state ) = 4; \
1223 #define TRIE_HANDLE_WORD(state) STMT_START { \
1224 U16 dupe= trie->states[ state ].wordnum; \
1225 regnode * const noper_next = regnext( noper ); \
1227 if (trie->wordlen) \
1228 trie->wordlen[ curword ] = wordlen; \
1230 /* store the word for dumping */ \
1232 if (OP(noper) != NOTHING) \
1233 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
1235 tmp = newSVpvn( "", 0 ); \
1236 if ( UTF ) SvUTF8_on( tmp ); \
1237 av_push( trie_words, tmp ); \
1242 if ( noper_next < tail ) { \
1244 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1245 trie->jump[curword] = (U16)(noper_next - convert); \
1247 jumper = noper_next; \
1249 nextbranch= regnext(cur); \
1253 /* So it's a dupe. This means we need to maintain a */\
1254 /* linked-list from the first to the next. */\
1255 /* we only allocate the nextword buffer when there */\
1256 /* a dupe, so first time we have to do the allocation */\
1257 if (!trie->nextword) \
1258 trie->nextword = (U16 *) \
1259 PerlMemShared_calloc( word_count + 1, sizeof(U16)); \
1260 while ( trie->nextword[dupe] ) \
1261 dupe= trie->nextword[dupe]; \
1262 trie->nextword[dupe]= curword; \
1264 /* we haven't inserted this word yet. */ \
1265 trie->states[ state ].wordnum = curword; \
1270 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1271 ( ( base + charid >= ucharcount \
1272 && base + charid < ubound \
1273 && state == trie->trans[ base - ucharcount + charid ].check \
1274 && trie->trans[ base - ucharcount + charid ].next ) \
1275 ? trie->trans[ base - ucharcount + charid ].next \
1276 : ( state==1 ? special : 0 ) \
1280 #define MADE_JUMP_TRIE 2
1281 #define MADE_EXACT_TRIE 4
1284 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1287 /* first pass, loop through and scan words */
1288 reg_trie_data *trie;
1289 HV *widecharmap = NULL;
1290 AV *revcharmap = newAV();
1292 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1297 regnode *jumper = NULL;
1298 regnode *nextbranch = NULL;
1299 regnode *convert = NULL;
1300 /* we just use folder as a flag in utf8 */
1301 const U8 * const folder = ( flags == EXACTF
1303 : ( flags == EXACTFL
1310 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1311 AV *trie_words = NULL;
1312 /* along with revcharmap, this only used during construction but both are
1313 * useful during debugging so we store them in the struct when debugging.
1316 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1317 STRLEN trie_charcount=0;
1319 SV *re_trie_maxbuff;
1320 GET_RE_DEBUG_FLAGS_DECL;
1322 PERL_UNUSED_ARG(depth);
1325 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1327 trie->startstate = 1;
1328 trie->wordcount = word_count;
1329 RExC_rxi->data->data[ data_slot ] = (void*)trie;
1330 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1331 if (!(UTF && folder))
1332 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1334 trie_words = newAV();
1337 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1338 if (!SvIOK(re_trie_maxbuff)) {
1339 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1342 PerlIO_printf( Perl_debug_log,
1343 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1344 (int)depth * 2 + 2, "",
1345 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1346 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1350 /* Find the node we are going to overwrite */
1351 if ( first == startbranch && OP( last ) != BRANCH ) {
1352 /* whole branch chain */
1355 /* branch sub-chain */
1356 convert = NEXTOPER( first );
1359 /* -- First loop and Setup --
1361 We first traverse the branches and scan each word to determine if it
1362 contains widechars, and how many unique chars there are, this is
1363 important as we have to build a table with at least as many columns as we
1366 We use an array of integers to represent the character codes 0..255
1367 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1368 native representation of the character value as the key and IV's for the
1371 *TODO* If we keep track of how many times each character is used we can
1372 remap the columns so that the table compression later on is more
1373 efficient in terms of memory by ensuring most common value is in the
1374 middle and the least common are on the outside. IMO this would be better
1375 than a most to least common mapping as theres a decent chance the most
1376 common letter will share a node with the least common, meaning the node
1377 will not be compressable. With a middle is most common approach the worst
1378 case is when we have the least common nodes twice.
1382 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1383 regnode * const noper = NEXTOPER( cur );
1384 const U8 *uc = (U8*)STRING( noper );
1385 const U8 * const e = uc + STR_LEN( noper );
1387 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1388 const U8 *scan = (U8*)NULL;
1389 U32 wordlen = 0; /* required init */
1391 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1393 if (OP(noper) == NOTHING) {
1397 if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1398 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1399 regardless of encoding */
1401 for ( ; uc < e ; uc += len ) {
1402 TRIE_CHARCOUNT(trie)++;
1406 if ( !trie->charmap[ uvc ] ) {
1407 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1409 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1413 /* store the codepoint in the bitmap, and if its ascii
1414 also store its folded equivelent. */
1415 TRIE_BITMAP_SET(trie,uvc);
1417 /* store the folded codepoint */
1418 if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1421 /* store first byte of utf8 representation of
1422 codepoints in the 127 < uvc < 256 range */
1423 if (127 < uvc && uvc < 192) {
1424 TRIE_BITMAP_SET(trie,194);
1425 } else if (191 < uvc ) {
1426 TRIE_BITMAP_SET(trie,195);
1427 /* && uvc < 256 -- we know uvc is < 256 already */
1430 set_bit = 0; /* We've done our bit :-) */
1435 widecharmap = newHV();
1437 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1440 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1442 if ( !SvTRUE( *svpp ) ) {
1443 sv_setiv( *svpp, ++trie->uniquecharcount );
1448 if( cur == first ) {
1451 } else if (chars < trie->minlen) {
1453 } else if (chars > trie->maxlen) {
1457 } /* end first pass */
1458 DEBUG_TRIE_COMPILE_r(
1459 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1460 (int)depth * 2 + 2,"",
1461 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1462 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1463 (int)trie->minlen, (int)trie->maxlen )
1465 trie->wordlen = (U32 *) PerlMemShared_calloc( word_count, sizeof(U32) );
1468 We now know what we are dealing with in terms of unique chars and
1469 string sizes so we can calculate how much memory a naive
1470 representation using a flat table will take. If it's over a reasonable
1471 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1472 conservative but potentially much slower representation using an array
1475 At the end we convert both representations into the same compressed
1476 form that will be used in regexec.c for matching with. The latter
1477 is a form that cannot be used to construct with but has memory
1478 properties similar to the list form and access properties similar
1479 to the table form making it both suitable for fast searches and
1480 small enough that its feasable to store for the duration of a program.
1482 See the comment in the code where the compressed table is produced
1483 inplace from the flat tabe representation for an explanation of how
1484 the compression works.
1489 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1491 Second Pass -- Array Of Lists Representation
1493 Each state will be represented by a list of charid:state records
1494 (reg_trie_trans_le) the first such element holds the CUR and LEN
1495 points of the allocated array. (See defines above).
1497 We build the initial structure using the lists, and then convert
1498 it into the compressed table form which allows faster lookups
1499 (but cant be modified once converted).
1502 STRLEN transcount = 1;
1504 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1505 "%*sCompiling trie using list compiler\n",
1506 (int)depth * 2 + 2, ""));
1508 trie->states = (reg_trie_state *)
1509 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1510 sizeof(reg_trie_state) );
1514 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1516 regnode * const noper = NEXTOPER( cur );
1517 U8 *uc = (U8*)STRING( noper );
1518 const U8 * const e = uc + STR_LEN( noper );
1519 U32 state = 1; /* required init */
1520 U16 charid = 0; /* sanity init */
1521 U8 *scan = (U8*)NULL; /* sanity init */
1522 STRLEN foldlen = 0; /* required init */
1523 U32 wordlen = 0; /* required init */
1524 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1526 if (OP(noper) != NOTHING) {
1527 for ( ; uc < e ; uc += len ) {
1532 charid = trie->charmap[ uvc ];
1534 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1538 charid=(U16)SvIV( *svpp );
1541 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1548 if ( !trie->states[ state ].trans.list ) {
1549 TRIE_LIST_NEW( state );
1551 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1552 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1553 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1558 newstate = next_alloc++;
1559 TRIE_LIST_PUSH( state, charid, newstate );
1564 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1568 TRIE_HANDLE_WORD(state);
1570 } /* end second pass */
1572 /* next alloc is the NEXT state to be allocated */
1573 trie->statecount = next_alloc;
1574 trie->states = (reg_trie_state *)
1575 PerlMemShared_realloc( trie->states,
1577 * sizeof(reg_trie_state) );
1579 /* and now dump it out before we compress it */
1580 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1581 revcharmap, next_alloc,
1585 trie->trans = (reg_trie_trans *)
1586 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1593 for( state=1 ; state < next_alloc ; state ++ ) {
1597 DEBUG_TRIE_COMPILE_MORE_r(
1598 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1602 if (trie->states[state].trans.list) {
1603 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1607 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1608 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1609 if ( forid < minid ) {
1611 } else if ( forid > maxid ) {
1615 if ( transcount < tp + maxid - minid + 1) {
1617 trie->trans = (reg_trie_trans *)
1618 PerlMemShared_realloc( trie->trans,
1620 * sizeof(reg_trie_trans) );
1621 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1623 base = trie->uniquecharcount + tp - minid;
1624 if ( maxid == minid ) {
1626 for ( ; zp < tp ; zp++ ) {
1627 if ( ! trie->trans[ zp ].next ) {
1628 base = trie->uniquecharcount + zp - minid;
1629 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1630 trie->trans[ zp ].check = state;
1636 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1637 trie->trans[ tp ].check = state;
1642 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1643 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1644 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1645 trie->trans[ tid ].check = state;
1647 tp += ( maxid - minid + 1 );
1649 Safefree(trie->states[ state ].trans.list);
1652 DEBUG_TRIE_COMPILE_MORE_r(
1653 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1656 trie->states[ state ].trans.base=base;
1658 trie->lasttrans = tp + 1;
1662 Second Pass -- Flat Table Representation.
1664 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1665 We know that we will need Charcount+1 trans at most to store the data
1666 (one row per char at worst case) So we preallocate both structures
1667 assuming worst case.
1669 We then construct the trie using only the .next slots of the entry
1672 We use the .check field of the first entry of the node temporarily to
1673 make compression both faster and easier by keeping track of how many non
1674 zero fields are in the node.
1676 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1679 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1680 number representing the first entry of the node, and state as a
1681 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1682 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1683 are 2 entrys per node. eg:
1691 The table is internally in the right hand, idx form. However as we also
1692 have to deal with the states array which is indexed by nodenum we have to
1693 use TRIE_NODENUM() to convert.
1696 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1697 "%*sCompiling trie using table compiler\n",
1698 (int)depth * 2 + 2, ""));
1700 trie->trans = (reg_trie_trans *)
1701 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1702 * trie->uniquecharcount + 1,
1703 sizeof(reg_trie_trans) );
1704 trie->states = (reg_trie_state *)
1705 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1706 sizeof(reg_trie_state) );
1707 next_alloc = trie->uniquecharcount + 1;
1710 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1712 regnode * const noper = NEXTOPER( cur );
1713 const U8 *uc = (U8*)STRING( noper );
1714 const U8 * const e = uc + STR_LEN( noper );
1716 U32 state = 1; /* required init */
1718 U16 charid = 0; /* sanity init */
1719 U32 accept_state = 0; /* sanity init */
1720 U8 *scan = (U8*)NULL; /* sanity init */
1722 STRLEN foldlen = 0; /* required init */
1723 U32 wordlen = 0; /* required init */
1724 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1726 if ( OP(noper) != NOTHING ) {
1727 for ( ; uc < e ; uc += len ) {
1732 charid = trie->charmap[ uvc ];
1734 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1735 charid = svpp ? (U16)SvIV(*svpp) : 0;
1739 if ( !trie->trans[ state + charid ].next ) {
1740 trie->trans[ state + charid ].next = next_alloc;
1741 trie->trans[ state ].check++;
1742 next_alloc += trie->uniquecharcount;
1744 state = trie->trans[ state + charid ].next;
1746 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1748 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1751 accept_state = TRIE_NODENUM( state );
1752 TRIE_HANDLE_WORD(accept_state);
1754 } /* end second pass */
1756 /* and now dump it out before we compress it */
1757 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1759 next_alloc, depth+1));
1763 * Inplace compress the table.*
1765 For sparse data sets the table constructed by the trie algorithm will
1766 be mostly 0/FAIL transitions or to put it another way mostly empty.
1767 (Note that leaf nodes will not contain any transitions.)
1769 This algorithm compresses the tables by eliminating most such
1770 transitions, at the cost of a modest bit of extra work during lookup:
1772 - Each states[] entry contains a .base field which indicates the
1773 index in the state[] array wheres its transition data is stored.
1775 - If .base is 0 there are no valid transitions from that node.
1777 - If .base is nonzero then charid is added to it to find an entry in
1780 -If trans[states[state].base+charid].check!=state then the
1781 transition is taken to be a 0/Fail transition. Thus if there are fail
1782 transitions at the front of the node then the .base offset will point
1783 somewhere inside the previous nodes data (or maybe even into a node
1784 even earlier), but the .check field determines if the transition is
1788 The following process inplace converts the table to the compressed
1789 table: We first do not compress the root node 1,and mark its all its
1790 .check pointers as 1 and set its .base pointer as 1 as well. This
1791 allows to do a DFA construction from the compressed table later, and
1792 ensures that any .base pointers we calculate later are greater than
1795 - We set 'pos' to indicate the first entry of the second node.
1797 - We then iterate over the columns of the node, finding the first and
1798 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1799 and set the .check pointers accordingly, and advance pos
1800 appropriately and repreat for the next node. Note that when we copy
1801 the next pointers we have to convert them from the original
1802 NODEIDX form to NODENUM form as the former is not valid post
1805 - If a node has no transitions used we mark its base as 0 and do not
1806 advance the pos pointer.
1808 - If a node only has one transition we use a second pointer into the
1809 structure to fill in allocated fail transitions from other states.
1810 This pointer is independent of the main pointer and scans forward
1811 looking for null transitions that are allocated to a state. When it
1812 finds one it writes the single transition into the "hole". If the
1813 pointer doesnt find one the single transition is appended as normal.
1815 - Once compressed we can Renew/realloc the structures to release the
1818 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1819 specifically Fig 3.47 and the associated pseudocode.
1823 const U32 laststate = TRIE_NODENUM( next_alloc );
1826 trie->statecount = laststate;
1828 for ( state = 1 ; state < laststate ; state++ ) {
1830 const U32 stateidx = TRIE_NODEIDX( state );
1831 const U32 o_used = trie->trans[ stateidx ].check;
1832 U32 used = trie->trans[ stateidx ].check;
1833 trie->trans[ stateidx ].check = 0;
1835 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1836 if ( flag || trie->trans[ stateidx + charid ].next ) {
1837 if ( trie->trans[ stateidx + charid ].next ) {
1839 for ( ; zp < pos ; zp++ ) {
1840 if ( ! trie->trans[ zp ].next ) {
1844 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1845 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1846 trie->trans[ zp ].check = state;
1847 if ( ++zp > pos ) pos = zp;
1854 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1856 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1857 trie->trans[ pos ].check = state;
1862 trie->lasttrans = pos + 1;
1863 trie->states = (reg_trie_state *)
1864 PerlMemShared_realloc( trie->states, laststate
1865 * sizeof(reg_trie_state) );
1866 DEBUG_TRIE_COMPILE_MORE_r(
1867 PerlIO_printf( Perl_debug_log,
1868 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1869 (int)depth * 2 + 2,"",
1870 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1873 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1876 } /* end table compress */
1878 DEBUG_TRIE_COMPILE_MORE_r(
1879 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1880 (int)depth * 2 + 2, "",
1881 (UV)trie->statecount,
1882 (UV)trie->lasttrans)
1884 /* resize the trans array to remove unused space */
1885 trie->trans = (reg_trie_trans *)
1886 PerlMemShared_realloc( trie->trans, trie->lasttrans
1887 * sizeof(reg_trie_trans) );
1889 /* and now dump out the compressed format */
1890 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
1892 { /* Modify the program and insert the new TRIE node*/
1893 U8 nodetype =(U8)(flags & 0xFF);
1897 regnode *optimize = NULL;
1898 #ifdef RE_TRACK_PATTERN_OFFSETS
1901 U32 mjd_nodelen = 0;
1902 #endif /* RE_TRACK_PATTERN_OFFSETS */
1903 #endif /* DEBUGGING */
1905 This means we convert either the first branch or the first Exact,
1906 depending on whether the thing following (in 'last') is a branch
1907 or not and whther first is the startbranch (ie is it a sub part of
1908 the alternation or is it the whole thing.)
1909 Assuming its a sub part we conver the EXACT otherwise we convert
1910 the whole branch sequence, including the first.
1912 /* Find the node we are going to overwrite */
1913 if ( first != startbranch || OP( last ) == BRANCH ) {
1914 /* branch sub-chain */
1915 NEXT_OFF( first ) = (U16)(last - first);
1916 #ifdef RE_TRACK_PATTERN_OFFSETS
1918 mjd_offset= Node_Offset((convert));
1919 mjd_nodelen= Node_Length((convert));
1922 /* whole branch chain */
1924 #ifdef RE_TRACK_PATTERN_OFFSETS
1927 const regnode *nop = NEXTOPER( convert );
1928 mjd_offset= Node_Offset((nop));
1929 mjd_nodelen= Node_Length((nop));
1933 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1934 (int)depth * 2 + 2, "",
1935 (UV)mjd_offset, (UV)mjd_nodelen)
1938 /* But first we check to see if there is a common prefix we can
1939 split out as an EXACT and put in front of the TRIE node. */
1940 trie->startstate= 1;
1941 if ( trie->bitmap && !widecharmap && !trie->jump ) {
1943 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
1947 const U32 base = trie->states[ state ].trans.base;
1949 if ( trie->states[state].wordnum )
1952 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1953 if ( ( base + ofs >= trie->uniquecharcount ) &&
1954 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1955 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1957 if ( ++count > 1 ) {
1958 SV **tmp = av_fetch( revcharmap, ofs, 0);
1959 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
1960 if ( state == 1 ) break;
1962 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1964 PerlIO_printf(Perl_debug_log,
1965 "%*sNew Start State=%"UVuf" Class: [",
1966 (int)depth * 2 + 2, "",
1969 SV ** const tmp = av_fetch( revcharmap, idx, 0);
1970 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
1972 TRIE_BITMAP_SET(trie,*ch);
1974 TRIE_BITMAP_SET(trie, folder[ *ch ]);
1976 PerlIO_printf(Perl_debug_log, (char*)ch)
1980 TRIE_BITMAP_SET(trie,*ch);
1982 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1983 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1989 SV **tmp = av_fetch( revcharmap, idx, 0);
1991 char *ch = SvPV( *tmp, len );
1993 SV *sv=sv_newmortal();
1994 PerlIO_printf( Perl_debug_log,
1995 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1996 (int)depth * 2 + 2, "",
1998 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
1999 PL_colors[0], PL_colors[1],
2000 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2001 PERL_PV_ESCAPE_FIRSTCHAR
2006 OP( convert ) = nodetype;
2007 str=STRING(convert);
2010 STR_LEN(convert) += len;
2016 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2022 regnode *n = convert+NODE_SZ_STR(convert);
2023 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2024 trie->startstate = state;
2025 trie->minlen -= (state - 1);
2026 trie->maxlen -= (state - 1);
2028 regnode *fix = convert;
2029 U32 word = trie->wordcount;
2031 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2032 while( ++fix < n ) {
2033 Set_Node_Offset_Length(fix, 0, 0);
2036 SV ** const tmp = av_fetch( trie_words, word, 0 );
2038 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2039 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2041 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2048 NEXT_OFF(convert) = (U16)(tail - convert);
2049 DEBUG_r(optimize= n);
2055 if ( trie->maxlen ) {
2056 NEXT_OFF( convert ) = (U16)(tail - convert);
2057 ARG_SET( convert, data_slot );
2058 /* Store the offset to the first unabsorbed branch in
2059 jump[0], which is otherwise unused by the jump logic.
2060 We use this when dumping a trie and during optimisation. */
2062 trie->jump[0] = (U16)(nextbranch - convert);
2065 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
2066 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2068 OP( convert ) = TRIEC;
2069 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2070 PerlMemShared_free(trie->bitmap);
2073 OP( convert ) = TRIE;
2075 /* store the type in the flags */
2076 convert->flags = nodetype;
2080 + regarglen[ OP( convert ) ];
2082 /* XXX We really should free up the resource in trie now,
2083 as we won't use them - (which resources?) dmq */
2085 /* needed for dumping*/
2086 DEBUG_r(if (optimize) {
2087 regnode *opt = convert;
2089 while ( ++opt < optimize) {
2090 Set_Node_Offset_Length(opt,0,0);
2093 Try to clean up some of the debris left after the
2096 while( optimize < jumper ) {
2097 mjd_nodelen += Node_Length((optimize));
2098 OP( optimize ) = OPTIMIZED;
2099 Set_Node_Offset_Length(optimize,0,0);
2102 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2104 } /* end node insert */
2105 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2107 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2108 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2110 SvREFCNT_dec(revcharmap);
2114 : trie->startstate>1
2120 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2122 /* The Trie is constructed and compressed now so we can build a fail array now if its needed
2124 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2125 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2128 We find the fail state for each state in the trie, this state is the longest proper
2129 suffix of the current states 'word' that is also a proper prefix of another word in our
2130 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2131 the DFA not to have to restart after its tried and failed a word at a given point, it
2132 simply continues as though it had been matching the other word in the first place.
2134 'abcdgu'=~/abcdefg|cdgu/
2135 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2136 fail, which would bring use to the state representing 'd' in the second word where we would
2137 try 'g' and succeed, prodceding to match 'cdgu'.
2139 /* add a fail transition */
2140 const U32 trie_offset = ARG(source);
2141 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2143 const U32 ucharcount = trie->uniquecharcount;
2144 const U32 numstates = trie->statecount;
2145 const U32 ubound = trie->lasttrans + ucharcount;
2149 U32 base = trie->states[ 1 ].trans.base;
2152 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2153 GET_RE_DEBUG_FLAGS_DECL;
2155 PERL_UNUSED_ARG(depth);
2159 ARG_SET( stclass, data_slot );
2160 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2161 RExC_rxi->data->data[ data_slot ] = (void*)aho;
2162 aho->trie=trie_offset;
2163 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2164 Copy( trie->states, aho->states, numstates, reg_trie_state );
2165 Newxz( q, numstates, U32);
2166 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2169 /* initialize fail[0..1] to be 1 so that we always have
2170 a valid final fail state */
2171 fail[ 0 ] = fail[ 1 ] = 1;
2173 for ( charid = 0; charid < ucharcount ; charid++ ) {
2174 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2176 q[ q_write ] = newstate;
2177 /* set to point at the root */
2178 fail[ q[ q_write++ ] ]=1;
2181 while ( q_read < q_write) {
2182 const U32 cur = q[ q_read++ % numstates ];
2183 base = trie->states[ cur ].trans.base;
2185 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2186 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2188 U32 fail_state = cur;
2191 fail_state = fail[ fail_state ];
2192 fail_base = aho->states[ fail_state ].trans.base;
2193 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2195 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2196 fail[ ch_state ] = fail_state;
2197 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2199 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2201 q[ q_write++ % numstates] = ch_state;
2205 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2206 when we fail in state 1, this allows us to use the
2207 charclass scan to find a valid start char. This is based on the principle
2208 that theres a good chance the string being searched contains lots of stuff
2209 that cant be a start char.
2211 fail[ 0 ] = fail[ 1 ] = 0;
2212 DEBUG_TRIE_COMPILE_r({
2213 PerlIO_printf(Perl_debug_log,
2214 "%*sStclass Failtable (%"UVuf" states): 0",
2215 (int)(depth * 2), "", (UV)numstates
2217 for( q_read=1; q_read<numstates; q_read++ ) {
2218 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2220 PerlIO_printf(Perl_debug_log, "\n");
2223 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2228 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2229 * These need to be revisited when a newer toolchain becomes available.
2231 #if defined(__sparc64__) && defined(__GNUC__)
2232 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2233 # undef SPARC64_GCC_WORKAROUND
2234 # define SPARC64_GCC_WORKAROUND 1
2238 #define DEBUG_PEEP(str,scan,depth) \
2239 DEBUG_OPTIMISE_r({if (scan){ \
2240 SV * const mysv=sv_newmortal(); \
2241 regnode *Next = regnext(scan); \
2242 regprop(RExC_rx, mysv, scan); \
2243 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2244 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2245 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2252 #define JOIN_EXACT(scan,min,flags) \
2253 if (PL_regkind[OP(scan)] == EXACT) \
2254 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2257 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2258 /* Merge several consecutive EXACTish nodes into one. */
2259 regnode *n = regnext(scan);
2261 regnode *next = scan + NODE_SZ_STR(scan);
2265 regnode *stop = scan;
2266 GET_RE_DEBUG_FLAGS_DECL;
2268 PERL_UNUSED_ARG(depth);
2270 #ifndef EXPERIMENTAL_INPLACESCAN
2271 PERL_UNUSED_ARG(flags);
2272 PERL_UNUSED_ARG(val);
2274 DEBUG_PEEP("join",scan,depth);
2276 /* Skip NOTHING, merge EXACT*. */
2278 ( PL_regkind[OP(n)] == NOTHING ||
2279 (stringok && (OP(n) == OP(scan))))
2281 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2283 if (OP(n) == TAIL || n > next)
2285 if (PL_regkind[OP(n)] == NOTHING) {
2286 DEBUG_PEEP("skip:",n,depth);
2287 NEXT_OFF(scan) += NEXT_OFF(n);
2288 next = n + NODE_STEP_REGNODE;
2295 else if (stringok) {
2296 const unsigned int oldl = STR_LEN(scan);
2297 regnode * const nnext = regnext(n);
2299 DEBUG_PEEP("merg",n,depth);
2302 if (oldl + STR_LEN(n) > U8_MAX)
2304 NEXT_OFF(scan) += NEXT_OFF(n);
2305 STR_LEN(scan) += STR_LEN(n);
2306 next = n + NODE_SZ_STR(n);
2307 /* Now we can overwrite *n : */
2308 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2316 #ifdef EXPERIMENTAL_INPLACESCAN
2317 if (flags && !NEXT_OFF(n)) {
2318 DEBUG_PEEP("atch", val, depth);
2319 if (reg_off_by_arg[OP(n)]) {
2320 ARG_SET(n, val - n);
2323 NEXT_OFF(n) = val - n;
2330 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2332 Two problematic code points in Unicode casefolding of EXACT nodes:
2334 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2335 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2341 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2342 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2344 This means that in case-insensitive matching (or "loose matching",
2345 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2346 length of the above casefolded versions) can match a target string
2347 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2348 This would rather mess up the minimum length computation.
2350 What we'll do is to look for the tail four bytes, and then peek
2351 at the preceding two bytes to see whether we need to decrease
2352 the minimum length by four (six minus two).
2354 Thanks to the design of UTF-8, there cannot be false matches:
2355 A sequence of valid UTF-8 bytes cannot be a subsequence of
2356 another valid sequence of UTF-8 bytes.
2359 char * const s0 = STRING(scan), *s, *t;
2360 char * const s1 = s0 + STR_LEN(scan) - 1;
2361 char * const s2 = s1 - 4;
2362 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2363 const char t0[] = "\xaf\x49\xaf\x42";
2365 const char t0[] = "\xcc\x88\xcc\x81";
2367 const char * const t1 = t0 + 3;
2370 s < s2 && (t = ninstr(s, s1, t0, t1));
2373 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2374 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2376 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2377 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2385 n = scan + NODE_SZ_STR(scan);
2387 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2394 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2398 /* REx optimizer. Converts nodes into quickier variants "in place".
2399 Finds fixed substrings. */
2401 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2402 to the position after last scanned or to NULL. */
2404 #define INIT_AND_WITHP \
2405 assert(!and_withp); \
2406 Newx(and_withp,1,struct regnode_charclass_class); \
2407 SAVEFREEPV(and_withp)
2409 /* this is a chain of data about sub patterns we are processing that
2410 need to be handled seperately/specially in study_chunk. Its so
2411 we can simulate recursion without losing state. */
2413 typedef struct scan_frame {
2414 regnode *last; /* last node to process in this frame */
2415 regnode *next; /* next node to process when last is reached */
2416 struct scan_frame *prev; /*previous frame*/
2417 I32 stop; /* what stopparen do we use */
2421 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2423 #define CASE_SYNST_FNC(nAmE) \
2425 if (flags & SCF_DO_STCLASS_AND) { \
2426 for (value = 0; value < 256; value++) \
2427 if (!is_ ## nAmE ## _cp(value)) \
2428 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2431 for (value = 0; value < 256; value++) \
2432 if (is_ ## nAmE ## _cp(value)) \
2433 ANYOF_BITMAP_SET(data->start_class, value); \
2437 if (flags & SCF_DO_STCLASS_AND) { \
2438 for (value = 0; value < 256; value++) \
2439 if (is_ ## nAmE ## _cp(value)) \
2440 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2443 for (value = 0; value < 256; value++) \
2444 if (!is_ ## nAmE ## _cp(value)) \
2445 ANYOF_BITMAP_SET(data->start_class, value); \
2452 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2453 I32 *minlenp, I32 *deltap,
2458 struct regnode_charclass_class *and_withp,
2459 U32 flags, U32 depth)
2460 /* scanp: Start here (read-write). */
2461 /* deltap: Write maxlen-minlen here. */
2462 /* last: Stop before this one. */
2463 /* data: string data about the pattern */
2464 /* stopparen: treat close N as END */
2465 /* recursed: which subroutines have we recursed into */
2466 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2469 I32 min = 0, pars = 0, code;
2470 regnode *scan = *scanp, *next;
2472 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2473 int is_inf_internal = 0; /* The studied chunk is infinite */
2474 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2475 scan_data_t data_fake;
2476 SV *re_trie_maxbuff = NULL;
2477 regnode *first_non_open = scan;
2478 I32 stopmin = I32_MAX;
2479 scan_frame *frame = NULL;
2481 GET_RE_DEBUG_FLAGS_DECL;
2484 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2488 while (first_non_open && OP(first_non_open) == OPEN)
2489 first_non_open=regnext(first_non_open);
2494 while ( scan && OP(scan) != END && scan < last ){
2495 /* Peephole optimizer: */
2496 DEBUG_STUDYDATA("Peep:", data,depth);
2497 DEBUG_PEEP("Peep",scan,depth);
2498 JOIN_EXACT(scan,&min,0);
2500 /* Follow the next-chain of the current node and optimize
2501 away all the NOTHINGs from it. */
2502 if (OP(scan) != CURLYX) {
2503 const int max = (reg_off_by_arg[OP(scan)]
2505 /* I32 may be smaller than U16 on CRAYs! */
2506 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2507 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2511 /* Skip NOTHING and LONGJMP. */
2512 while ((n = regnext(n))
2513 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2514 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2515 && off + noff < max)
2517 if (reg_off_by_arg[OP(scan)])
2520 NEXT_OFF(scan) = off;
2525 /* The principal pseudo-switch. Cannot be a switch, since we
2526 look into several different things. */
2527 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2528 || OP(scan) == IFTHEN) {
2529 next = regnext(scan);
2531 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2533 if (OP(next) == code || code == IFTHEN) {
2534 /* NOTE - There is similar code to this block below for handling
2535 TRIE nodes on a re-study. If you change stuff here check there
2537 I32 max1 = 0, min1 = I32_MAX, num = 0;
2538 struct regnode_charclass_class accum;
2539 regnode * const startbranch=scan;
2541 if (flags & SCF_DO_SUBSTR)
2542 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2543 if (flags & SCF_DO_STCLASS)
2544 cl_init_zero(pRExC_state, &accum);
2546 while (OP(scan) == code) {
2547 I32 deltanext, minnext, f = 0, fake;
2548 struct regnode_charclass_class this_class;
2551 data_fake.flags = 0;
2553 data_fake.whilem_c = data->whilem_c;
2554 data_fake.last_closep = data->last_closep;
2557 data_fake.last_closep = &fake;
2559 data_fake.pos_delta = delta;
2560 next = regnext(scan);
2561 scan = NEXTOPER(scan);
2563 scan = NEXTOPER(scan);
2564 if (flags & SCF_DO_STCLASS) {
2565 cl_init(pRExC_state, &this_class);
2566 data_fake.start_class = &this_class;
2567 f = SCF_DO_STCLASS_AND;
2569 if (flags & SCF_WHILEM_VISITED_POS)
2570 f |= SCF_WHILEM_VISITED_POS;
2572 /* we suppose the run is continuous, last=next...*/
2573 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2575 stopparen, recursed, NULL, f,depth+1);
2578 if (max1 < minnext + deltanext)
2579 max1 = minnext + deltanext;
2580 if (deltanext == I32_MAX)
2581 is_inf = is_inf_internal = 1;
2583 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2585 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2586 if ( stopmin > minnext)
2587 stopmin = min + min1;
2588 flags &= ~SCF_DO_SUBSTR;
2590 data->flags |= SCF_SEEN_ACCEPT;
2593 if (data_fake.flags & SF_HAS_EVAL)
2594 data->flags |= SF_HAS_EVAL;
2595 data->whilem_c = data_fake.whilem_c;
2597 if (flags & SCF_DO_STCLASS)
2598 cl_or(pRExC_state, &accum, &this_class);
2600 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2602 if (flags & SCF_DO_SUBSTR) {
2603 data->pos_min += min1;
2604 data->pos_delta += max1 - min1;
2605 if (max1 != min1 || is_inf)
2606 data->longest = &(data->longest_float);
2609 delta += max1 - min1;
2610 if (flags & SCF_DO_STCLASS_OR) {
2611 cl_or(pRExC_state, data->start_class, &accum);
2613 cl_and(data->start_class, and_withp);
2614 flags &= ~SCF_DO_STCLASS;
2617 else if (flags & SCF_DO_STCLASS_AND) {
2619 cl_and(data->start_class, &accum);
2620 flags &= ~SCF_DO_STCLASS;
2623 /* Switch to OR mode: cache the old value of
2624 * data->start_class */
2626 StructCopy(data->start_class, and_withp,
2627 struct regnode_charclass_class);
2628 flags &= ~SCF_DO_STCLASS_AND;
2629 StructCopy(&accum, data->start_class,
2630 struct regnode_charclass_class);
2631 flags |= SCF_DO_STCLASS_OR;
2632 data->start_class->flags |= ANYOF_EOS;
2636 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2639 Assuming this was/is a branch we are dealing with: 'scan' now
2640 points at the item that follows the branch sequence, whatever
2641 it is. We now start at the beginning of the sequence and look
2648 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2650 If we can find such a subseqence we need to turn the first
2651 element into a trie and then add the subsequent branch exact
2652 strings to the trie.
2656 1. patterns where the whole set of branch can be converted.
2658 2. patterns where only a subset can be converted.
2660 In case 1 we can replace the whole set with a single regop
2661 for the trie. In case 2 we need to keep the start and end
2664 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2665 becomes BRANCH TRIE; BRANCH X;
2667 There is an additional case, that being where there is a
2668 common prefix, which gets split out into an EXACT like node
2669 preceding the TRIE node.
2671 If x(1..n)==tail then we can do a simple trie, if not we make
2672 a "jump" trie, such that when we match the appropriate word
2673 we "jump" to the appopriate tail node. Essentailly we turn
2674 a nested if into a case structure of sorts.
2679 if (!re_trie_maxbuff) {
2680 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2681 if (!SvIOK(re_trie_maxbuff))
2682 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2684 if ( SvIV(re_trie_maxbuff)>=0 ) {
2686 regnode *first = (regnode *)NULL;
2687 regnode *last = (regnode *)NULL;
2688 regnode *tail = scan;
2693 SV * const mysv = sv_newmortal(); /* for dumping */
2695 /* var tail is used because there may be a TAIL
2696 regop in the way. Ie, the exacts will point to the
2697 thing following the TAIL, but the last branch will
2698 point at the TAIL. So we advance tail. If we
2699 have nested (?:) we may have to move through several
2703 while ( OP( tail ) == TAIL ) {
2704 /* this is the TAIL generated by (?:) */
2705 tail = regnext( tail );
2710 regprop(RExC_rx, mysv, tail );
2711 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2712 (int)depth * 2 + 2, "",
2713 "Looking for TRIE'able sequences. Tail node is: ",
2714 SvPV_nolen_const( mysv )
2720 step through the branches, cur represents each
2721 branch, noper is the first thing to be matched
2722 as part of that branch and noper_next is the
2723 regnext() of that node. if noper is an EXACT
2724 and noper_next is the same as scan (our current
2725 position in the regex) then the EXACT branch is
2726 a possible optimization target. Once we have
2727 two or more consequetive such branches we can
2728 create a trie of the EXACT's contents and stich
2729 it in place. If the sequence represents all of
2730 the branches we eliminate the whole thing and
2731 replace it with a single TRIE. If it is a
2732 subsequence then we need to stitch it in. This
2733 means the first branch has to remain, and needs
2734 to be repointed at the item on the branch chain
2735 following the last branch optimized. This could
2736 be either a BRANCH, in which case the
2737 subsequence is internal, or it could be the
2738 item following the branch sequence in which
2739 case the subsequence is at the end.
2743 /* dont use tail as the end marker for this traverse */
2744 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2745 regnode * const noper = NEXTOPER( cur );
2746 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
2747 regnode * const noper_next = regnext( noper );
2751 regprop(RExC_rx, mysv, cur);
2752 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2753 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2755 regprop(RExC_rx, mysv, noper);
2756 PerlIO_printf( Perl_debug_log, " -> %s",
2757 SvPV_nolen_const(mysv));
2760 regprop(RExC_rx, mysv, noper_next );
2761 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2762 SvPV_nolen_const(mysv));
2764 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2765 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2767 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2768 : PL_regkind[ OP( noper ) ] == EXACT )
2769 || OP(noper) == NOTHING )
2771 && noper_next == tail
2776 if ( !first || optype == NOTHING ) {
2777 if (!first) first = cur;
2778 optype = OP( noper );
2784 make_trie( pRExC_state,
2785 startbranch, first, cur, tail, count,
2788 if ( PL_regkind[ OP( noper ) ] == EXACT
2790 && noper_next == tail
2795 optype = OP( noper );
2805 regprop(RExC_rx, mysv, cur);
2806 PerlIO_printf( Perl_debug_log,
2807 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2808 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2812 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
2813 #ifdef TRIE_STUDY_OPT
2814 if ( ((made == MADE_EXACT_TRIE &&
2815 startbranch == first)
2816 || ( first_non_open == first )) &&
2818 flags |= SCF_TRIE_RESTUDY;
2819 if ( startbranch == first
2822 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2832 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2833 scan = NEXTOPER(NEXTOPER(scan));
2834 } else /* single branch is optimized. */
2835 scan = NEXTOPER(scan);
2837 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2838 scan_frame *newframe = NULL;
2843 if (OP(scan) != SUSPEND) {
2844 /* set the pointer */
2845 if (OP(scan) == GOSUB) {
2847 RExC_recurse[ARG2L(scan)] = scan;
2848 start = RExC_open_parens[paren-1];
2849 end = RExC_close_parens[paren-1];
2852 start = RExC_rxi->program + 1;
2856 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2857 SAVEFREEPV(recursed);
2859 if (!PAREN_TEST(recursed,paren+1)) {
2860 PAREN_SET(recursed,paren+1);
2861 Newx(newframe,1,scan_frame);
2863 if (flags & SCF_DO_SUBSTR) {
2864 SCAN_COMMIT(pRExC_state,data,minlenp);
2865 data->longest = &(data->longest_float);
2867 is_inf = is_inf_internal = 1;
2868 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2869 cl_anything(pRExC_state, data->start_class);
2870 flags &= ~SCF_DO_STCLASS;
2873 Newx(newframe,1,scan_frame);
2876 end = regnext(scan);
2881 SAVEFREEPV(newframe);
2882 newframe->next = regnext(scan);
2883 newframe->last = last;
2884 newframe->stop = stopparen;
2885 newframe->prev = frame;
2895 else if (OP(scan) == EXACT) {
2896 I32 l = STR_LEN(scan);
2899 const U8 * const s = (U8*)STRING(scan);
2900 l = utf8_length(s, s + l);
2901 uc = utf8_to_uvchr(s, NULL);
2903 uc = *((U8*)STRING(scan));
2906 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2907 /* The code below prefers earlier match for fixed
2908 offset, later match for variable offset. */
2909 if (data->last_end == -1) { /* Update the start info. */
2910 data->last_start_min = data->pos_min;
2911 data->last_start_max = is_inf
2912 ? I32_MAX : data->pos_min + data->pos_delta;
2914 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2916 SvUTF8_on(data->last_found);
2918 SV * const sv = data->last_found;
2919 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2920 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2921 if (mg && mg->mg_len >= 0)
2922 mg->mg_len += utf8_length((U8*)STRING(scan),
2923 (U8*)STRING(scan)+STR_LEN(scan));
2925 data->last_end = data->pos_min + l;
2926 data->pos_min += l; /* As in the first entry. */
2927 data->flags &= ~SF_BEFORE_EOL;
2929 if (flags & SCF_DO_STCLASS_AND) {
2930 /* Check whether it is compatible with what we know already! */
2934 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2935 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2936 && (!(data->start_class->flags & ANYOF_FOLD)
2937 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2940 ANYOF_CLASS_ZERO(data->start_class);
2941 ANYOF_BITMAP_ZERO(data->start_class);
2943 ANYOF_BITMAP_SET(data->start_class, uc);
2944 data->start_class->flags &= ~ANYOF_EOS;
2946 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2948 else if (flags & SCF_DO_STCLASS_OR) {
2949 /* false positive possible if the class is case-folded */
2951 ANYOF_BITMAP_SET(data->start_class, uc);
2953 data->start_class->flags |= ANYOF_UNICODE_ALL;
2954 data->start_class->flags &= ~ANYOF_EOS;
2955 cl_and(data->start_class, and_withp);
2957 flags &= ~SCF_DO_STCLASS;
2959 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
2960 I32 l = STR_LEN(scan);
2961 UV uc = *((U8*)STRING(scan));
2963 /* Search for fixed substrings supports EXACT only. */
2964 if (flags & SCF_DO_SUBSTR) {
2966 SCAN_COMMIT(pRExC_state, data, minlenp);
2969 const U8 * const s = (U8 *)STRING(scan);
2970 l = utf8_length(s, s + l);
2971 uc = utf8_to_uvchr(s, NULL);
2974 if (flags & SCF_DO_SUBSTR)
2976 if (flags & SCF_DO_STCLASS_AND) {
2977 /* Check whether it is compatible with what we know already! */
2981 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2982 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2983 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2985 ANYOF_CLASS_ZERO(data->start_class);
2986 ANYOF_BITMAP_ZERO(data->start_class);
2988 ANYOF_BITMAP_SET(data->start_class, uc);
2989 data->start_class->flags &= ~ANYOF_EOS;
2990 data->start_class->flags |= ANYOF_FOLD;
2991 if (OP(scan) == EXACTFL)
2992 data->start_class->flags |= ANYOF_LOCALE;
2995 else if (flags & SCF_DO_STCLASS_OR) {
2996 if (data->start_class->flags & ANYOF_FOLD) {
2997 /* false positive possible if the class is case-folded.
2998 Assume that the locale settings are the same... */
3000 ANYOF_BITMAP_SET(data->start_class, uc);
3001 data->start_class->flags &= ~ANYOF_EOS;
3003 cl_and(data->start_class, and_withp);
3005 flags &= ~SCF_DO_STCLASS;
3007 else if (strchr((const char*)PL_varies,OP(scan))) {
3008 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3009 I32 f = flags, pos_before = 0;
3010 regnode * const oscan = scan;
3011 struct regnode_charclass_class this_class;
3012 struct regnode_charclass_class *oclass = NULL;
3013 I32 next_is_eval = 0;
3015 switch (PL_regkind[OP(scan)]) {
3016 case WHILEM: /* End of (?:...)* . */
3017 scan = NEXTOPER(scan);
3020 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3021 next = NEXTOPER(scan);
3022 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3024 maxcount = REG_INFTY;
3025 next = regnext(scan);
3026 scan = NEXTOPER(scan);
3030 if (flags & SCF_DO_SUBSTR)
3035 if (flags & SCF_DO_STCLASS) {
3037 maxcount = REG_INFTY;
3038 next = regnext(scan);
3039 scan = NEXTOPER(scan);
3042 is_inf = is_inf_internal = 1;
3043 scan = regnext(scan);
3044 if (flags & SCF_DO_SUBSTR) {
3045 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3046 data->longest = &(data->longest_float);
3048 goto optimize_curly_tail;
3050 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3051 && (scan->flags == stopparen))
3056 mincount = ARG1(scan);
3057 maxcount = ARG2(scan);
3059 next = regnext(scan);
3060 if (OP(scan) == CURLYX) {
3061 I32 lp = (data ? *(data->last_closep) : 0);
3062 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3064 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3065 next_is_eval = (OP(scan) == EVAL);
3067 if (flags & SCF_DO_SUBSTR) {
3068 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3069 pos_before = data->pos_min;
3073 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3075 data->flags |= SF_IS_INF;
3077 if (flags & SCF_DO_STCLASS) {
3078 cl_init(pRExC_state, &this_class);
3079 oclass = data->start_class;
3080 data->start_class = &this_class;
3081 f |= SCF_DO_STCLASS_AND;
3082 f &= ~SCF_DO_STCLASS_OR;
3084 /* These are the cases when once a subexpression
3085 fails at a particular position, it cannot succeed
3086 even after backtracking at the enclosing scope.
3088 XXXX what if minimal match and we are at the
3089 initial run of {n,m}? */
3090 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
3091 f &= ~SCF_WHILEM_VISITED_POS;
3093 /* This will finish on WHILEM, setting scan, or on NULL: */
3094 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3095 last, data, stopparen, recursed, NULL,
3097 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3099 if (flags & SCF_DO_STCLASS)
3100 data->start_class = oclass;
3101 if (mincount == 0 || minnext == 0) {
3102 if (flags & SCF_DO_STCLASS_OR) {
3103 cl_or(pRExC_state, data->start_class, &this_class);
3105 else if (flags & SCF_DO_STCLASS_AND) {
3106 /* Switch to OR mode: cache the old value of
3107 * data->start_class */
3109 StructCopy(data->start_class, and_withp,
3110 struct regnode_charclass_class);
3111 flags &= ~SCF_DO_STCLASS_AND;
3112 StructCopy(&this_class, data->start_class,
3113 struct regnode_charclass_class);
3114 flags |= SCF_DO_STCLASS_OR;
3115 data->start_class->flags |= ANYOF_EOS;
3117 } else { /* Non-zero len */
3118 if (flags & SCF_DO_STCLASS_OR) {
3119 cl_or(pRExC_state, data->start_class, &this_class);
3120 cl_and(data->start_class, and_withp);
3122 else if (flags & SCF_DO_STCLASS_AND)
3123 cl_and(data->start_class, &this_class);
3124 flags &= ~SCF_DO_STCLASS;
3126 if (!scan) /* It was not CURLYX, but CURLY. */
3128 if ( /* ? quantifier ok, except for (?{ ... }) */
3129 (next_is_eval || !(mincount == 0 && maxcount == 1))
3130 && (minnext == 0) && (deltanext == 0)
3131 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3132 && maxcount <= REG_INFTY/3 /* Complement check for big count */
3133 && ckWARN(WARN_REGEXP))
3136 "Quantifier unexpected on zero-length expression");
3139 min += minnext * mincount;
3140 is_inf_internal |= ((maxcount == REG_INFTY
3141 && (minnext + deltanext) > 0)
3142 || deltanext == I32_MAX);
3143 is_inf |= is_inf_internal;
3144 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3146 /* Try powerful optimization CURLYX => CURLYN. */
3147 if ( OP(oscan) == CURLYX && data
3148 && data->flags & SF_IN_PAR
3149 && !(data->flags & SF_HAS_EVAL)
3150 && !deltanext && minnext == 1 ) {
3151 /* Try to optimize to CURLYN. */
3152 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3153 regnode * const nxt1 = nxt;
3160 if (!strchr((const char*)PL_simple,OP(nxt))
3161 && !(PL_regkind[OP(nxt)] == EXACT
3162 && STR_LEN(nxt) == 1))
3168 if (OP(nxt) != CLOSE)
3170 if (RExC_open_parens) {
3171 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3172 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3174 /* Now we know that nxt2 is the only contents: */
3175 oscan->flags = (U8)ARG(nxt);
3177 OP(nxt1) = NOTHING; /* was OPEN. */
3180 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3181 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
3182 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
3183 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3184 OP(nxt + 1) = OPTIMIZED; /* was count. */
3185 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
3190 /* Try optimization CURLYX => CURLYM. */
3191 if ( OP(oscan) == CURLYX && data
3192 && !(data->flags & SF_HAS_PAR)
3193 && !(data->flags & SF_HAS_EVAL)
3194 && !deltanext /* atom is fixed width */
3195 && minnext != 0 /* CURLYM can't handle zero width */
3197 /* XXXX How to optimize if data == 0? */
3198 /* Optimize to a simpler form. */
3199 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3203 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3204 && (OP(nxt2) != WHILEM))
3206 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3207 /* Need to optimize away parenths. */
3208 if (data->flags & SF_IN_PAR) {
3209 /* Set the parenth number. */
3210 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3212 if (OP(nxt) != CLOSE)
3213 FAIL("Panic opt close");
3214 oscan->flags = (U8)ARG(nxt);
3215 if (RExC_open_parens) {
3216 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3217 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3219 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3220 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3223 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3224 OP(nxt + 1) = OPTIMIZED; /* was count. */
3225 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3226 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
3229 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3230 regnode *nnxt = regnext(nxt1);
3233 if (reg_off_by_arg[OP(nxt1)])
3234 ARG_SET(nxt1, nxt2 - nxt1);
3235 else if (nxt2 - nxt1 < U16_MAX)
3236 NEXT_OFF(nxt1) = nxt2 - nxt1;
3238 OP(nxt) = NOTHING; /* Cannot beautify */
3243 /* Optimize again: */
3244 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3245 NULL, stopparen, recursed, NULL, 0,depth+1);
3250 else if ((OP(oscan) == CURLYX)
3251 && (flags & SCF_WHILEM_VISITED_POS)
3252 /* See the comment on a similar expression above.
3253 However, this time it not a subexpression
3254 we care about, but the expression itself. */
3255 && (maxcount == REG_INFTY)
3256 && data && ++data->whilem_c < 16) {
3257 /* This stays as CURLYX, we can put the count/of pair. */
3258 /* Find WHILEM (as in regexec.c) */
3259 regnode *nxt = oscan + NEXT_OFF(oscan);
3261 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3263 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3264 | (RExC_whilem_seen << 4)); /* On WHILEM */
3266 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3268 if (flags & SCF_DO_SUBSTR) {
3269 SV *last_str = NULL;
3270 int counted = mincount != 0;
3272 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3273 #if defined(SPARC64_GCC_WORKAROUND)
3276 const char *s = NULL;
3279 if (pos_before >= data->last_start_min)
3282 b = data->last_start_min;
3285 s = SvPV_const(data->last_found, l);
3286 old = b - data->last_start_min;
3289 I32 b = pos_before >= data->last_start_min
3290 ? pos_before : data->last_start_min;
3292 const char * const s = SvPV_const(data->last_found, l);
3293 I32 old = b - data->last_start_min;
3297 old = utf8_hop((U8*)s, old) - (U8*)s;
3300 /* Get the added string: */
3301 last_str = newSVpvn(s + old, l);
3303 SvUTF8_on(last_str);
3304 if (deltanext == 0 && pos_before == b) {
3305 /* What was added is a constant string */
3307 SvGROW(last_str, (mincount * l) + 1);
3308 repeatcpy(SvPVX(last_str) + l,
3309 SvPVX_const(last_str), l, mincount - 1);
3310 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3311 /* Add additional parts. */
3312 SvCUR_set(data->last_found,
3313 SvCUR(data->last_found) - l);
3314 sv_catsv(data->last_found, last_str);
3316 SV * sv = data->last_found;
3318 SvUTF8(sv) && SvMAGICAL(sv) ?
3319 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3320 if (mg && mg->mg_len >= 0)
3321 mg->mg_len += CHR_SVLEN(last_str) - l;
3323 data->last_end += l * (mincount - 1);
3326 /* start offset must point into the last copy */
3327 data->last_start_min += minnext * (mincount - 1);
3328 data->last_start_max += is_inf ? I32_MAX
3329 : (maxcount - 1) * (minnext + data->pos_delta);
3332 /* It is counted once already... */
3333 data->pos_min += minnext * (mincount - counted);
3334 data->pos_delta += - counted * deltanext +
3335 (minnext + deltanext) * maxcount - minnext * mincount;
3336 if (mincount != maxcount) {
3337 /* Cannot extend fixed substrings found inside
3339 SCAN_COMMIT(pRExC_state,data,minlenp);
3340 if (mincount && last_str) {
3341 SV * const sv = data->last_found;
3342 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3343 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3347 sv_setsv(sv, last_str);
3348 data->last_end = data->pos_min;
3349 data->last_start_min =
3350 data->pos_min - CHR_SVLEN(last_str);
3351 data->last_start_max = is_inf
3353 : data->pos_min + data->pos_delta
3354 - CHR_SVLEN(last_str);
3356 data->longest = &(data->longest_float);
3358 SvREFCNT_dec(last_str);
3360 if (data && (fl & SF_HAS_EVAL))
3361 data->flags |= SF_HAS_EVAL;
3362 optimize_curly_tail:
3363 if (OP(oscan) != CURLYX) {
3364 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3366 NEXT_OFF(oscan) += NEXT_OFF(next);
3369 default: /* REF and CLUMP only? */
3370 if (flags & SCF_DO_SUBSTR) {
3371 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3372 data->longest = &(data->longest_float);
3374 is_inf = is_inf_internal = 1;
3375 if (flags & SCF_DO_STCLASS_OR)
3376 cl_anything(pRExC_state, data->start_class);
3377 flags &= ~SCF_DO_STCLASS;
3381 else if (OP(scan) == LNBREAK) {
3382 if (flags & SCF_DO_STCLASS) {
3384 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3385 if (flags & SCF_DO_STCLASS_AND) {
3386 for (value = 0; value < 256; value++)
3387 if (!is_VERTWS_cp(value))
3388 ANYOF_BITMAP_CLEAR(data->start_class, value);
3391 for (value = 0; value < 256; value++)
3392 if (is_VERTWS_cp(value))
3393 ANYOF_BITMAP_SET(data->start_class, value);
3395 if (flags & SCF_DO_STCLASS_OR)
3396 cl_and(data->start_class, and_withp);
3397 flags &= ~SCF_DO_STCLASS;
3401 if (flags & SCF_DO_SUBSTR) {
3402 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3404 data->pos_delta += 1;
3405 data->longest = &(data->longest_float);
3409 else if (OP(scan) == FOLDCHAR) {
3410 int d = ARG(scan)==0xDF ? 1 : 2;
3411 flags &= ~SCF_DO_STCLASS;
3414 if (flags & SCF_DO_SUBSTR) {
3415 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3417 data->pos_delta += d;
3418 data->longest = &(data->longest_float);
3421 else if (strchr((const char*)PL_simple,OP(scan))) {
3424 if (flags & SCF_DO_SUBSTR) {
3425 SCAN_COMMIT(pRExC_state,data,minlenp);
3429 if (flags & SCF_DO_STCLASS) {
3430 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3432 /* Some of the logic below assumes that switching
3433 locale on will only add false positives. */
3434 switch (PL_regkind[OP(scan)]) {
3438 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3439 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3440 cl_anything(pRExC_state, data->start_class);
3443 if (OP(scan) == SANY)
3445 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3446 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3447 || (data->start_class->flags & ANYOF_CLASS));
3448 cl_anything(pRExC_state, data->start_class);
3450 if (flags & SCF_DO_STCLASS_AND || !value)
3451 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3454 if (flags & SCF_DO_STCLASS_AND)
3455 cl_and(data->start_class,
3456 (struct regnode_charclass_class*)scan);
3458 cl_or(pRExC_state, data->start_class,
3459 (struct regnode_charclass_class*)scan);
3462 if (flags & SCF_DO_STCLASS_AND) {
3463 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3464 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3465 for (value = 0; value < 256; value++)
3466 if (!isALNUM(value))
3467 ANYOF_BITMAP_CLEAR(data->start_class, value);
3471 if (data->start_class->flags & ANYOF_LOCALE)
3472 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3474 for (value = 0; value < 256; value++)
3476 ANYOF_BITMAP_SET(data->start_class, value);
3481 if (flags & SCF_DO_STCLASS_AND) {
3482 if (data->start_class->flags & ANYOF_LOCALE)
3483 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3486 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3487 data->start_class->flags |= ANYOF_LOCALE;
3491 if (flags & SCF_DO_STCLASS_AND) {
3492 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3493 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3494 for (value = 0; value < 256; value++)
3496 ANYOF_BITMAP_CLEAR(data->start_class, value);
3500 if (data->start_class->flags & ANYOF_LOCALE)
3501 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3503 for (value = 0; value < 256; value++)
3504 if (!isALNUM(value))
3505 ANYOF_BITMAP_SET(data->start_class, value);
3510 if (flags & SCF_DO_STCLASS_AND) {
3511 if (data->start_class->flags & ANYOF_LOCALE)
3512 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3515 data->start_class->flags |= ANYOF_LOCALE;
3516 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3520 if (flags & SCF_DO_STCLASS_AND) {
3521 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3522 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3523 for (value = 0; value < 256; value++)
3524 if (!isSPACE(value))
3525 ANYOF_BITMAP_CLEAR(data->start_class, value);
3529 if (data->start_class->flags & ANYOF_LOCALE)
3530 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3532 for (value = 0; value < 256; value++)
3534 ANYOF_BITMAP_SET(data->start_class, value);
3539 if (flags & SCF_DO_STCLASS_AND) {
3540 if (data->start_class->flags & ANYOF_LOCALE)
3541 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3544 data->start_class->flags |= ANYOF_LOCALE;
3545 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3549 if (flags & SCF_DO_STCLASS_AND) {
3550 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3551 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3552 for (value = 0; value < 256; value++)
3554 ANYOF_BITMAP_CLEAR(data->start_class, value);
3558 if (data->start_class->flags & ANYOF_LOCALE)
3559 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3561 for (value = 0; value < 256; value++)
3562 if (!isSPACE(value))
3563 ANYOF_BITMAP_SET(data->start_class, value);
3568 if (flags & SCF_DO_STCLASS_AND) {
3569 if (data->start_class->flags & ANYOF_LOCALE) {
3570 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3571 for (value = 0; value < 256; value++)
3572 if (!isSPACE(value))
3573 ANYOF_BITMAP_CLEAR(data->start_class, value);
3577 data->start_class->flags |= ANYOF_LOCALE;
3578 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3582 if (flags & SCF_DO_STCLASS_AND) {
3583 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3584 for (value = 0; value < 256; value++)
3585 if (!isDIGIT(value))
3586 ANYOF_BITMAP_CLEAR(data->start_class, value);
3589 if (data->start_class->flags & ANYOF_LOCALE)
3590 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3592 for (value = 0; value < 256; value++)
3594 ANYOF_BITMAP_SET(data->start_class, value);
3599 if (flags & SCF_DO_STCLASS_AND) {
3600 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3601 for (value = 0; value < 256; value++)
3603 ANYOF_BITMAP_CLEAR(data->start_class, value);
3606 if (data->start_class->flags & ANYOF_LOCALE)
3607 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3609 for (value = 0; value < 256; value++)
3610 if (!isDIGIT(value))
3611 ANYOF_BITMAP_SET(data->start_class, value);
3615 CASE_SYNST_FNC(VERTWS);
3616 CASE_SYNST_FNC(HORIZWS);
3619 if (flags & SCF_DO_STCLASS_OR)
3620 cl_and(data->start_class, and_withp);
3621 flags &= ~SCF_DO_STCLASS;
3624 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3625 data->flags |= (OP(scan) == MEOL
3629 else if ( PL_regkind[OP(scan)] == BRANCHJ
3630 /* Lookbehind, or need to calculate parens/evals/stclass: */
3631 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3632 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3633 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3634 || OP(scan) == UNLESSM )
3636 /* Negative Lookahead/lookbehind
3637 In this case we can't do fixed string optimisation.
3640 I32 deltanext, minnext, fake = 0;
3642 struct regnode_charclass_class intrnl;
3645 data_fake.flags = 0;
3647 data_fake.whilem_c = data->whilem_c;
3648 data_fake.last_closep = data->last_closep;
3651 data_fake.last_closep = &fake;
3652 data_fake.pos_delta = delta;
3653 if ( flags & SCF_DO_STCLASS && !scan->flags
3654 && OP(scan) == IFMATCH ) { /* Lookahead */
3655 cl_init(pRExC_state, &intrnl);
3656 data_fake.start_class = &intrnl;
3657 f |= SCF_DO_STCLASS_AND;
3659 if (flags & SCF_WHILEM_VISITED_POS)
3660 f |= SCF_WHILEM_VISITED_POS;
3661 next = regnext(scan);
3662 nscan = NEXTOPER(NEXTOPER(scan));
3663 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3664 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3667 FAIL("Variable length lookbehind not implemented");
3669 else if (minnext > (I32)U8_MAX) {
3670 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3672 scan->flags = (U8)minnext;
3675 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3677 if (data_fake.flags & SF_HAS_EVAL)
3678 data->flags |= SF_HAS_EVAL;
3679 data->whilem_c = data_fake.whilem_c;
3681 if (f & SCF_DO_STCLASS_AND) {
3682 const int was = (data->start_class->flags & ANYOF_EOS);
3684 cl_and(data->start_class, &intrnl);
3686 data->start_class->flags |= ANYOF_EOS;
3689 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3691 /* Positive Lookahead/lookbehind
3692 In this case we can do fixed string optimisation,
3693 but we must be careful about it. Note in the case of
3694 lookbehind the positions will be offset by the minimum
3695 length of the pattern, something we won't know about
3696 until after the recurse.
3698 I32 deltanext, fake = 0;
3700 struct regnode_charclass_class intrnl;
3702 /* We use SAVEFREEPV so that when the full compile
3703 is finished perl will clean up the allocated
3704 minlens when its all done. This was we don't
3705 have to worry about freeing them when we know
3706 they wont be used, which would be a pain.
3709 Newx( minnextp, 1, I32 );
3710 SAVEFREEPV(minnextp);
3713 StructCopy(data, &data_fake, scan_data_t);
3714 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3717 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
3718 data_fake.last_found=newSVsv(data->last_found);
3722 data_fake.last_closep = &fake;
3723 data_fake.flags = 0;
3724 data_fake.pos_delta = delta;
3726 data_fake.flags |= SF_IS_INF;
3727 if ( flags & SCF_DO_STCLASS && !scan->flags
3728 && OP(scan) == IFMATCH ) { /* Lookahead */
3729 cl_init(pRExC_state, &intrnl);
3730 data_fake.start_class = &intrnl;
3731 f |= SCF_DO_STCLASS_AND;
3733 if (flags & SCF_WHILEM_VISITED_POS)
3734 f |= SCF_WHILEM_VISITED_POS;
3735 next = regnext(scan);
3736 nscan = NEXTOPER(NEXTOPER(scan));
3738 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3739 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3742 FAIL("Variable length lookbehind not implemented");
3744 else if (*minnextp > (I32)U8_MAX) {
3745 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3747 scan->flags = (U8)*minnextp;
3752 if (f & SCF_DO_STCLASS_AND) {
3753 const int was = (data->start_class->flags & ANYOF_EOS);
3755 cl_and(data->start_class, &intrnl);
3757 data->start_class->flags |= ANYOF_EOS;
3760 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3762 if (data_fake.flags & SF_HAS_EVAL)
3763 data->flags |= SF_HAS_EVAL;
3764 data->whilem_c = data_fake.whilem_c;
3765 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3766 if (RExC_rx->minlen<*minnextp)
3767 RExC_rx->minlen=*minnextp;
3768 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
3769 SvREFCNT_dec(data_fake.last_found);
3771 if ( data_fake.minlen_fixed != minlenp )
3773 data->offset_fixed= data_fake.offset_fixed;
3774 data->minlen_fixed= data_fake.minlen_fixed;
3775 data->lookbehind_fixed+= scan->flags;
3777 if ( data_fake.minlen_float != minlenp )
3779 data->minlen_float= data_fake.minlen_float;
3780 data->offset_float_min=data_fake.offset_float_min;
3781 data->offset_float_max=data_fake.offset_float_max;
3782 data->lookbehind_float+= scan->flags;
3791 else if (OP(scan) == OPEN) {
3792 if (stopparen != (I32)ARG(scan))
3795 else if (OP(scan) == CLOSE) {
3796 if (stopparen == (I32)ARG(scan)) {
3799 if ((I32)ARG(scan) == is_par) {
3800 next = regnext(scan);
3802 if ( next && (OP(next) != WHILEM) && next < last)
3803 is_par = 0; /* Disable optimization */
3806 *(data->last_closep) = ARG(scan);
3808 else if (OP(scan) == EVAL) {
3810 data->flags |= SF_HAS_EVAL;
3812 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3813 if (flags & SCF_DO_SUBSTR) {
3814 SCAN_COMMIT(pRExC_state,data,minlenp);
3815 flags &= ~SCF_DO_SUBSTR;
3817 if (data && OP(scan)==ACCEPT) {
3818 data->flags |= SCF_SEEN_ACCEPT;
3823 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3825 if (flags & SCF_DO_SUBSTR) {
3826 SCAN_COMMIT(pRExC_state,data,minlenp);
3827 data->longest = &(data->longest_float);
3829 is_inf = is_inf_internal = 1;
3830 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3831 cl_anything(pRExC_state, data->start_class);
3832 flags &= ~SCF_DO_STCLASS;
3834 else if (OP(scan) == GPOS) {
3835 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
3836 !(delta || is_inf || (data && data->pos_delta)))
3838 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
3839 RExC_rx->extflags |= RXf_ANCH_GPOS;
3840 if (RExC_rx->gofs < (U32)min)
3841 RExC_rx->gofs = min;
3843 RExC_rx->extflags |= RXf_GPOS_FLOAT;
3847 #ifdef TRIE_STUDY_OPT
3848 #ifdef FULL_TRIE_STUDY
3849 else if (PL_regkind[OP(scan)] == TRIE) {
3850 /* NOTE - There is similar code to this block above for handling
3851 BRANCH nodes on the initial study. If you change stuff here
3853 regnode *trie_node= scan;
3854 regnode *tail= regnext(scan);
3855 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
3856 I32 max1 = 0, min1 = I32_MAX;
3857 struct regnode_charclass_class accum;
3859 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
3860 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
3861 if (flags & SCF_DO_STCLASS)
3862 cl_init_zero(pRExC_state, &accum);
3868 const regnode *nextbranch= NULL;
3871 for ( word=1 ; word <= trie->wordcount ; word++)
3873 I32 deltanext=0, minnext=0, f = 0, fake;
3874 struct regnode_charclass_class this_class;
3876 data_fake.flags = 0;
3878 data_fake.whilem_c = data->whilem_c;
3879 data_fake.last_closep = data->last_closep;
3882 data_fake.last_closep = &fake;
3883 data_fake.pos_delta = delta;
3884 if (flags & SCF_DO_STCLASS) {
3885 cl_init(pRExC_state, &this_class);
3886 data_fake.start_class = &this_class;
3887 f = SCF_DO_STCLASS_AND;
3889 if (flags & SCF_WHILEM_VISITED_POS)
3890 f |= SCF_WHILEM_VISITED_POS;
3892 if (trie->jump[word]) {
3894 nextbranch = trie_node + trie->jump[0];
3895 scan= trie_node + trie->jump[word];
3896 /* We go from the jump point to the branch that follows
3897 it. Note this means we need the vestigal unused branches
3898 even though they arent otherwise used.
3900 minnext = study_chunk(pRExC_state, &scan, minlenp,
3901 &deltanext, (regnode *)nextbranch, &data_fake,
3902 stopparen, recursed, NULL, f,depth+1);
3904 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3905 nextbranch= regnext((regnode*)nextbranch);
3907 if (min1 > (I32)(minnext + trie->minlen))
3908 min1 = minnext + trie->minlen;
3909 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3910 max1 = minnext + deltanext + trie->maxlen;
3911 if (deltanext == I32_MAX)
3912 is_inf = is_inf_internal = 1;
3914 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3916 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3917 if ( stopmin > min + min1)
3918 stopmin = min + min1;
3919 flags &= ~SCF_DO_SUBSTR;
3921 data->flags |= SCF_SEEN_ACCEPT;
3924 if (data_fake.flags & SF_HAS_EVAL)
3925 data->flags |= SF_HAS_EVAL;
3926 data->whilem_c = data_fake.whilem_c;
3928 if (flags & SCF_DO_STCLASS)
3929 cl_or(pRExC_state, &accum, &this_class);
3932 if (flags & SCF_DO_SUBSTR) {
3933 data->pos_min += min1;
3934 data->pos_delta += max1 - min1;
3935 if (max1 != min1 || is_inf)
3936 data->longest = &(data->longest_float);
3939 delta += max1 - min1;
3940 if (flags & SCF_DO_STCLASS_OR) {
3941 cl_or(pRExC_state, data->start_class, &accum);
3943 cl_and(data->start_class, and_withp);
3944 flags &= ~SCF_DO_STCLASS;
3947 else if (flags & SCF_DO_STCLASS_AND) {
3949 cl_and(data->start_class, &accum);
3950 flags &= ~SCF_DO_STCLASS;
3953 /* Switch to OR mode: cache the old value of
3954 * data->start_class */
3956 StructCopy(data->start_class, and_withp,
3957 struct regnode_charclass_class);
3958 flags &= ~SCF_DO_STCLASS_AND;
3959 StructCopy(&accum, data->start_class,
3960 struct regnode_charclass_class);
3961 flags |= SCF_DO_STCLASS_OR;
3962 data->start_class->flags |= ANYOF_EOS;
3969 else if (PL_regkind[OP(scan)] == TRIE) {
3970 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
3973 min += trie->minlen;
3974 delta += (trie->maxlen - trie->minlen);
3975 flags &= ~SCF_DO_STCLASS; /* xxx */
3976 if (flags & SCF_DO_SUBSTR) {
3977 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3978 data->pos_min += trie->minlen;
3979 data->pos_delta += (trie->maxlen - trie->minlen);
3980 if (trie->maxlen != trie->minlen)
3981 data->longest = &(data->longest_float);
3983 if (trie->jump) /* no more substrings -- for now /grr*/
3984 flags &= ~SCF_DO_SUBSTR;
3986 #endif /* old or new */
3987 #endif /* TRIE_STUDY_OPT */
3989 /* Else: zero-length, ignore. */
3990 scan = regnext(scan);
3995 stopparen = frame->stop;
3996 frame = frame->prev;
3997 goto fake_study_recurse;
4002 DEBUG_STUDYDATA("pre-fin:",data,depth);
4005 *deltap = is_inf_internal ? I32_MAX : delta;
4006 if (flags & SCF_DO_SUBSTR && is_inf)
4007 data->pos_delta = I32_MAX - data->pos_min;
4008 if (is_par > (I32)U8_MAX)
4010 if (is_par && pars==1 && data) {
4011 data->flags |= SF_IN_PAR;
4012 data->flags &= ~SF_HAS_PAR;
4014 else if (pars && data) {
4015 data->flags |= SF_HAS_PAR;
4016 data->flags &= ~SF_IN_PAR;
4018 if (flags & SCF_DO_STCLASS_OR)
4019 cl_and(data->start_class, and_withp);
4020 if (flags & SCF_TRIE_RESTUDY)
4021 data->flags |= SCF_TRIE_RESTUDY;
4023 DEBUG_STUDYDATA("post-fin:",data,depth);
4025 return min < stopmin ? min : stopmin;
4029 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4031 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4033 Renewc(RExC_rxi->data,
4034 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4035 char, struct reg_data);
4037 Renew(RExC_rxi->data->what, count + n, U8);
4039 Newx(RExC_rxi->data->what, n, U8);
4040 RExC_rxi->data->count = count + n;
4041 Copy(s, RExC_rxi->data->what + count, n, U8);
4045 /*XXX: todo make this not included in a non debugging perl */
4046 #ifndef PERL_IN_XSUB_RE
4048 Perl_reginitcolors(pTHX)
4051 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4053 char *t = savepv(s);
4057 t = strchr(t, '\t');
4063 PL_colors[i] = t = (char *)"";
4068 PL_colors[i++] = (char *)"";
4075 #ifdef TRIE_STUDY_OPT
4076 #define CHECK_RESTUDY_GOTO \
4078 (data.flags & SCF_TRIE_RESTUDY) \
4082 #define CHECK_RESTUDY_GOTO
4086 - pregcomp - compile a regular expression into internal code
4088 * We can't allocate space until we know how big the compiled form will be,
4089 * but we can't compile it (and thus know how big it is) until we've got a
4090 * place to put the code. So we cheat: we compile it twice, once with code
4091 * generation turned off and size counting turned on, and once "for real".
4092 * This also means that we don't allocate space until we are sure that the
4093 * thing really will compile successfully, and we never have to move the
4094 * code and thus invalidate pointers into it. (Note that it has to be in
4095 * one piece because free() must be able to free it all.) [NB: not true in perl]
4097 * Beware that the optimization-preparation code in here knows about some
4098 * of the structure of the compiled regexp. [I'll say.]
4103 #ifndef PERL_IN_XSUB_RE
4104 #define RE_ENGINE_PTR &PL_core_reg_engine
4106 extern const struct regexp_engine my_reg_engine;
4107 #define RE_ENGINE_PTR &my_reg_engine
4110 #ifndef PERL_IN_XSUB_RE
4112 Perl_pregcomp(pTHX_ const SV * const pattern, const U32 flags)
4115 HV * const table = GvHV(PL_hintgv);
4116 /* Dispatch a request to compile a regexp to correct
4119 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
4120 GET_RE_DEBUG_FLAGS_DECL;
4121 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
4122 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4124 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4127 return CALLREGCOMP_ENG(eng, pattern, flags);
4130 return Perl_re_compile(aTHX_ pattern, flags);
4135 Perl_re_compile(pTHX_ const SV * const pattern, const U32 pm_flags)
4139 register regexp_internal *ri;
4141 char* exp = SvPV((SV*)pattern, plen);
4142 char* xend = exp + plen;
4149 RExC_state_t RExC_state;
4150 RExC_state_t * const pRExC_state = &RExC_state;
4151 #ifdef TRIE_STUDY_OPT
4153 RExC_state_t copyRExC_state;
4155 GET_RE_DEBUG_FLAGS_DECL;
4156 DEBUG_r(if (!PL_colorset) reginitcolors());
4158 RExC_utf8 = RExC_orig_utf8 = pm_flags & RXf_UTF8;
4161 SV *dsv= sv_newmortal();
4162 RE_PV_QUOTED_DECL(s, RExC_utf8,
4163 dsv, exp, plen, 60);
4164 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4165 PL_colors[4],PL_colors[5],s);
4170 RExC_flags = pm_flags;
4174 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4175 RExC_seen_evals = 0;
4178 /* First pass: determine size, legality. */
4186 RExC_emit = &PL_regdummy;
4187 RExC_whilem_seen = 0;
4188 RExC_charnames = NULL;
4189 RExC_open_parens = NULL;
4190 RExC_close_parens = NULL;
4192 RExC_paren_names = NULL;
4194 RExC_paren_name_list = NULL;
4196 RExC_recurse = NULL;
4197 RExC_recurse_count = 0;
4199 #if 0 /* REGC() is (currently) a NOP at the first pass.
4200 * Clever compilers notice this and complain. --jhi */
4201 REGC((U8)REG_MAGIC, (char*)RExC_emit);
4203 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4204 if (reg(pRExC_state, 0, &flags,1) == NULL) {
4205 RExC_precomp = NULL;
4208 if (RExC_utf8 && !RExC_orig_utf8) {
4209 /* It's possible to write a regexp in ascii that represents Unicode
4210 codepoints outside of the byte range, such as via \x{100}. If we
4211 detect such a sequence we have to convert the entire pattern to utf8
4212 and then recompile, as our sizing calculation will have been based
4213 on 1 byte == 1 character, but we will need to use utf8 to encode
4214 at least some part of the pattern, and therefore must convert the whole
4216 XXX: somehow figure out how to make this less expensive...
4219 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4220 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4221 exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)exp, &len);
4223 RExC_orig_utf8 = RExC_utf8;
4225 goto redo_first_pass;
4228 PerlIO_printf(Perl_debug_log,
4229 "Required size %"IVdf" nodes\n"
4230 "Starting second pass (creation)\n",
4233 RExC_lastparse=NULL;
4235 /* Small enough for pointer-storage convention?
4236 If extralen==0, this means that we will not need long jumps. */
4237 if (RExC_size >= 0x10000L && RExC_extralen)
4238 RExC_size += RExC_extralen;
4241 if (RExC_whilem_seen > 15)
4242 RExC_whilem_seen = 15;
4244 /* Allocate space and zero-initialize. Note, the two step process
4245 of zeroing when in debug mode, thus anything assigned has to
4246 happen after that */
4247 Newxz(r, 1, regexp);
4248 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4249 char, regexp_internal);
4250 if ( r == NULL || ri == NULL )
4251 FAIL("Regexp out of space");
4253 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
4254 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
4256 /* bulk initialize base fields with 0. */
4257 Zero(ri, sizeof(regexp_internal), char);
4260 /* non-zero initialization begins here */
4262 r->engine= RE_ENGINE_PTR;
4265 r->extflags = pm_flags;
4267 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
4268 bool has_minus = ((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD);
4269 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
4270 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD) >> 12);
4271 const char *fptr = STD_PAT_MODS; /*"msix"*/
4273 r->wraplen = r->prelen + has_minus + has_p + has_runon
4274 + (sizeof(STD_PAT_MODS) - 1)
4275 + (sizeof("(?:)") - 1);
4277 Newx(r->wrapped, r->wraplen + 1, char );
4281 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
4283 char *r = p + (sizeof(STD_PAT_MODS) - 1) + has_minus - 1;
4284 char *colon = r + 1;
4287 while((ch = *fptr++)) {
4301 Copy(RExC_precomp, p, r->prelen, char);
4311 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4313 if (RExC_seen & REG_SEEN_RECURSE) {
4314 Newxz(RExC_open_parens, RExC_npar,regnode *);
4315 SAVEFREEPV(RExC_open_parens);
4316 Newxz(RExC_close_parens,RExC_npar,regnode *);
4317 SAVEFREEPV(RExC_close_parens);
4320 /* Useful during FAIL. */
4321 #ifdef RE_TRACK_PATTERN_OFFSETS
4322 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4323 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
4324 "%s %"UVuf" bytes for offset annotations.\n",
4325 ri->u.offsets ? "Got" : "Couldn't get",
4326 (UV)((2*RExC_size+1) * sizeof(U32))));
4328 SetProgLen(ri,RExC_size);
4332 /* Second pass: emit code. */
4333 RExC_flags = pm_flags; /* don't let top level (?i) bleed */
4338 RExC_emit_start = ri->program;
4339 RExC_emit = ri->program;
4340 RExC_emit_bound = ri->program + RExC_size + 1;
4342 /* Store the count of eval-groups for security checks: */
4343 RExC_rx->seen_evals = RExC_seen_evals;
4344 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
4345 if (reg(pRExC_state, 0, &flags,1) == NULL) {
4349 /* XXXX To minimize changes to RE engine we always allocate
4350 3-units-long substrs field. */
4351 Newx(r->substrs, 1, struct reg_substr_data);
4352 if (RExC_recurse_count) {
4353 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4354 SAVEFREEPV(RExC_recurse);
4358 r->minlen = minlen = sawplus = sawopen = 0;
4359 Zero(r->substrs, 1, struct reg_substr_data);
4361 #ifdef TRIE_STUDY_OPT
4364 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
4366 RExC_state = copyRExC_state;
4367 if (seen & REG_TOP_LEVEL_BRANCHES)
4368 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4370 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
4371 if (data.last_found) {
4372 SvREFCNT_dec(data.longest_fixed);
4373 SvREFCNT_dec(data.longest_float);
4374 SvREFCNT_dec(data.last_found);
4376 StructCopy(&zero_scan_data, &data, scan_data_t);
4378 StructCopy(&zero_scan_data, &data, scan_data_t);
4379 copyRExC_state = RExC_state;
4382 StructCopy(&zero_scan_data, &data, scan_data_t);
4385 /* Dig out information for optimizations. */
4386 r->extflags = RExC_flags; /* was pm_op */
4387 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
4390 r->extflags |= RXf_UTF8; /* Unicode in it? */
4391 ri->regstclass = NULL;
4392 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
4393 r->intflags |= PREGf_NAUGHTY;
4394 scan = ri->program + 1; /* First BRANCH. */
4396 /* testing for BRANCH here tells us whether there is "must appear"
4397 data in the pattern. If there is then we can use it for optimisations */
4398 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
4400 STRLEN longest_float_length, longest_fixed_length;
4401 struct regnode_charclass_class ch_class; /* pointed to by data */
4403 I32 last_close = 0; /* pointed to by data */
4404 regnode *first= scan;
4405 regnode *first_next= regnext(first);
4407 /* Skip introductions and multiplicators >= 1. */
4408 while ((OP(first) == OPEN && (sawopen = 1)) ||
4409 /* An OR of *one* alternative - should not happen now. */
4410 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
4411 /* for now we can't handle lookbehind IFMATCH*/
4412 (OP(first) == IFMATCH && !first->flags) ||
4413 (OP(first) == PLUS) ||
4414 (OP(first) == MINMOD) ||
4415 /* An {n,m} with n>0 */
4416 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
4417 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
4420 if (OP(first) == PLUS)
4423 first += regarglen[OP(first)];
4424 if (OP(first) == IFMATCH) {
4425 first = NEXTOPER(first);
4426 first += EXTRA_STEP_2ARGS;
4427 } else /* XXX possible optimisation for /(?=)/ */
4428 first = NEXTOPER(first);
4429 first_next= regnext(first);
4432 /* Starting-point info. */
4434 DEBUG_PEEP("first:",first,0);
4435 /* Ignore EXACT as we deal with it later. */
4436 if (PL_regkind[OP(first)] == EXACT) {
4437 if (OP(first) == EXACT)
4438 NOOP; /* Empty, get anchored substr later. */
4439 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
4440 ri->regstclass = first;
4443 else if (PL_regkind[OP(first)] == TRIE &&
4444 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
4447 /* this can happen only on restudy */
4448 if ( OP(first) == TRIE ) {
4449 struct regnode_1 *trieop = (struct regnode_1 *)
4450 PerlMemShared_calloc(1, sizeof(struct regnode_1));
4451 StructCopy(first,trieop,struct regnode_1);
4452 trie_op=(regnode *)trieop;
4454 struct regnode_charclass *trieop = (struct regnode_charclass *)
4455 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
4456 StructCopy(first,trieop,struct regnode_charclass);
4457 trie_op=(regnode *)trieop;
4460 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4461 ri->regstclass = trie_op;
4464 else if (strchr((const char*)PL_simple,OP(first)))
4465 ri->regstclass = first;
4466 else if (PL_regkind[OP(first)] == BOUND ||
4467 PL_regkind[OP(first)] == NBOUND)
4468 ri->regstclass = first;
4469 else if (PL_regkind[OP(first)] == BOL) {
4470 r->extflags |= (OP(first) == MBOL
4472 : (OP(first) == SBOL
4475 first = NEXTOPER(first);
4478 else if (OP(first) == GPOS) {
4479 r->extflags |= RXf_ANCH_GPOS;
4480 first = NEXTOPER(first);
4483 else if ((!sawopen || !RExC_sawback) &&
4484 (OP(first) == STAR &&
4485 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
4486 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
4488 /* turn .* into ^.* with an implied $*=1 */
4490 (OP(NEXTOPER(first)) == REG_ANY)
4493 r->extflags |= type;
4494 r->intflags |= PREGf_IMPLICIT;
4495 first = NEXTOPER(first);
4498 if (sawplus && (!sawopen || !RExC_sawback)
4499 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
4500 /* x+ must match at the 1st pos of run of x's */
4501 r->intflags |= PREGf_SKIP;
4503 /* Scan is after the zeroth branch, first is atomic matcher. */
4504 #ifdef TRIE_STUDY_OPT
4507 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4508 (IV)(first - scan + 1))
4512 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4513 (IV)(first - scan + 1))
4519 * If there's something expensive in the r.e., find the
4520 * longest literal string that must appear and make it the
4521 * regmust. Resolve ties in favor of later strings, since
4522 * the regstart check works with the beginning of the r.e.
4523 * and avoiding duplication strengthens checking. Not a
4524 * strong reason, but sufficient in the absence of others.
4525 * [Now we resolve ties in favor of the earlier string if
4526 * it happens that c_offset_min has been invalidated, since the
4527 * earlier string may buy us something the later one won't.]
4530 data.longest_fixed = newSVpvs("");
4531 data.longest_float = newSVpvs("");
4532 data.last_found = newSVpvs("");
4533 data.longest = &(data.longest_fixed);
4535 if (!ri->regstclass) {
4536 cl_init(pRExC_state, &ch_class);
4537 data.start_class = &ch_class;
4538 stclass_flag = SCF_DO_STCLASS_AND;
4539 } else /* XXXX Check for BOUND? */
4541 data.last_closep = &last_close;
4543 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
4544 &data, -1, NULL, NULL,
4545 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
4551 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
4552 && data.last_start_min == 0 && data.last_end > 0
4553 && !RExC_seen_zerolen
4554 && !(RExC_seen & REG_SEEN_VERBARG)
4555 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4556 r->extflags |= RXf_CHECK_ALL;
4557 scan_commit(pRExC_state, &data,&minlen,0);
4558 SvREFCNT_dec(data.last_found);
4560 /* Note that code very similar to this but for anchored string
4561 follows immediately below, changes may need to be made to both.
4564 longest_float_length = CHR_SVLEN(data.longest_float);
4565 if (longest_float_length
4566 || (data.flags & SF_FL_BEFORE_EOL
4567 && (!(data.flags & SF_FL_BEFORE_MEOL)
4568 || (RExC_flags & RXf_PMf_MULTILINE))))
4572 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
4573 && data.offset_fixed == data.offset_float_min
4574 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4575 goto remove_float; /* As in (a)+. */
4577 /* copy the information about the longest float from the reg_scan_data
4578 over to the program. */
4579 if (SvUTF8(data.longest_float)) {
4580 r->float_utf8 = data.longest_float;
4581 r->float_substr = NULL;
4583 r->float_substr = data.longest_float;
4584 r->float_utf8 = NULL;
4586 /* float_end_shift is how many chars that must be matched that
4587 follow this item. We calculate it ahead of time as once the
4588 lookbehind offset is added in we lose the ability to correctly
4590 ml = data.minlen_float ? *(data.minlen_float)
4591 : (I32)longest_float_length;
4592 r->float_end_shift = ml - data.offset_float_min
4593 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4594 + data.lookbehind_float;
4595 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
4596 r->float_max_offset = data.offset_float_max;
4597 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
4598 r->float_max_offset -= data.lookbehind_float;
4600 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4601 && (!(data.flags & SF_FL_BEFORE_MEOL)
4602 || (RExC_flags & RXf_PMf_MULTILINE)));
4603 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
4607 r->float_substr = r->float_utf8 = NULL;
4608 SvREFCNT_dec(data.longest_float);
4609 longest_float_length = 0;
4612 /* Note that code very similar to this but for floating string
4613 is immediately above, changes may need to be made to both.
4616 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
4617 if (longest_fixed_length
4618 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4619 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4620 || (RExC_flags & RXf_PMf_MULTILINE))))
4624 /* copy the information about the longest fixed
4625 from the reg_scan_data over to the program. */
4626 if (SvUTF8(data.longest_fixed)) {
4627 r->anchored_utf8 = data.longest_fixed;
4628 r->anchored_substr = NULL;
4630 r->anchored_substr = data.longest_fixed;
4631 r->anchored_utf8 = NULL;
4633 /* fixed_end_shift is how many chars that must be matched that
4634 follow this item. We calculate it ahead of time as once the
4635 lookbehind offset is added in we lose the ability to correctly
4637 ml = data.minlen_fixed ? *(data.minlen_fixed)
4638 : (I32)longest_fixed_length;
4639 r->anchored_end_shift = ml - data.offset_fixed
4640 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4641 + data.lookbehind_fixed;
4642 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4644 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4645 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4646 || (RExC_flags & RXf_PMf_MULTILINE)));
4647 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
4650 r->anchored_substr = r->anchored_utf8 = NULL;
4651 SvREFCNT_dec(data.longest_fixed);
4652 longest_fixed_length = 0;
4655 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4656 ri->regstclass = NULL;
4657 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4659 && !(data.start_class->flags & ANYOF_EOS)
4660 && !cl_is_anything(data.start_class))
4662 const U32 n = add_data(pRExC_state, 1, "f");
4664 Newx(RExC_rxi->data->data[n], 1,
4665 struct regnode_charclass_class);
4666 StructCopy(data.start_class,
4667 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4668 struct regnode_charclass_class);
4669 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
4670 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
4671 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
4672 regprop(r, sv, (regnode*)data.start_class);
4673 PerlIO_printf(Perl_debug_log,
4674 "synthetic stclass \"%s\".\n",
4675 SvPVX_const(sv));});
4678 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
4679 if (longest_fixed_length > longest_float_length) {
4680 r->check_end_shift = r->anchored_end_shift;
4681 r->check_substr = r->anchored_substr;
4682 r->check_utf8 = r->anchored_utf8;
4683 r->check_offset_min = r->check_offset_max = r->anchored_offset;
4684 if (r->extflags & RXf_ANCH_SINGLE)
4685 r->extflags |= RXf_NOSCAN;
4688 r->check_end_shift = r->float_end_shift;
4689 r->check_substr = r->float_substr;
4690 r->check_utf8 = r->float_utf8;
4691 r->check_offset_min = r->float_min_offset;
4692 r->check_offset_max = r->float_max_offset;
4694 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4695 This should be changed ASAP! */
4696 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
4697 r->extflags |= RXf_USE_INTUIT;
4698 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
4699 r->extflags |= RXf_INTUIT_TAIL;
4701 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4702 if ( (STRLEN)minlen < longest_float_length )
4703 minlen= longest_float_length;
4704 if ( (STRLEN)minlen < longest_fixed_length )
4705 minlen= longest_fixed_length;
4709 /* Several toplevels. Best we can is to set minlen. */
4711 struct regnode_charclass_class ch_class;
4714 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
4716 scan = ri->program + 1;
4717 cl_init(pRExC_state, &ch_class);
4718 data.start_class = &ch_class;
4719 data.last_closep = &last_close;
4722 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
4723 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
4727 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
4728 = r->float_substr = r->float_utf8 = NULL;
4729 if (!(data.start_class->flags & ANYOF_EOS)
4730 && !cl_is_anything(data.start_class))
4732 const U32 n = add_data(pRExC_state, 1, "f");
4734 Newx(RExC_rxi->data->data[n], 1,
4735 struct regnode_charclass_class);
4736 StructCopy(data.start_class,
4737 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4738 struct regnode_charclass_class);
4739 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
4740 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
4741 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
4742 regprop(r, sv, (regnode*)data.start_class);
4743 PerlIO_printf(Perl_debug_log,
4744 "synthetic stclass \"%s\".\n",
4745 SvPVX_const(sv));});
4749 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4750 the "real" pattern. */
4752 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
4753 (IV)minlen, (IV)r->minlen);
4755 r->minlenret = minlen;
4756 if (r->minlen < minlen)
4759 if (RExC_seen & REG_SEEN_GPOS)
4760 r->extflags |= RXf_GPOS_SEEN;
4761 if (RExC_seen & REG_SEEN_LOOKBEHIND)
4762 r->extflags |= RXf_LOOKBEHIND_SEEN;
4763 if (RExC_seen & REG_SEEN_EVAL)
4764 r->extflags |= RXf_EVAL_SEEN;
4765 if (RExC_seen & REG_SEEN_CANY)
4766 r->extflags |= RXf_CANY_SEEN;
4767 if (RExC_seen & REG_SEEN_VERBARG)
4768 r->intflags |= PREGf_VERBARG_SEEN;
4769 if (RExC_seen & REG_SEEN_CUTGROUP)
4770 r->intflags |= PREGf_CUTGROUP_SEEN;
4771 if (RExC_paren_names)
4772 r->paren_names = (HV*)SvREFCNT_inc(RExC_paren_names);
4774 r->paren_names = NULL;
4776 #ifdef STUPID_PATTERN_CHECKS
4778 r->extflags |= RXf_NULL;
4779 if (r->extflags & RXf_SPLIT && r->prelen == 1 && r->precomp[0] == ' ')
4780 /* XXX: this should happen BEFORE we compile */
4781 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
4782 else if (r->prelen == 3 && memEQ("\\s+", r->precomp, 3))
4783 r->extflags |= RXf_WHITE;
4784 else if (r->prelen == 1 && r->precomp[0] == '^')
4785 r->extflags |= RXf_START_ONLY;
4787 if (r->extflags & RXf_SPLIT && r->prelen == 1 && r->precomp[0] == ' ')
4788 /* XXX: this should happen BEFORE we compile */
4789 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
4791 regnode *first = ri->program + 1;
4793 U8 nop = OP(NEXTOPER(first));
4795 if (PL_regkind[fop] == NOTHING && nop == END)
4796 r->extflags |= RXf_NULL;
4797 else if (PL_regkind[fop] == BOL && nop == END)
4798 r->extflags |= RXf_START_ONLY;
4799 else if (fop == PLUS && nop ==SPACE && OP(regnext(first))==END)
4800 r->extflags |= RXf_WHITE;
4804 if (RExC_paren_names) {
4805 ri->name_list_idx = add_data( pRExC_state, 1, "p" );
4806 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
4809 ri->name_list_idx = 0;
4811 if (RExC_recurse_count) {
4812 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
4813 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
4814 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
4817 Newxz(r->offs, RExC_npar, regexp_paren_pair);
4818 /* assume we don't need to swap parens around before we match */
4821 PerlIO_printf(Perl_debug_log,"Final program:\n");
4824 #ifdef RE_TRACK_PATTERN_OFFSETS
4825 DEBUG_OFFSETS_r(if (ri->u.offsets) {
4826 const U32 len = ri->u.offsets[0];
4828 GET_RE_DEBUG_FLAGS_DECL;
4829 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
4830 for (i = 1; i <= len; i++) {
4831 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
4832 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
4833 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
4835 PerlIO_printf(Perl_debug_log, "\n");
4841 #undef RE_ENGINE_PTR
4845 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
4848 PERL_UNUSED_ARG(value);
4850 if (flags & RXapif_FETCH) {
4851 return reg_named_buff_fetch(rx, key, flags);
4852 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
4853 Perl_croak(aTHX_ PL_no_modify);
4855 } else if (flags & RXapif_EXISTS) {
4856 return reg_named_buff_exists(rx, key, flags)
4859 } else if (flags & RXapif_REGNAMES) {
4860 return reg_named_buff_all(rx, flags);
4861 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
4862 return reg_named_buff_scalar(rx, flags);
4864 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
4870 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
4873 PERL_UNUSED_ARG(lastkey);
4875 if (flags & RXapif_FIRSTKEY)
4876 return reg_named_buff_firstkey(rx, flags);
4877 else if (flags & RXapif_NEXTKEY)
4878 return reg_named_buff_nextkey(rx, flags);
4880 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
4886 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const rx, SV * const namesv, const U32 flags)
4888 AV *retarray = NULL;
4890 if (flags & RXapif_ALL)
4893 if (rx && rx->paren_names) {
4894 HE *he_str = hv_fetch_ent( rx->paren_names, namesv, 0, 0 );
4897 SV* sv_dat=HeVAL(he_str);
4898 I32 *nums=(I32*)SvPVX(sv_dat);
4899 for ( i=0; i<SvIVX(sv_dat); i++ ) {
4900 if ((I32)(rx->nparens) >= nums[i]
4901 && rx->offs[nums[i]].start != -1
4902 && rx->offs[nums[i]].end != -1)
4905 CALLREG_NUMBUF_FETCH(rx,nums[i],ret);
4909 ret = newSVsv(&PL_sv_undef);
4912 SvREFCNT_inc_simple_void(ret);
4913 av_push(retarray, ret);
4917 return newRV((SV*)retarray);
4924 Perl_reg_named_buff_exists(pTHX_ REGEXP * const rx, SV * const key,
4927 if (rx && rx->paren_names) {
4928 if (flags & RXapif_ALL) {
4929 return hv_exists_ent(rx->paren_names, key, 0);
4931 SV *sv = CALLREG_NAMED_BUFF_FETCH(rx, key, flags);
4945 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const rx, const U32 flags)
4947 (void)hv_iterinit(rx->paren_names);
4949 return CALLREG_NAMED_BUFF_NEXTKEY(rx, NULL, flags & ~RXapif_FIRSTKEY);
4953 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const rx, const U32 flags)
4955 if (rx && rx->paren_names) {
4956 HV *hv = rx->paren_names;
4958 while ( (temphe = hv_iternext_flags(hv,0)) ) {
4961 SV* sv_dat = HeVAL(temphe);
4962 I32 *nums = (I32*)SvPVX(sv_dat);
4963 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
4964 if ((I32)(rx->lastcloseparen) >= nums[i] &&
4965 rx->offs[nums[i]].start != -1 &&
4966 rx->offs[nums[i]].end != -1)
4972 if (parno || flags & RXapif_ALL) {
4974 char *pv = HePV(temphe, len);
4975 return newSVpvn(pv,len);
4983 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const rx, const U32 flags)
4989 if (rx && rx->paren_names) {
4990 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
4991 return newSViv(HvTOTALKEYS(rx->paren_names));
4992 } else if (flags & RXapif_ONE) {
4993 ret = CALLREG_NAMED_BUFF_ALL(rx, (flags | RXapif_REGNAMES));
4994 av = (AV*)SvRV(ret);
4995 length = av_len(av);
4996 return newSViv(length + 1);
4998 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5002 return &PL_sv_undef;
5006 Perl_reg_named_buff_all(pTHX_ REGEXP * const rx, const U32 flags)
5010 if (rx && rx->paren_names) {
5011 HV *hv= rx->paren_names;
5013 (void)hv_iterinit(hv);
5014 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5017 SV* sv_dat = HeVAL(temphe);
5018 I32 *nums = (I32*)SvPVX(sv_dat);
5019 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5020 if ((I32)(rx->lastcloseparen) >= nums[i] &&
5021 rx->offs[nums[i]].start != -1 &&
5022 rx->offs[nums[i]].end != -1)
5028 if (parno || flags & RXapif_ALL) {
5030 char *pv = HePV(temphe, len);
5031 av_push(av, newSVpvn(pv,len));
5036 return newRV((SV*)av);
5040 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const rx, const I32 paren, SV * const sv)
5047 sv_setsv(sv,&PL_sv_undef);
5051 if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
5053 i = rx->offs[0].start;
5057 if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
5059 s = rx->subbeg + rx->offs[0].end;
5060 i = rx->sublen - rx->offs[0].end;
5063 if ( 0 <= paren && paren <= (I32)rx->nparens &&
5064 (s1 = rx->offs[paren].start) != -1 &&
5065 (t1 = rx->offs[paren].end) != -1)
5069 s = rx->subbeg + s1;
5071 sv_setsv(sv,&PL_sv_undef);
5074 assert(rx->sublen >= (s - rx->subbeg) + i );
5076 const int oldtainted = PL_tainted;
5078 sv_setpvn(sv, s, i);
5079 PL_tainted = oldtainted;
5080 if ( (rx->extflags & RXf_CANY_SEEN)
5081 ? (RX_MATCH_UTF8(rx)
5082 && (!i || is_utf8_string((U8*)s, i)))
5083 : (RX_MATCH_UTF8(rx)) )
5090 if (RX_MATCH_TAINTED(rx)) {
5091 if (SvTYPE(sv) >= SVt_PVMG) {
5092 MAGIC* const mg = SvMAGIC(sv);
5095 SvMAGIC_set(sv, mg->mg_moremagic);
5097 if ((mgt = SvMAGIC(sv))) {
5098 mg->mg_moremagic = mgt;
5099 SvMAGIC_set(sv, mg);
5109 sv_setsv(sv,&PL_sv_undef);
5115 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5116 SV const * const value)
5118 PERL_UNUSED_ARG(rx);
5119 PERL_UNUSED_ARG(paren);
5120 PERL_UNUSED_ARG(value);
5123 Perl_croak(aTHX_ PL_no_modify);
5127 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const rx, const SV * const sv,
5133 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5135 /* $` / ${^PREMATCH} */
5136 case RX_BUFF_IDX_PREMATCH:
5137 if (rx->offs[0].start != -1) {
5138 i = rx->offs[0].start;
5146 /* $' / ${^POSTMATCH} */
5147 case RX_BUFF_IDX_POSTMATCH:
5148 if (rx->offs[0].end != -1) {
5149 i = rx->sublen - rx->offs[0].end;
5151 s1 = rx->offs[0].end;
5157 /* $& / ${^MATCH}, $1, $2, ... */
5159 if (paren <= (I32)rx->nparens &&
5160 (s1 = rx->offs[paren].start) != -1 &&
5161 (t1 = rx->offs[paren].end) != -1)
5166 if (ckWARN(WARN_UNINITIALIZED))
5167 report_uninit((SV*)sv);
5172 if (i > 0 && RX_MATCH_UTF8(rx)) {
5173 const char * const s = rx->subbeg + s1;
5178 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5185 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
5187 PERL_UNUSED_ARG(rx);
5188 return newSVpvs("Regexp");
5191 /* Scans the name of a named buffer from the pattern.
5192 * If flags is REG_RSN_RETURN_NULL returns null.
5193 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
5194 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
5195 * to the parsed name as looked up in the RExC_paren_names hash.
5196 * If there is an error throws a vFAIL().. type exception.
5199 #define REG_RSN_RETURN_NULL 0
5200 #define REG_RSN_RETURN_NAME 1
5201 #define REG_RSN_RETURN_DATA 2
5204 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) {
5205 char *name_start = RExC_parse;
5207 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
5208 /* skip IDFIRST by using do...while */
5211 RExC_parse += UTF8SKIP(RExC_parse);
5212 } while (isALNUM_utf8((U8*)RExC_parse));
5216 } while (isALNUM(*RExC_parse));
5220 SV* sv_name = sv_2mortal(Perl_newSVpvn(aTHX_ name_start,
5221 (int)(RExC_parse - name_start)));
5224 if ( flags == REG_RSN_RETURN_NAME)
5226 else if (flags==REG_RSN_RETURN_DATA) {
5229 if ( ! sv_name ) /* should not happen*/
5230 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
5231 if (RExC_paren_names)
5232 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
5234 sv_dat = HeVAL(he_str);
5236 vFAIL("Reference to nonexistent named group");
5240 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
5247 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
5248 int rem=(int)(RExC_end - RExC_parse); \
5257 if (RExC_lastparse!=RExC_parse) \
5258 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
5261 iscut ? "..." : "<" \
5264 PerlIO_printf(Perl_debug_log,"%16s",""); \
5267 num = RExC_size + 1; \
5269 num=REG_NODE_NUM(RExC_emit); \
5270 if (RExC_lastnum!=num) \
5271 PerlIO_printf(Perl_debug_log,"|%4d",num); \
5273 PerlIO_printf(Perl_debug_log,"|%4s",""); \
5274 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
5275 (int)((depth*2)), "", \
5279 RExC_lastparse=RExC_parse; \
5284 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
5285 DEBUG_PARSE_MSG((funcname)); \
5286 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
5288 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
5289 DEBUG_PARSE_MSG((funcname)); \
5290 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
5293 - reg - regular expression, i.e. main body or parenthesized thing
5295 * Caller must absorb opening parenthesis.
5297 * Combining parenthesis handling with the base level of regular expression
5298 * is a trifle forced, but the need to tie the tails of the branches to what
5299 * follows makes it hard to avoid.
5301 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
5303 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
5305 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
5309 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
5310 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
5313 register regnode *ret; /* Will be the head of the group. */
5314 register regnode *br;
5315 register regnode *lastbr;
5316 register regnode *ender = NULL;
5317 register I32 parno = 0;
5319 U32 oregflags = RExC_flags;
5320 bool have_branch = 0;
5322 I32 freeze_paren = 0;
5323 I32 after_freeze = 0;
5325 /* for (?g), (?gc), and (?o) warnings; warning
5326 about (?c) will warn about (?g) -- japhy */
5328 #define WASTED_O 0x01
5329 #define WASTED_G 0x02
5330 #define WASTED_C 0x04
5331 #define WASTED_GC (0x02|0x04)
5332 I32 wastedflags = 0x00;
5334 char * parse_start = RExC_parse; /* MJD */
5335 char * const oregcomp_parse = RExC_parse;
5337 GET_RE_DEBUG_FLAGS_DECL;
5338 DEBUG_PARSE("reg ");
5340 *flagp = 0; /* Tentatively. */
5343 /* Make an OPEN node, if parenthesized. */
5345 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
5346 char *start_verb = RExC_parse;
5347 STRLEN verb_len = 0;
5348 char *start_arg = NULL;
5349 unsigned char op = 0;
5351 int internal_argval = 0; /* internal_argval is only useful if !argok */
5352 while ( *RExC_parse && *RExC_parse != ')' ) {
5353 if ( *RExC_parse == ':' ) {
5354 start_arg = RExC_parse + 1;
5360 verb_len = RExC_parse - start_verb;
5363 while ( *RExC_parse && *RExC_parse != ')' )
5365 if ( *RExC_parse != ')' )
5366 vFAIL("Unterminated verb pattern argument");
5367 if ( RExC_parse == start_arg )
5370 if ( *RExC_parse != ')' )
5371 vFAIL("Unterminated verb pattern");
5374 switch ( *start_verb ) {
5375 case 'A': /* (*ACCEPT) */
5376 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
5378 internal_argval = RExC_nestroot;
5381 case 'C': /* (*COMMIT) */
5382 if ( memEQs(start_verb,verb_len,"COMMIT") )
5385 case 'F': /* (*FAIL) */
5386 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
5391 case ':': /* (*:NAME) */
5392 case 'M': /* (*MARK:NAME) */
5393 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
5398 case 'P': /* (*PRUNE) */
5399 if ( memEQs(start_verb,verb_len,"PRUNE") )
5402 case 'S': /* (*SKIP) */
5403 if ( memEQs(start_verb,verb_len,"SKIP") )
5406 case 'T': /* (*THEN) */
5407 /* [19:06] <TimToady> :: is then */
5408 if ( memEQs(start_verb,verb_len,"THEN") ) {
5410 RExC_seen |= REG_SEEN_CUTGROUP;
5416 vFAIL3("Unknown verb pattern '%.*s'",
5417 verb_len, start_verb);
5420 if ( start_arg && internal_argval ) {
5421 vFAIL3("Verb pattern '%.*s' may not have an argument",
5422 verb_len, start_verb);
5423 } else if ( argok < 0 && !start_arg ) {
5424 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
5425 verb_len, start_verb);
5427 ret = reganode(pRExC_state, op, internal_argval);
5428 if ( ! internal_argval && ! SIZE_ONLY ) {
5430 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
5431 ARG(ret) = add_data( pRExC_state, 1, "S" );
5432 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
5439 if (!internal_argval)
5440 RExC_seen |= REG_SEEN_VERBARG;
5441 } else if ( start_arg ) {
5442 vFAIL3("Verb pattern '%.*s' may not have an argument",
5443 verb_len, start_verb);
5445 ret = reg_node(pRExC_state, op);
5447 nextchar(pRExC_state);
5450 if (*RExC_parse == '?') { /* (?...) */
5451 bool is_logical = 0;
5452 const char * const seqstart = RExC_parse;
5455 paren = *RExC_parse++;
5456 ret = NULL; /* For look-ahead/behind. */
5459 case 'P': /* (?P...) variants for those used to PCRE/Python */
5460 paren = *RExC_parse++;
5461 if ( paren == '<') /* (?P<...>) named capture */
5463 else if (paren == '>') { /* (?P>name) named recursion */
5464 goto named_recursion;
5466 else if (paren == '=') { /* (?P=...) named backref */
5467 /* this pretty much dupes the code for \k<NAME> in regatom(), if
5468 you change this make sure you change that */
5469 char* name_start = RExC_parse;
5471 SV *sv_dat = reg_scan_name(pRExC_state,
5472 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5473 if (RExC_parse == name_start || *RExC_parse != ')')
5474 vFAIL2("Sequence %.3s... not terminated",parse_start);
5477 num = add_data( pRExC_state, 1, "S" );
5478 RExC_rxi->data->data[num]=(void*)sv_dat;
5479 SvREFCNT_inc_simple_void(sv_dat);
5482 ret = reganode(pRExC_state,
5483 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
5487 Set_Node_Offset(ret, parse_start+1);
5488 Set_Node_Cur_Length(ret); /* MJD */
5490 nextchar(pRExC_state);
5494 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5496 case '<': /* (?<...) */
5497 if (*RExC_parse == '!')
5499 else if (*RExC_parse != '=')
5505 case '\'': /* (?'...') */
5506 name_start= RExC_parse;
5507 svname = reg_scan_name(pRExC_state,
5508 SIZE_ONLY ? /* reverse test from the others */
5509 REG_RSN_RETURN_NAME :
5510 REG_RSN_RETURN_NULL);
5511 if (RExC_parse == name_start) {
5513 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5516 if (*RExC_parse != paren)
5517 vFAIL2("Sequence (?%c... not terminated",
5518 paren=='>' ? '<' : paren);
5522 if (!svname) /* shouldnt happen */
5524 "panic: reg_scan_name returned NULL");
5525 if (!RExC_paren_names) {
5526 RExC_paren_names= newHV();
5527 sv_2mortal((SV*)RExC_paren_names);
5529 RExC_paren_name_list= newAV();
5530 sv_2mortal((SV*)RExC_paren_name_list);
5533 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
5535 sv_dat = HeVAL(he_str);
5537 /* croak baby croak */
5539 "panic: paren_name hash element allocation failed");
5540 } else if ( SvPOK(sv_dat) ) {
5541 /* (?|...) can mean we have dupes so scan to check
5542 its already been stored. Maybe a flag indicating
5543 we are inside such a construct would be useful,
5544 but the arrays are likely to be quite small, so
5545 for now we punt -- dmq */
5546 IV count = SvIV(sv_dat);
5547 I32 *pv = (I32*)SvPVX(sv_dat);
5549 for ( i = 0 ; i < count ; i++ ) {
5550 if ( pv[i] == RExC_npar ) {
5556 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
5557 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
5558 pv[count] = RExC_npar;
5562 (void)SvUPGRADE(sv_dat,SVt_PVNV);
5563 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
5568 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
5569 SvREFCNT_dec(svname);
5572 /*sv_dump(sv_dat);*/
5574 nextchar(pRExC_state);
5576 goto capturing_parens;
5578 RExC_seen |= REG_SEEN_LOOKBEHIND;
5580 case '=': /* (?=...) */
5581 case '!': /* (?!...) */
5582 RExC_seen_zerolen++;
5583 if (*RExC_parse == ')') {
5584 ret=reg_node(pRExC_state, OPFAIL);
5585 nextchar(pRExC_state);
5589 case '|': /* (?|...) */
5590 /* branch reset, behave like a (?:...) except that
5591 buffers in alternations share the same numbers */
5593 after_freeze = freeze_paren = RExC_npar;
5595 case ':': /* (?:...) */
5596 case '>': /* (?>...) */
5598 case '$': /* (?$...) */
5599 case '@': /* (?@...) */
5600 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
5602 case '#': /* (?#...) */
5603 while (*RExC_parse && *RExC_parse != ')')
5605 if (*RExC_parse != ')')
5606 FAIL("Sequence (?#... not terminated");
5607 nextchar(pRExC_state);
5610 case '0' : /* (?0) */
5611 case 'R' : /* (?R) */
5612 if (*RExC_parse != ')')
5613 FAIL("Sequence (?R) not terminated");
5614 ret = reg_node(pRExC_state, GOSTART);
5615 *flagp |= POSTPONED;
5616 nextchar(pRExC_state);
5619 { /* named and numeric backreferences */
5621 case '&': /* (?&NAME) */
5622 parse_start = RExC_parse - 1;
5625 SV *sv_dat = reg_scan_name(pRExC_state,
5626 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5627 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5629 goto gen_recurse_regop;
5632 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5634 vFAIL("Illegal pattern");
5636 goto parse_recursion;
5638 case '-': /* (?-1) */
5639 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5640 RExC_parse--; /* rewind to let it be handled later */
5644 case '1': case '2': case '3': case '4': /* (?1) */
5645 case '5': case '6': case '7': case '8': case '9':
5648 num = atoi(RExC_parse);
5649 parse_start = RExC_parse - 1; /* MJD */
5650 if (*RExC_parse == '-')
5652 while (isDIGIT(*RExC_parse))
5654 if (*RExC_parse!=')')
5655 vFAIL("Expecting close bracket");
5658 if ( paren == '-' ) {
5660 Diagram of capture buffer numbering.
5661 Top line is the normal capture buffer numbers
5662 Botton line is the negative indexing as from
5666 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
5670 num = RExC_npar + num;
5673 vFAIL("Reference to nonexistent group");
5675 } else if ( paren == '+' ) {
5676 num = RExC_npar + num - 1;
5679 ret = reganode(pRExC_state, GOSUB, num);
5681 if (num > (I32)RExC_rx->nparens) {
5683 vFAIL("Reference to nonexistent group");
5685 ARG2L_SET( ret, RExC_recurse_count++);
5687 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5688 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
5692 RExC_seen |= REG_SEEN_RECURSE;
5693 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
5694 Set_Node_Offset(ret, parse_start); /* MJD */
5696 *flagp |= POSTPONED;
5697 nextchar(pRExC_state);
5699 } /* named and numeric backreferences */
5702 case '?': /* (??...) */
5704 if (*RExC_parse != '{') {
5706 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5709 *flagp |= POSTPONED;
5710 paren = *RExC_parse++;
5712 case '{': /* (?{...}) */
5717 char *s = RExC_parse;
5719 RExC_seen_zerolen++;
5720 RExC_seen |= REG_SEEN_EVAL;
5721 while (count && (c = *RExC_parse)) {
5732 if (*RExC_parse != ')') {
5734 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
5738 OP_4tree *sop, *rop;
5739 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
5742 Perl_save_re_context(aTHX);
5743 rop = sv_compile_2op(sv, &sop, "re", &pad);
5744 sop->op_private |= OPpREFCOUNTED;
5745 /* re_dup will OpREFCNT_inc */
5746 OpREFCNT_set(sop, 1);
5749 n = add_data(pRExC_state, 3, "nop");
5750 RExC_rxi->data->data[n] = (void*)rop;
5751 RExC_rxi->data->data[n+1] = (void*)sop;
5752 RExC_rxi->data->data[n+2] = (void*)pad;
5755 else { /* First pass */
5756 if (PL_reginterp_cnt < ++RExC_seen_evals
5758 /* No compiled RE interpolated, has runtime
5759 components ===> unsafe. */
5760 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5761 if (PL_tainting && PL_tainted)
5762 FAIL("Eval-group in insecure regular expression");
5763 #if PERL_VERSION > 8
5764 if (IN_PERL_COMPILETIME)
5769 nextchar(pRExC_state);
5771 ret = reg_node(pRExC_state, LOGICAL);
5774 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
5775 /* deal with the length of this later - MJD */
5778 ret = reganode(pRExC_state, EVAL, n);
5779 Set_Node_Length(ret, RExC_parse - parse_start + 1);
5780 Set_Node_Offset(ret, parse_start);
5783 case '(': /* (?(?{...})...) and (?(?=...)...) */
5786 if (RExC_parse[0] == '?') { /* (?(?...)) */
5787 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
5788 || RExC_parse[1] == '<'
5789 || RExC_parse[1] == '{') { /* Lookahead or eval. */
5792 ret = reg_node(pRExC_state, LOGICAL);
5795 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
5799 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
5800 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
5802 char ch = RExC_parse[0] == '<' ? '>' : '\'';
5803 char *name_start= RExC_parse++;
5805 SV *sv_dat=reg_scan_name(pRExC_state,
5806 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5807 if (RExC_parse == name_start || *RExC_parse != ch)
5808 vFAIL2("Sequence (?(%c... not terminated",
5809 (ch == '>' ? '<' : ch));
5812 num = add_data( pRExC_state, 1, "S" );
5813 RExC_rxi->data->data[num]=(void*)sv_dat;
5814 SvREFCNT_inc_simple_void(sv_dat);
5816 ret = reganode(pRExC_state,NGROUPP,num);
5817 goto insert_if_check_paren;
5819 else if (RExC_parse[0] == 'D' &&
5820 RExC_parse[1] == 'E' &&
5821 RExC_parse[2] == 'F' &&
5822 RExC_parse[3] == 'I' &&
5823 RExC_parse[4] == 'N' &&
5824 RExC_parse[5] == 'E')
5826 ret = reganode(pRExC_state,DEFINEP,0);
5829 goto insert_if_check_paren;
5831 else if (RExC_parse[0] == 'R') {
5834 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5835 parno = atoi(RExC_parse++);
5836 while (isDIGIT(*RExC_parse))
5838 } else if (RExC_parse[0] == '&') {
5841 sv_dat = reg_scan_name(pRExC_state,
5842 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5843 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5845 ret = reganode(pRExC_state,INSUBP,parno);
5846 goto insert_if_check_paren;
5848 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5851 parno = atoi(RExC_parse++);
5853 while (isDIGIT(*RExC_parse))
5855 ret = reganode(pRExC_state, GROUPP, parno);
5857 insert_if_check_paren:
5858 if ((c = *nextchar(pRExC_state)) != ')')
5859 vFAIL("Switch condition not recognized");
5861 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
5862 br = regbranch(pRExC_state, &flags, 1,depth+1);
5864 br = reganode(pRExC_state, LONGJMP, 0);
5866 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
5867 c = *nextchar(pRExC_state);
5872 vFAIL("(?(DEFINE)....) does not allow branches");
5873 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
5874 regbranch(pRExC_state, &flags, 1,depth+1);
5875 REGTAIL(pRExC_state, ret, lastbr);
5878 c = *nextchar(pRExC_state);
5883 vFAIL("Switch (?(condition)... contains too many branches");
5884 ender = reg_node(pRExC_state, TAIL);
5885 REGTAIL(pRExC_state, br, ender);
5887 REGTAIL(pRExC_state, lastbr, ender);
5888 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
5891 REGTAIL(pRExC_state, ret, ender);
5892 RExC_size++; /* XXX WHY do we need this?!!
5893 For large programs it seems to be required
5894 but I can't figure out why. -- dmq*/
5898 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
5902 RExC_parse--; /* for vFAIL to print correctly */
5903 vFAIL("Sequence (? incomplete");
5907 parse_flags: /* (?i) */
5909 U32 posflags = 0, negflags = 0;
5910 U32 *flagsp = &posflags;
5912 while (*RExC_parse) {
5913 /* && strchr("iogcmsx", *RExC_parse) */
5914 /* (?g), (?gc) and (?o) are useless here
5915 and must be globally applied -- japhy */
5916 switch (*RExC_parse) {
5917 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
5918 case ONCE_PAT_MOD: /* 'o' */
5919 case GLOBAL_PAT_MOD: /* 'g' */
5920 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5921 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
5922 if (! (wastedflags & wflagbit) ) {
5923 wastedflags |= wflagbit;
5926 "Useless (%s%c) - %suse /%c modifier",
5927 flagsp == &negflags ? "?-" : "?",
5929 flagsp == &negflags ? "don't " : "",
5936 case CONTINUE_PAT_MOD: /* 'c' */
5937 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5938 if (! (wastedflags & WASTED_C) ) {
5939 wastedflags |= WASTED_GC;
5942 "Useless (%sc) - %suse /gc modifier",
5943 flagsp == &negflags ? "?-" : "?",
5944 flagsp == &negflags ? "don't " : ""
5949 case KEEPCOPY_PAT_MOD: /* 'p' */
5950 if (flagsp == &negflags) {
5951 if (SIZE_ONLY && ckWARN(WARN_REGEXP))
5952 vWARN(RExC_parse + 1,"Useless use of (?-p)");
5954 *flagsp |= RXf_PMf_KEEPCOPY;
5958 if (flagsp == &negflags) {
5960 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5964 wastedflags = 0; /* reset so (?g-c) warns twice */
5970 RExC_flags |= posflags;
5971 RExC_flags &= ~negflags;
5973 oregflags |= posflags;
5974 oregflags &= ~negflags;
5976 nextchar(pRExC_state);
5987 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5992 }} /* one for the default block, one for the switch */
5999 ret = reganode(pRExC_state, OPEN, parno);
6002 RExC_nestroot = parno;
6003 if (RExC_seen & REG_SEEN_RECURSE
6004 && !RExC_open_parens[parno-1])
6006 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6007 "Setting open paren #%"IVdf" to %d\n",
6008 (IV)parno, REG_NODE_NUM(ret)));
6009 RExC_open_parens[parno-1]= ret;
6012 Set_Node_Length(ret, 1); /* MJD */
6013 Set_Node_Offset(ret, RExC_parse); /* MJD */
6021 /* Pick up the branches, linking them together. */
6022 parse_start = RExC_parse; /* MJD */
6023 br = regbranch(pRExC_state, &flags, 1,depth+1);
6024 /* branch_len = (paren != 0); */
6028 if (*RExC_parse == '|') {
6029 if (!SIZE_ONLY && RExC_extralen) {
6030 reginsert(pRExC_state, BRANCHJ, br, depth+1);
6033 reginsert(pRExC_state, BRANCH, br, depth+1);
6034 Set_Node_Length(br, paren != 0);
6035 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
6039 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
6041 else if (paren == ':') {
6042 *flagp |= flags&SIMPLE;
6044 if (is_open) { /* Starts with OPEN. */
6045 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
6047 else if (paren != '?') /* Not Conditional */
6049 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6051 while (*RExC_parse == '|') {
6052 if (!SIZE_ONLY && RExC_extralen) {
6053 ender = reganode(pRExC_state, LONGJMP,0);
6054 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
6057 RExC_extralen += 2; /* Account for LONGJMP. */
6058 nextchar(pRExC_state);
6060 if (RExC_npar > after_freeze)
6061 after_freeze = RExC_npar;
6062 RExC_npar = freeze_paren;
6064 br = regbranch(pRExC_state, &flags, 0, depth+1);
6068 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
6070 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6073 if (have_branch || paren != ':') {
6074 /* Make a closing node, and hook it on the end. */
6077 ender = reg_node(pRExC_state, TAIL);
6080 ender = reganode(pRExC_state, CLOSE, parno);
6081 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
6082 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6083 "Setting close paren #%"IVdf" to %d\n",
6084 (IV)parno, REG_NODE_NUM(ender)));
6085 RExC_close_parens[parno-1]= ender;
6086 if (RExC_nestroot == parno)
6089 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
6090 Set_Node_Length(ender,1); /* MJD */
6096 *flagp &= ~HASWIDTH;
6099 ender = reg_node(pRExC_state, SUCCEED);
6102 ender = reg_node(pRExC_state, END);
6104 assert(!RExC_opend); /* there can only be one! */
6109 REGTAIL(pRExC_state, lastbr, ender);
6111 if (have_branch && !SIZE_ONLY) {
6113 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6115 /* Hook the tails of the branches to the closing node. */
6116 for (br = ret; br; br = regnext(br)) {
6117 const U8 op = PL_regkind[OP(br)];
6119 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
6121 else if (op == BRANCHJ) {
6122 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
6130 static const char parens[] = "=!<,>";
6132 if (paren && (p = strchr(parens, paren))) {
6133 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
6134 int flag = (p - parens) > 1;
6137 node = SUSPEND, flag = 0;
6138 reginsert(pRExC_state, node,ret, depth+1);
6139 Set_Node_Cur_Length(ret);
6140 Set_Node_Offset(ret, parse_start + 1);
6142 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
6146 /* Check for proper termination. */
6148 RExC_flags = oregflags;
6149 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
6150 RExC_parse = oregcomp_parse;
6151 vFAIL("Unmatched (");
6154 else if (!paren && RExC_parse < RExC_end) {
6155 if (*RExC_parse == ')') {
6157 vFAIL("Unmatched )");
6160 FAIL("Junk on end of regexp"); /* "Can't happen". */
6164 RExC_npar = after_freeze;
6169 - regbranch - one alternative of an | operator
6171 * Implements the concatenation operator.
6174 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
6177 register regnode *ret;
6178 register regnode *chain = NULL;
6179 register regnode *latest;
6180 I32 flags = 0, c = 0;
6181 GET_RE_DEBUG_FLAGS_DECL;
6182 DEBUG_PARSE("brnc");
6187 if (!SIZE_ONLY && RExC_extralen)
6188 ret = reganode(pRExC_state, BRANCHJ,0);
6190 ret = reg_node(pRExC_state, BRANCH);
6191 Set_Node_Length(ret, 1);
6195 if (!first && SIZE_ONLY)
6196 RExC_extralen += 1; /* BRANCHJ */
6198 *flagp = WORST; /* Tentatively. */
6201 nextchar(pRExC_state);
6202 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
6204 latest = regpiece(pRExC_state, &flags,depth+1);
6205 if (latest == NULL) {
6206 if (flags & TRYAGAIN)
6210 else if (ret == NULL)
6212 *flagp |= flags&(HASWIDTH|POSTPONED);
6213 if (chain == NULL) /* First piece. */
6214 *flagp |= flags&SPSTART;
6217 REGTAIL(pRExC_state, chain, latest);
6222 if (chain == NULL) { /* Loop ran zero times. */
6223 chain = reg_node(pRExC_state, NOTHING);
6228 *flagp |= flags&SIMPLE;
6235 - regpiece - something followed by possible [*+?]
6237 * Note that the branching code sequences used for ? and the general cases
6238 * of * and + are somewhat optimized: they use the same NOTHING node as
6239 * both the endmarker for their branch list and the body of the last branch.
6240 * It might seem that this node could be dispensed with entirely, but the
6241 * endmarker role is not redundant.
6244 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6247 register regnode *ret;
6249 register char *next;
6251 const char * const origparse = RExC_parse;
6253 I32 max = REG_INFTY;
6255 const char *maxpos = NULL;
6256 GET_RE_DEBUG_FLAGS_DECL;
6257 DEBUG_PARSE("piec");
6259 ret = regatom(pRExC_state, &flags,depth+1);
6261 if (flags & TRYAGAIN)
6268 if (op == '{' && regcurly(RExC_parse)) {
6270 parse_start = RExC_parse; /* MJD */
6271 next = RExC_parse + 1;
6272 while (isDIGIT(*next) || *next == ',') {
6281 if (*next == '}') { /* got one */
6285 min = atoi(RExC_parse);
6289 maxpos = RExC_parse;
6291 if (!max && *maxpos != '0')
6292 max = REG_INFTY; /* meaning "infinity" */
6293 else if (max >= REG_INFTY)
6294 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
6296 nextchar(pRExC_state);
6299 if ((flags&SIMPLE)) {
6300 RExC_naughty += 2 + RExC_naughty / 2;
6301 reginsert(pRExC_state, CURLY, ret, depth+1);
6302 Set_Node_Offset(ret, parse_start+1); /* MJD */
6303 Set_Node_Cur_Length(ret);
6306 regnode * const w = reg_node(pRExC_state, WHILEM);
6309 REGTAIL(pRExC_state, ret, w);
6310 if (!SIZE_ONLY && RExC_extralen) {
6311 reginsert(pRExC_state, LONGJMP,ret, depth+1);
6312 reginsert(pRExC_state, NOTHING,ret, depth+1);
6313 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
6315 reginsert(pRExC_state, CURLYX,ret, depth+1);
6317 Set_Node_Offset(ret, parse_start+1);
6318 Set_Node_Length(ret,
6319 op == '{' ? (RExC_parse - parse_start) : 1);
6321 if (!SIZE_ONLY && RExC_extralen)
6322 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
6323 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
6325 RExC_whilem_seen++, RExC_extralen += 3;
6326 RExC_naughty += 4 + RExC_naughty; /* compound interest */
6334 if (max && max < min)
6335 vFAIL("Can't do {n,m} with n > m");
6337 ARG1_SET(ret, (U16)min);
6338 ARG2_SET(ret, (U16)max);
6350 #if 0 /* Now runtime fix should be reliable. */
6352 /* if this is reinstated, don't forget to put this back into perldiag:
6354 =item Regexp *+ operand could be empty at {#} in regex m/%s/
6356 (F) The part of the regexp subject to either the * or + quantifier
6357 could match an empty string. The {#} shows in the regular
6358 expression about where the problem was discovered.
6362 if (!(flags&HASWIDTH) && op != '?')
6363 vFAIL("Regexp *+ operand could be empty");
6366 parse_start = RExC_parse;
6367 nextchar(pRExC_state);
6369 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
6371 if (op == '*' && (flags&SIMPLE)) {
6372 reginsert(pRExC_state, STAR, ret, depth+1);
6376 else if (op == '*') {
6380 else if (op == '+' && (flags&SIMPLE)) {
6381 reginsert(pRExC_state, PLUS, ret, depth+1);
6385 else if (op == '+') {
6389 else if (op == '?') {
6394 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
6396 "%.*s matches null string many times",
6397 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
6401 if (RExC_parse < RExC_end && *RExC_parse == '?') {
6402 nextchar(pRExC_state);
6403 reginsert(pRExC_state, MINMOD, ret, depth+1);
6404 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
6406 #ifndef REG_ALLOW_MINMOD_SUSPEND
6409 if (RExC_parse < RExC_end && *RExC_parse == '+') {
6411 nextchar(pRExC_state);
6412 ender = reg_node(pRExC_state, SUCCEED);
6413 REGTAIL(pRExC_state, ret, ender);
6414 reginsert(pRExC_state, SUSPEND, ret, depth+1);
6416 ender = reg_node(pRExC_state, TAIL);
6417 REGTAIL(pRExC_state, ret, ender);
6421 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
6423 vFAIL("Nested quantifiers");
6430 /* reg_namedseq(pRExC_state,UVp)
6432 This is expected to be called by a parser routine that has
6433 recognized'\N' and needs to handle the rest. RExC_parse is
6434 expected to point at the first char following the N at the time
6437 If valuep is non-null then it is assumed that we are parsing inside
6438 of a charclass definition and the first codepoint in the resolved
6439 string is returned via *valuep and the routine will return NULL.
6440 In this mode if a multichar string is returned from the charnames
6441 handler a warning will be issued, and only the first char in the
6442 sequence will be examined. If the string returned is zero length
6443 then the value of *valuep is undefined and NON-NULL will
6444 be returned to indicate failure. (This will NOT be a valid pointer
6447 If value is null then it is assumed that we are parsing normal text
6448 and inserts a new EXACT node into the program containing the resolved
6449 string and returns a pointer to the new node. If the string is
6450 zerolength a NOTHING node is emitted.
6452 On success RExC_parse is set to the char following the endbrace.
6453 Parsing failures will generate a fatal errorvia vFAIL(...)
6455 NOTE: We cache all results from the charnames handler locally in
6456 the RExC_charnames hash (created on first use) to prevent a charnames
6457 handler from playing silly-buggers and returning a short string and
6458 then a long string for a given pattern. Since the regexp program
6459 size is calculated during an initial parse this would result
6460 in a buffer overrun so we cache to prevent the charname result from
6461 changing during the course of the parse.
6465 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep)
6467 char * name; /* start of the content of the name */
6468 char * endbrace; /* endbrace following the name */
6471 STRLEN len; /* this has various purposes throughout the code */
6472 bool cached = 0; /* if this is true then we shouldn't refcount dev sv_str */
6473 regnode *ret = NULL;
6475 if (*RExC_parse != '{') {
6476 vFAIL("Missing braces on \\N{}");
6478 name = RExC_parse+1;
6479 endbrace = strchr(RExC_parse, '}');
6482 vFAIL("Missing right brace on \\N{}");
6484 RExC_parse = endbrace + 1;
6487 /* RExC_parse points at the beginning brace,
6488 endbrace points at the last */
6489 if ( name[0]=='U' && name[1]=='+' ) {
6490 /* its a "Unicode hex" notation {U+89AB} */
6491 I32 fl = PERL_SCAN_ALLOW_UNDERSCORES
6492 | PERL_SCAN_DISALLOW_PREFIX
6493 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
6496 len = (STRLEN)(endbrace - name - 2);
6497 cp = grok_hex(name + 2, &len, &fl, NULL);
6498 if ( len != (STRLEN)(endbrace - name - 2) ) {
6508 sv_str= newSVpvn(&string, 1);
6510 /* fetch the charnames handler for this scope */
6511 HV * const table = GvHV(PL_hintgv);
6513 hv_fetchs(table, "charnames", FALSE) :
6515 SV *cv= cvp ? *cvp : NULL;
6518 /* create an SV with the name as argument */
6519 sv_name = newSVpvn(name, endbrace - name);
6521 if (!table || !(PL_hints & HINT_LOCALIZE_HH)) {
6522 vFAIL2("Constant(\\N{%s}) unknown: "
6523 "(possibly a missing \"use charnames ...\")",
6526 if (!cvp || !SvOK(*cvp)) { /* when $^H{charnames} = undef; */
6527 vFAIL2("Constant(\\N{%s}): "
6528 "$^H{charnames} is not defined",SvPVX(sv_name));
6533 if (!RExC_charnames) {
6534 /* make sure our cache is allocated */
6535 RExC_charnames = newHV();
6536 sv_2mortal((SV*)RExC_charnames);
6538 /* see if we have looked this one up before */
6539 he_str = hv_fetch_ent( RExC_charnames, sv_name, 0, 0 );
6541 sv_str = HeVAL(he_str);
6554 count= call_sv(cv, G_SCALAR);
6556 if (count == 1) { /* XXXX is this right? dmq */
6558 SvREFCNT_inc_simple_void(sv_str);
6566 if ( !sv_str || !SvOK(sv_str) ) {
6567 vFAIL2("Constant(\\N{%s}): Call to &{$^H{charnames}} "
6568 "did not return a defined value",SvPVX(sv_name));
6570 if (hv_store_ent( RExC_charnames, sv_name, sv_str, 0))
6575 char *p = SvPV(sv_str, len);
6578 if ( SvUTF8(sv_str) ) {
6579 *valuep = utf8_to_uvchr((U8*)p, &numlen);
6583 We have to turn on utf8 for high bit chars otherwise
6584 we get failures with
6586 "ss" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
6587 "SS" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
6589 This is different from what \x{} would do with the same
6590 codepoint, where the condition is > 0xFF.
6597 /* warn if we havent used the whole string? */
6599 if (numlen<len && SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6601 "Ignoring excess chars from \\N{%s} in character class",
6605 } else if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6607 "Ignoring zero length \\N{%s} in character class",
6612 SvREFCNT_dec(sv_name);
6614 SvREFCNT_dec(sv_str);
6615 return len ? NULL : (regnode *)&len;
6616 } else if(SvCUR(sv_str)) {
6622 char * parse_start = name-3; /* needed for the offsets */
6624 GET_RE_DEBUG_FLAGS_DECL; /* needed for the offsets */
6626 ret = reg_node(pRExC_state,
6627 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
6630 if ( RExC_utf8 && !SvUTF8(sv_str) ) {
6631 sv_utf8_upgrade(sv_str);
6632 } else if ( !RExC_utf8 && SvUTF8(sv_str) ) {
6636 p = SvPV(sv_str, len);
6638 /* len is the length written, charlen is the size the char read */
6639 for ( len = 0; p < pend; p += charlen ) {
6641 UV uvc = utf8_to_uvchr((U8*)p, &charlen);
6643 STRLEN foldlen,numlen;
6644 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
6645 uvc = toFOLD_uni(uvc, tmpbuf, &foldlen);
6646 /* Emit all the Unicode characters. */
6648 for (foldbuf = tmpbuf;
6652 uvc = utf8_to_uvchr(foldbuf, &numlen);
6654 const STRLEN unilen = reguni(pRExC_state, uvc, s);
6657 /* In EBCDIC the numlen
6658 * and unilen can differ. */
6660 if (numlen >= foldlen)
6664 break; /* "Can't happen." */
6667 const STRLEN unilen = reguni(pRExC_state, uvc, s);
6679 RExC_size += STR_SZ(len);
6682 RExC_emit += STR_SZ(len);
6684 Set_Node_Cur_Length(ret); /* MJD */
6686 nextchar(pRExC_state);
6688 ret = reg_node(pRExC_state,NOTHING);
6691 SvREFCNT_dec(sv_str);
6694 SvREFCNT_dec(sv_name);
6704 * It returns the code point in utf8 for the value in *encp.
6705 * value: a code value in the source encoding
6706 * encp: a pointer to an Encode object
6708 * If the result from Encode is not a single character,
6709 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
6712 S_reg_recode(pTHX_ const char value, SV **encp)
6715 SV * const sv = sv_2mortal(newSVpvn(&value, numlen));
6716 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
6717 const STRLEN newlen = SvCUR(sv);
6718 UV uv = UNICODE_REPLACEMENT;
6722 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
6725 if (!newlen || numlen != newlen) {
6726 uv = UNICODE_REPLACEMENT;
6734 - regatom - the lowest level
6736 Try to identify anything special at the start of the pattern. If there
6737 is, then handle it as required. This may involve generating a single regop,
6738 such as for an assertion; or it may involve recursing, such as to
6739 handle a () structure.
6741 If the string doesn't start with something special then we gobble up
6742 as much literal text as we can.
6744 Once we have been able to handle whatever type of thing started the
6745 sequence, we return.
6747 Note: we have to be careful with escapes, as they can be both literal
6748 and special, and in the case of \10 and friends can either, depending
6749 on context. Specifically there are two seperate switches for handling
6750 escape sequences, with the one for handling literal escapes requiring
6751 a dummy entry for all of the special escapes that are actually handled
6756 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6759 register regnode *ret = NULL;
6761 char *parse_start = RExC_parse;
6762 GET_RE_DEBUG_FLAGS_DECL;
6763 DEBUG_PARSE("atom");
6764 *flagp = WORST; /* Tentatively. */
6768 switch ((U8)*RExC_parse) {
6770 RExC_seen_zerolen++;
6771 nextchar(pRExC_state);
6772 if (RExC_flags & RXf_PMf_MULTILINE)
6773 ret = reg_node(pRExC_state, MBOL);
6774 else if (RExC_flags & RXf_PMf_SINGLELINE)
6775 ret = reg_node(pRExC_state, SBOL);
6777 ret = reg_node(pRExC_state, BOL);
6778 Set_Node_Length(ret, 1); /* MJD */
6781 nextchar(pRExC_state);
6783 RExC_seen_zerolen++;
6784 if (RExC_flags & RXf_PMf_MULTILINE)
6785 ret = reg_node(pRExC_state, MEOL);
6786 else if (RExC_flags & RXf_PMf_SINGLELINE)
6787 ret = reg_node(pRExC_state, SEOL);
6789 ret = reg_node(pRExC_state, EOL);
6790 Set_Node_Length(ret, 1); /* MJD */
6793 nextchar(pRExC_state);
6794 if (RExC_flags & RXf_PMf_SINGLELINE)
6795 ret = reg_node(pRExC_state, SANY);
6797 ret = reg_node(pRExC_state, REG_ANY);
6798 *flagp |= HASWIDTH|SIMPLE;
6800 Set_Node_Length(ret, 1); /* MJD */
6804 char * const oregcomp_parse = ++RExC_parse;
6805 ret = regclass(pRExC_state,depth+1);
6806 if (*RExC_parse != ']') {
6807 RExC_parse = oregcomp_parse;
6808 vFAIL("Unmatched [");
6810 nextchar(pRExC_state);
6811 *flagp |= HASWIDTH|SIMPLE;
6812 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
6816 nextchar(pRExC_state);
6817 ret = reg(pRExC_state, 1, &flags,depth+1);
6819 if (flags & TRYAGAIN) {
6820 if (RExC_parse == RExC_end) {
6821 /* Make parent create an empty node if needed. */
6829 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
6833 if (flags & TRYAGAIN) {
6837 vFAIL("Internal urp");
6838 /* Supposed to be caught earlier. */
6841 if (!regcurly(RExC_parse)) {
6850 vFAIL("Quantifier follows nothing");
6857 len=0; /* silence a spurious compiler warning */
6858 if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) {
6859 *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */
6860 RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */
6861 ret = reganode(pRExC_state, FOLDCHAR, cp);
6862 Set_Node_Length(ret, 1); /* MJD */
6863 nextchar(pRExC_state); /* kill whitespace under /x */
6871 This switch handles escape sequences that resolve to some kind
6872 of special regop and not to literal text. Escape sequnces that
6873 resolve to literal text are handled below in the switch marked
6876 Every entry in this switch *must* have a corresponding entry
6877 in the literal escape switch. However, the opposite is not
6878 required, as the default for this switch is to jump to the
6879 literal text handling code.
6881 switch (*++RExC_parse) {
6882 /* Special Escapes */
6884 RExC_seen_zerolen++;
6885 ret = reg_node(pRExC_state, SBOL);
6887 goto finish_meta_pat;
6889 ret = reg_node(pRExC_state, GPOS);
6890 RExC_seen |= REG_SEEN_GPOS;
6892 goto finish_meta_pat;
6894 RExC_seen_zerolen++;
6895 ret = reg_node(pRExC_state, KEEPS);
6897 goto finish_meta_pat;
6899 ret = reg_node(pRExC_state, SEOL);
6901 RExC_seen_zerolen++; /* Do not optimize RE away */
6902 goto finish_meta_pat;
6904 ret = reg_node(pRExC_state, EOS);
6906 RExC_seen_zerolen++; /* Do not optimize RE away */
6907 goto finish_meta_pat;
6909 ret = reg_node(pRExC_state, CANY);
6910 RExC_seen |= REG_SEEN_CANY;
6911 *flagp |= HASWIDTH|SIMPLE;
6912 goto finish_meta_pat;
6914 ret = reg_node(pRExC_state, CLUMP);
6916 goto finish_meta_pat;
6918 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
6919 *flagp |= HASWIDTH|SIMPLE;
6920 goto finish_meta_pat;
6922 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
6923 *flagp |= HASWIDTH|SIMPLE;
6924 goto finish_meta_pat;
6926 RExC_seen_zerolen++;
6927 RExC_seen |= REG_SEEN_LOOKBEHIND;
6928 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
6930 goto finish_meta_pat;
6932 RExC_seen_zerolen++;
6933 RExC_seen |= REG_SEEN_LOOKBEHIND;
6934 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
6936 goto finish_meta_pat;
6938 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
6939 *flagp |= HASWIDTH|SIMPLE;
6940 goto finish_meta_pat;
6942 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
6943 *flagp |= HASWIDTH|SIMPLE;
6944 goto finish_meta_pat;
6946 ret = reg_node(pRExC_state, DIGIT);
6947 *flagp |= HASWIDTH|SIMPLE;
6948 goto finish_meta_pat;
6950 ret = reg_node(pRExC_state, NDIGIT);
6951 *flagp |= HASWIDTH|SIMPLE;
6952 goto finish_meta_pat;
6954 ret = reg_node(pRExC_state, LNBREAK);
6955 *flagp |= HASWIDTH|SIMPLE;
6956 goto finish_meta_pat;
6958 ret = reg_node(pRExC_state, HORIZWS);
6959 *flagp |= HASWIDTH|SIMPLE;
6960 goto finish_meta_pat;
6962 ret = reg_node(pRExC_state, NHORIZWS);
6963 *flagp |= HASWIDTH|SIMPLE;
6964 goto finish_meta_pat;
6966 ret = reg_node(pRExC_state, VERTWS);
6967 *flagp |= HASWIDTH|SIMPLE;
6968 goto finish_meta_pat;
6970 ret = reg_node(pRExC_state, NVERTWS);
6971 *flagp |= HASWIDTH|SIMPLE;
6973 nextchar(pRExC_state);
6974 Set_Node_Length(ret, 2); /* MJD */
6979 char* const oldregxend = RExC_end;
6981 char* parse_start = RExC_parse - 2;
6984 if (RExC_parse[1] == '{') {
6985 /* a lovely hack--pretend we saw [\pX] instead */
6986 RExC_end = strchr(RExC_parse, '}');
6988 const U8 c = (U8)*RExC_parse;
6990 RExC_end = oldregxend;
6991 vFAIL2("Missing right brace on \\%c{}", c);
6996 RExC_end = RExC_parse + 2;
6997 if (RExC_end > oldregxend)
6998 RExC_end = oldregxend;
7002 ret = regclass(pRExC_state,depth+1);
7004 RExC_end = oldregxend;
7007 Set_Node_Offset(ret, parse_start + 2);
7008 Set_Node_Cur_Length(ret);
7009 nextchar(pRExC_state);
7010 *flagp |= HASWIDTH|SIMPLE;
7014 /* Handle \N{NAME} here and not below because it can be
7015 multicharacter. join_exact() will join them up later on.
7016 Also this makes sure that things like /\N{BLAH}+/ and
7017 \N{BLAH} being multi char Just Happen. dmq*/
7019 ret= reg_namedseq(pRExC_state, NULL);
7021 case 'k': /* Handle \k<NAME> and \k'NAME' */
7024 char ch= RExC_parse[1];
7025 if (ch != '<' && ch != '\'' && ch != '{') {
7027 vFAIL2("Sequence %.2s... not terminated",parse_start);
7029 /* this pretty much dupes the code for (?P=...) in reg(), if
7030 you change this make sure you change that */
7031 char* name_start = (RExC_parse += 2);
7033 SV *sv_dat = reg_scan_name(pRExC_state,
7034 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7035 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
7036 if (RExC_parse == name_start || *RExC_parse != ch)
7037 vFAIL2("Sequence %.3s... not terminated",parse_start);
7040 num = add_data( pRExC_state, 1, "S" );
7041 RExC_rxi->data->data[num]=(void*)sv_dat;
7042 SvREFCNT_inc_simple_void(sv_dat);
7046 ret = reganode(pRExC_state,
7047 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
7051 /* override incorrect value set in reganode MJD */
7052 Set_Node_Offset(ret, parse_start+1);
7053 Set_Node_Cur_Length(ret); /* MJD */
7054 nextchar(pRExC_state);
7060 case '1': case '2': case '3': case '4':
7061 case '5': case '6': case '7': case '8': case '9':
7064 bool isg = *RExC_parse == 'g';
7069 if (*RExC_parse == '{') {
7073 if (*RExC_parse == '-') {
7077 if (hasbrace && !isDIGIT(*RExC_parse)) {
7078 if (isrel) RExC_parse--;
7080 goto parse_named_seq;
7082 num = atoi(RExC_parse);
7083 if (isg && num == 0)
7084 vFAIL("Reference to invalid group 0");
7086 num = RExC_npar - num;
7088 vFAIL("Reference to nonexistent or unclosed group");
7090 if (!isg && num > 9 && num >= RExC_npar)
7093 char * const parse_start = RExC_parse - 1; /* MJD */
7094 while (isDIGIT(*RExC_parse))
7096 if (parse_start == RExC_parse - 1)
7097 vFAIL("Unterminated \\g... pattern");
7099 if (*RExC_parse != '}')
7100 vFAIL("Unterminated \\g{...} pattern");
7104 if (num > (I32)RExC_rx->nparens)
7105 vFAIL("Reference to nonexistent group");
7108 ret = reganode(pRExC_state,
7109 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
7113 /* override incorrect value set in reganode MJD */
7114 Set_Node_Offset(ret, parse_start+1);
7115 Set_Node_Cur_Length(ret); /* MJD */
7117 nextchar(pRExC_state);
7122 if (RExC_parse >= RExC_end)
7123 FAIL("Trailing \\");
7126 /* Do not generate "unrecognized" warnings here, we fall
7127 back into the quick-grab loop below */
7134 if (RExC_flags & RXf_PMf_EXTENDED) {
7135 if ( reg_skipcomment( pRExC_state ) )
7142 register STRLEN len;
7147 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7149 parse_start = RExC_parse - 1;
7155 ret = reg_node(pRExC_state,
7156 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
7158 for (len = 0, p = RExC_parse - 1;
7159 len < 127 && p < RExC_end;
7162 char * const oldp = p;
7164 if (RExC_flags & RXf_PMf_EXTENDED)
7165 p = regwhite( pRExC_state, p );
7170 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7171 goto normal_default;
7181 /* Literal Escapes Switch
7183 This switch is meant to handle escape sequences that
7184 resolve to a literal character.
7186 Every escape sequence that represents something
7187 else, like an assertion or a char class, is handled
7188 in the switch marked 'Special Escapes' above in this
7189 routine, but also has an entry here as anything that
7190 isn't explicitly mentioned here will be treated as
7191 an unescaped equivalent literal.
7195 /* These are all the special escapes. */
7196 case 'A': /* Start assertion */
7197 case 'b': case 'B': /* Word-boundary assertion*/
7198 case 'C': /* Single char !DANGEROUS! */
7199 case 'd': case 'D': /* digit class */
7200 case 'g': case 'G': /* generic-backref, pos assertion */
7201 case 'h': case 'H': /* HORIZWS */
7202 case 'k': case 'K': /* named backref, keep marker */
7203 case 'N': /* named char sequence */
7204 case 'p': case 'P': /* Unicode property */
7205 case 'R': /* LNBREAK */
7206 case 's': case 'S': /* space class */
7207 case 'v': case 'V': /* VERTWS */
7208 case 'w': case 'W': /* word class */
7209 case 'X': /* eXtended Unicode "combining character sequence" */
7210 case 'z': case 'Z': /* End of line/string assertion */
7214 /* Anything after here is an escape that resolves to a
7215 literal. (Except digits, which may or may not)
7234 ender = ASCII_TO_NATIVE('\033');
7238 ender = ASCII_TO_NATIVE('\007');
7243 char* const e = strchr(p, '}');
7247 vFAIL("Missing right brace on \\x{}");
7250 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7251 | PERL_SCAN_DISALLOW_PREFIX;
7252 STRLEN numlen = e - p - 1;
7253 ender = grok_hex(p + 1, &numlen, &flags, NULL);
7260 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
7262 ender = grok_hex(p, &numlen, &flags, NULL);
7265 if (PL_encoding && ender < 0x100)
7266 goto recode_encoding;
7270 ender = UCHARAT(p++);
7271 ender = toCTRL(ender);
7273 case '0': case '1': case '2': case '3':case '4':
7274 case '5': case '6': case '7': case '8':case '9':
7276 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
7279 ender = grok_oct(p, &numlen, &flags, NULL);
7286 if (PL_encoding && ender < 0x100)
7287 goto recode_encoding;
7291 SV* enc = PL_encoding;
7292 ender = reg_recode((const char)(U8)ender, &enc);
7293 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
7294 vWARN(p, "Invalid escape in the specified encoding");
7300 FAIL("Trailing \\");
7303 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
7304 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
7305 goto normal_default;
7310 if (UTF8_IS_START(*p) && UTF) {
7312 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
7313 &numlen, UTF8_ALLOW_DEFAULT);
7320 if ( RExC_flags & RXf_PMf_EXTENDED)
7321 p = regwhite( pRExC_state, p );
7323 /* Prime the casefolded buffer. */
7324 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
7326 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
7331 /* Emit all the Unicode characters. */
7333 for (foldbuf = tmpbuf;
7335 foldlen -= numlen) {
7336 ender = utf8_to_uvchr(foldbuf, &numlen);
7338 const STRLEN unilen = reguni(pRExC_state, ender, s);
7341 /* In EBCDIC the numlen
7342 * and unilen can differ. */
7344 if (numlen >= foldlen)
7348 break; /* "Can't happen." */
7352 const STRLEN unilen = reguni(pRExC_state, ender, s);
7361 REGC((char)ender, s++);
7367 /* Emit all the Unicode characters. */
7369 for (foldbuf = tmpbuf;
7371 foldlen -= numlen) {
7372 ender = utf8_to_uvchr(foldbuf, &numlen);
7374 const STRLEN unilen = reguni(pRExC_state, ender, s);
7377 /* In EBCDIC the numlen
7378 * and unilen can differ. */
7380 if (numlen >= foldlen)
7388 const STRLEN unilen = reguni(pRExC_state, ender, s);
7397 REGC((char)ender, s++);
7401 Set_Node_Cur_Length(ret); /* MJD */
7402 nextchar(pRExC_state);
7404 /* len is STRLEN which is unsigned, need to copy to signed */
7407 vFAIL("Internal disaster");
7411 if (len == 1 && UNI_IS_INVARIANT(ender))
7415 RExC_size += STR_SZ(len);
7418 RExC_emit += STR_SZ(len);
7428 S_regwhite( RExC_state_t *pRExC_state, char *p )
7430 const char *e = RExC_end;
7434 else if (*p == '#') {
7443 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
7451 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
7452 Character classes ([:foo:]) can also be negated ([:^foo:]).
7453 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
7454 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
7455 but trigger failures because they are currently unimplemented. */
7457 #define POSIXCC_DONE(c) ((c) == ':')
7458 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
7459 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
7462 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
7465 I32 namedclass = OOB_NAMEDCLASS;
7467 if (value == '[' && RExC_parse + 1 < RExC_end &&
7468 /* I smell either [: or [= or [. -- POSIX has been here, right? */
7469 POSIXCC(UCHARAT(RExC_parse))) {
7470 const char c = UCHARAT(RExC_parse);
7471 char* const s = RExC_parse++;
7473 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
7475 if (RExC_parse == RExC_end)
7476 /* Grandfather lone [:, [=, [. */
7479 const char* const t = RExC_parse++; /* skip over the c */
7482 if (UCHARAT(RExC_parse) == ']') {
7483 const char *posixcc = s + 1;
7484 RExC_parse++; /* skip over the ending ] */
7487 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
7488 const I32 skip = t - posixcc;
7490 /* Initially switch on the length of the name. */
7493 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
7494 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
7497 /* Names all of length 5. */
7498 /* alnum alpha ascii blank cntrl digit graph lower
7499 print punct space upper */
7500 /* Offset 4 gives the best switch position. */
7501 switch (posixcc[4]) {
7503 if (memEQ(posixcc, "alph", 4)) /* alpha */
7504 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
7507 if (memEQ(posixcc, "spac", 4)) /* space */
7508 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
7511 if (memEQ(posixcc, "grap", 4)) /* graph */
7512 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
7515 if (memEQ(posixcc, "asci", 4)) /* ascii */
7516 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
7519 if (memEQ(posixcc, "blan", 4)) /* blank */
7520 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
7523 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
7524 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
7527 if (memEQ(posixcc, "alnu", 4)) /* alnum */
7528 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
7531 if (memEQ(posixcc, "lowe", 4)) /* lower */
7532 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
7533 else if (memEQ(posixcc, "uppe", 4)) /* upper */
7534 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
7537 if (memEQ(posixcc, "digi", 4)) /* digit */
7538 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
7539 else if (memEQ(posixcc, "prin", 4)) /* print */
7540 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
7541 else if (memEQ(posixcc, "punc", 4)) /* punct */
7542 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
7547 if (memEQ(posixcc, "xdigit", 6))
7548 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
7552 if (namedclass == OOB_NAMEDCLASS)
7553 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
7555 assert (posixcc[skip] == ':');
7556 assert (posixcc[skip+1] == ']');
7557 } else if (!SIZE_ONLY) {
7558 /* [[=foo=]] and [[.foo.]] are still future. */
7560 /* adjust RExC_parse so the warning shows after
7562 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
7564 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7567 /* Maternal grandfather:
7568 * "[:" ending in ":" but not in ":]" */
7578 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
7581 if (POSIXCC(UCHARAT(RExC_parse))) {
7582 const char *s = RExC_parse;
7583 const char c = *s++;
7587 if (*s && c == *s && s[1] == ']') {
7588 if (ckWARN(WARN_REGEXP))
7590 "POSIX syntax [%c %c] belongs inside character classes",
7593 /* [[=foo=]] and [[.foo.]] are still future. */
7594 if (POSIXCC_NOTYET(c)) {
7595 /* adjust RExC_parse so the error shows after
7597 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
7599 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7606 #define _C_C_T_(NAME,TEST,WORD) \
7609 ANYOF_CLASS_SET(ret, ANYOF_##NAME); \
7611 for (value = 0; value < 256; value++) \
7613 ANYOF_BITMAP_SET(ret, value); \
7618 case ANYOF_N##NAME: \
7620 ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \
7622 for (value = 0; value < 256; value++) \
7624 ANYOF_BITMAP_SET(ret, value); \
7630 #define _C_C_T_NOLOC_(NAME,TEST,WORD) \
7632 for (value = 0; value < 256; value++) \
7634 ANYOF_BITMAP_SET(ret, value); \
7638 case ANYOF_N##NAME: \
7639 for (value = 0; value < 256; value++) \
7641 ANYOF_BITMAP_SET(ret, value); \
7647 parse a class specification and produce either an ANYOF node that
7648 matches the pattern or if the pattern matches a single char only and
7649 that char is < 256 and we are case insensitive then we produce an
7654 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
7657 register UV nextvalue;
7658 register IV prevvalue = OOB_UNICODE;
7659 register IV range = 0;
7660 UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
7661 register regnode *ret;
7664 char *rangebegin = NULL;
7665 bool need_class = 0;
7668 bool optimize_invert = TRUE;
7669 AV* unicode_alternate = NULL;
7671 UV literal_endpoint = 0;
7673 UV stored = 0; /* number of chars stored in the class */
7675 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7676 case we need to change the emitted regop to an EXACT. */
7677 const char * orig_parse = RExC_parse;
7678 GET_RE_DEBUG_FLAGS_DECL;
7680 PERL_UNUSED_ARG(depth);
7683 DEBUG_PARSE("clas");
7685 /* Assume we are going to generate an ANYOF node. */
7686 ret = reganode(pRExC_state, ANYOF, 0);
7689 ANYOF_FLAGS(ret) = 0;
7691 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
7695 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
7699 RExC_size += ANYOF_SKIP;
7700 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
7703 RExC_emit += ANYOF_SKIP;
7705 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
7707 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
7708 ANYOF_BITMAP_ZERO(ret);
7709 listsv = newSVpvs("# comment\n");
7712 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
7714 if (!SIZE_ONLY && POSIXCC(nextvalue))
7715 checkposixcc(pRExC_state);
7717 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
7718 if (UCHARAT(RExC_parse) == ']')
7722 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
7726 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
7729 rangebegin = RExC_parse;
7731 value = utf8n_to_uvchr((U8*)RExC_parse,
7732 RExC_end - RExC_parse,
7733 &numlen, UTF8_ALLOW_DEFAULT);
7734 RExC_parse += numlen;
7737 value = UCHARAT(RExC_parse++);
7739 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
7740 if (value == '[' && POSIXCC(nextvalue))
7741 namedclass = regpposixcc(pRExC_state, value);
7742 else if (value == '\\') {
7744 value = utf8n_to_uvchr((U8*)RExC_parse,
7745 RExC_end - RExC_parse,
7746 &numlen, UTF8_ALLOW_DEFAULT);
7747 RExC_parse += numlen;
7750 value = UCHARAT(RExC_parse++);
7751 /* Some compilers cannot handle switching on 64-bit integer
7752 * values, therefore value cannot be an UV. Yes, this will
7753 * be a problem later if we want switch on Unicode.
7754 * A similar issue a little bit later when switching on
7755 * namedclass. --jhi */
7756 switch ((I32)value) {
7757 case 'w': namedclass = ANYOF_ALNUM; break;
7758 case 'W': namedclass = ANYOF_NALNUM; break;
7759 case 's': namedclass = ANYOF_SPACE; break;
7760 case 'S': namedclass = ANYOF_NSPACE; break;
7761 case 'd': namedclass = ANYOF_DIGIT; break;
7762 case 'D': namedclass = ANYOF_NDIGIT; break;
7763 case 'v': namedclass = ANYOF_VERTWS; break;
7764 case 'V': namedclass = ANYOF_NVERTWS; break;
7765 case 'h': namedclass = ANYOF_HORIZWS; break;
7766 case 'H': namedclass = ANYOF_NHORIZWS; break;
7767 case 'N': /* Handle \N{NAME} in class */
7769 /* We only pay attention to the first char of
7770 multichar strings being returned. I kinda wonder
7771 if this makes sense as it does change the behaviour
7772 from earlier versions, OTOH that behaviour was broken
7774 UV v; /* value is register so we cant & it /grrr */
7775 if (reg_namedseq(pRExC_state, &v)) {
7785 if (RExC_parse >= RExC_end)
7786 vFAIL2("Empty \\%c{}", (U8)value);
7787 if (*RExC_parse == '{') {
7788 const U8 c = (U8)value;
7789 e = strchr(RExC_parse++, '}');
7791 vFAIL2("Missing right brace on \\%c{}", c);
7792 while (isSPACE(UCHARAT(RExC_parse)))
7794 if (e == RExC_parse)
7795 vFAIL2("Empty \\%c{}", c);
7797 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
7805 if (UCHARAT(RExC_parse) == '^') {
7808 value = value == 'p' ? 'P' : 'p'; /* toggle */
7809 while (isSPACE(UCHARAT(RExC_parse))) {
7814 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
7815 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
7818 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
7819 namedclass = ANYOF_MAX; /* no official name, but it's named */
7822 case 'n': value = '\n'; break;
7823 case 'r': value = '\r'; break;
7824 case 't': value = '\t'; break;
7825 case 'f': value = '\f'; break;
7826 case 'b': value = '\b'; break;
7827 case 'e': value = ASCII_TO_NATIVE('\033');break;
7828 case 'a': value = ASCII_TO_NATIVE('\007');break;
7830 if (*RExC_parse == '{') {
7831 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7832 | PERL_SCAN_DISALLOW_PREFIX;
7833 char * const e = strchr(RExC_parse++, '}');
7835 vFAIL("Missing right brace on \\x{}");
7837 numlen = e - RExC_parse;
7838 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
7842 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
7844 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
7845 RExC_parse += numlen;
7847 if (PL_encoding && value < 0x100)
7848 goto recode_encoding;
7851 value = UCHARAT(RExC_parse++);
7852 value = toCTRL(value);
7854 case '0': case '1': case '2': case '3': case '4':
7855 case '5': case '6': case '7': case '8': case '9':
7859 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
7860 RExC_parse += numlen;
7861 if (PL_encoding && value < 0x100)
7862 goto recode_encoding;
7867 SV* enc = PL_encoding;
7868 value = reg_recode((const char)(U8)value, &enc);
7869 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
7871 "Invalid escape in the specified encoding");
7875 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
7877 "Unrecognized escape \\%c in character class passed through",
7881 } /* end of \blah */
7887 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
7889 if (!SIZE_ONLY && !need_class)
7890 ANYOF_CLASS_ZERO(ret);
7894 /* a bad range like a-\d, a-[:digit:] ? */
7897 if (ckWARN(WARN_REGEXP)) {
7899 RExC_parse >= rangebegin ?
7900 RExC_parse - rangebegin : 0;
7902 "False [] range \"%*.*s\"",
7905 if (prevvalue < 256) {
7906 ANYOF_BITMAP_SET(ret, prevvalue);
7907 ANYOF_BITMAP_SET(ret, '-');
7910 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
7911 Perl_sv_catpvf(aTHX_ listsv,
7912 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
7916 range = 0; /* this was not a true range */
7922 const char *what = NULL;
7925 if (namedclass > OOB_NAMEDCLASS)
7926 optimize_invert = FALSE;
7927 /* Possible truncation here but in some 64-bit environments
7928 * the compiler gets heartburn about switch on 64-bit values.
7929 * A similar issue a little earlier when switching on value.
7931 switch ((I32)namedclass) {
7932 case _C_C_T_(ALNUM, isALNUM(value), "Word");
7933 case _C_C_T_(ALNUMC, isALNUMC(value), "Alnum");
7934 case _C_C_T_(ALPHA, isALPHA(value), "Alpha");
7935 case _C_C_T_(BLANK, isBLANK(value), "Blank");
7936 case _C_C_T_(CNTRL, isCNTRL(value), "Cntrl");
7937 case _C_C_T_(GRAPH, isGRAPH(value), "Graph");
7938 case _C_C_T_(LOWER, isLOWER(value), "Lower");
7939 case _C_C_T_(PRINT, isPRINT(value), "Print");
7940 case _C_C_T_(PSXSPC, isPSXSPC(value), "Space");
7941 case _C_C_T_(PUNCT, isPUNCT(value), "Punct");
7942 case _C_C_T_(SPACE, isSPACE(value), "SpacePerl");
7943 case _C_C_T_(UPPER, isUPPER(value), "Upper");
7944 case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit");
7945 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
7946 case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
7949 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
7952 for (value = 0; value < 128; value++)
7953 ANYOF_BITMAP_SET(ret, value);
7955 for (value = 0; value < 256; value++) {
7957 ANYOF_BITMAP_SET(ret, value);
7966 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
7969 for (value = 128; value < 256; value++)
7970 ANYOF_BITMAP_SET(ret, value);
7972 for (value = 0; value < 256; value++) {
7973 if (!isASCII(value))
7974 ANYOF_BITMAP_SET(ret, value);
7983 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
7985 /* consecutive digits assumed */
7986 for (value = '0'; value <= '9'; value++)
7987 ANYOF_BITMAP_SET(ret, value);
7994 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
7996 /* consecutive digits assumed */
7997 for (value = 0; value < '0'; value++)
7998 ANYOF_BITMAP_SET(ret, value);
7999 for (value = '9' + 1; value < 256; value++)
8000 ANYOF_BITMAP_SET(ret, value);
8006 /* this is to handle \p and \P */
8009 vFAIL("Invalid [::] class");
8013 /* Strings such as "+utf8::isWord\n" */
8014 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
8017 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
8020 } /* end of namedclass \blah */
8023 if (prevvalue > (IV)value) /* b-a */ {
8024 const int w = RExC_parse - rangebegin;
8025 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
8026 range = 0; /* not a valid range */
8030 prevvalue = value; /* save the beginning of the range */
8031 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
8032 RExC_parse[1] != ']') {
8035 /* a bad range like \w-, [:word:]- ? */
8036 if (namedclass > OOB_NAMEDCLASS) {
8037 if (ckWARN(WARN_REGEXP)) {
8039 RExC_parse >= rangebegin ?
8040 RExC_parse - rangebegin : 0;
8042 "False [] range \"%*.*s\"",
8046 ANYOF_BITMAP_SET(ret, '-');
8048 range = 1; /* yeah, it's a range! */
8049 continue; /* but do it the next time */
8053 /* now is the next time */
8054 /*stored += (value - prevvalue + 1);*/
8056 if (prevvalue < 256) {
8057 const IV ceilvalue = value < 256 ? value : 255;
8060 /* In EBCDIC [\x89-\x91] should include
8061 * the \x8e but [i-j] should not. */
8062 if (literal_endpoint == 2 &&
8063 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
8064 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
8066 if (isLOWER(prevvalue)) {
8067 for (i = prevvalue; i <= ceilvalue; i++)
8068 if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8070 ANYOF_BITMAP_SET(ret, i);
8073 for (i = prevvalue; i <= ceilvalue; i++)
8074 if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8076 ANYOF_BITMAP_SET(ret, i);
8082 for (i = prevvalue; i <= ceilvalue; i++) {
8083 if (!ANYOF_BITMAP_TEST(ret,i)) {
8085 ANYOF_BITMAP_SET(ret, i);
8089 if (value > 255 || UTF) {
8090 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
8091 const UV natvalue = NATIVE_TO_UNI(value);
8092 stored+=2; /* can't optimize this class */
8093 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8094 if (prevnatvalue < natvalue) { /* what about > ? */
8095 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
8096 prevnatvalue, natvalue);
8098 else if (prevnatvalue == natvalue) {
8099 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
8101 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
8103 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
8105 #ifdef EBCDIC /* RD t/uni/fold ff and 6b */
8106 if (RExC_precomp[0] == ':' &&
8107 RExC_precomp[1] == '[' &&
8108 (f == 0xDF || f == 0x92)) {
8109 f = NATIVE_TO_UNI(f);
8112 /* If folding and foldable and a single
8113 * character, insert also the folded version
8114 * to the charclass. */
8116 #ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
8117 if ((RExC_precomp[0] == ':' &&
8118 RExC_precomp[1] == '[' &&
8120 (value == 0xFB05 || value == 0xFB06))) ?
8121 foldlen == ((STRLEN)UNISKIP(f) - 1) :
8122 foldlen == (STRLEN)UNISKIP(f) )
8124 if (foldlen == (STRLEN)UNISKIP(f))
8126 Perl_sv_catpvf(aTHX_ listsv,
8129 /* Any multicharacter foldings
8130 * require the following transform:
8131 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
8132 * where E folds into "pq" and F folds
8133 * into "rst", all other characters
8134 * fold to single characters. We save
8135 * away these multicharacter foldings,
8136 * to be later saved as part of the
8137 * additional "s" data. */
8140 if (!unicode_alternate)
8141 unicode_alternate = newAV();
8142 sv = newSVpvn((char*)foldbuf, foldlen);
8144 av_push(unicode_alternate, sv);
8148 /* If folding and the value is one of the Greek
8149 * sigmas insert a few more sigmas to make the
8150 * folding rules of the sigmas to work right.
8151 * Note that not all the possible combinations
8152 * are handled here: some of them are handled
8153 * by the standard folding rules, and some of
8154 * them (literal or EXACTF cases) are handled
8155 * during runtime in regexec.c:S_find_byclass(). */
8156 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
8157 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8158 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
8159 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8160 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8162 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
8163 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8164 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8169 literal_endpoint = 0;
8173 range = 0; /* this range (if it was one) is done now */
8177 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
8179 RExC_size += ANYOF_CLASS_ADD_SKIP;
8181 RExC_emit += ANYOF_CLASS_ADD_SKIP;
8187 /****** !SIZE_ONLY AFTER HERE *********/
8189 if( stored == 1 && (value < 128 || (value < 256 && !UTF))
8190 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
8192 /* optimize single char class to an EXACT node
8193 but *only* when its not a UTF/high char */
8194 const char * cur_parse= RExC_parse;
8195 RExC_emit = (regnode *)orig_emit;
8196 RExC_parse = (char *)orig_parse;
8197 ret = reg_node(pRExC_state,
8198 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
8199 RExC_parse = (char *)cur_parse;
8200 *STRING(ret)= (char)value;
8202 RExC_emit += STR_SZ(1);
8205 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
8206 if ( /* If the only flag is folding (plus possibly inversion). */
8207 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
8209 for (value = 0; value < 256; ++value) {
8210 if (ANYOF_BITMAP_TEST(ret, value)) {
8211 UV fold = PL_fold[value];
8214 ANYOF_BITMAP_SET(ret, fold);
8217 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
8220 /* optimize inverted simple patterns (e.g. [^a-z]) */
8221 if (optimize_invert &&
8222 /* If the only flag is inversion. */
8223 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
8224 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
8225 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
8226 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
8229 AV * const av = newAV();
8231 /* The 0th element stores the character class description
8232 * in its textual form: used later (regexec.c:Perl_regclass_swash())
8233 * to initialize the appropriate swash (which gets stored in
8234 * the 1st element), and also useful for dumping the regnode.
8235 * The 2nd element stores the multicharacter foldings,
8236 * used later (regexec.c:S_reginclass()). */
8237 av_store(av, 0, listsv);
8238 av_store(av, 1, NULL);
8239 av_store(av, 2, (SV*)unicode_alternate);
8240 rv = newRV_noinc((SV*)av);
8241 n = add_data(pRExC_state, 1, "s");
8242 RExC_rxi->data->data[n] = (void*)rv;
8250 /* reg_skipcomment()
8252 Absorbs an /x style # comments from the input stream.
8253 Returns true if there is more text remaining in the stream.
8254 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
8255 terminates the pattern without including a newline.
8257 Note its the callers responsibility to ensure that we are
8263 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
8266 while (RExC_parse < RExC_end)
8267 if (*RExC_parse++ == '\n') {
8272 /* we ran off the end of the pattern without ending
8273 the comment, so we have to add an \n when wrapping */
8274 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
8282 Advance that parse position, and optionall absorbs
8283 "whitespace" from the inputstream.
8285 Without /x "whitespace" means (?#...) style comments only,
8286 with /x this means (?#...) and # comments and whitespace proper.
8288 Returns the RExC_parse point from BEFORE the scan occurs.
8290 This is the /x friendly way of saying RExC_parse++.
8294 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
8296 char* const retval = RExC_parse++;
8299 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
8300 RExC_parse[2] == '#') {
8301 while (*RExC_parse != ')') {
8302 if (RExC_parse == RExC_end)
8303 FAIL("Sequence (?#... not terminated");
8309 if (RExC_flags & RXf_PMf_EXTENDED) {
8310 if (isSPACE(*RExC_parse)) {
8314 else if (*RExC_parse == '#') {
8315 if ( reg_skipcomment( pRExC_state ) )
8324 - reg_node - emit a node
8326 STATIC regnode * /* Location. */
8327 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
8330 register regnode *ptr;
8331 regnode * const ret = RExC_emit;
8332 GET_RE_DEBUG_FLAGS_DECL;
8335 SIZE_ALIGN(RExC_size);
8339 if (RExC_emit >= RExC_emit_bound)
8340 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8342 NODE_ALIGN_FILL(ret);
8344 FILL_ADVANCE_NODE(ptr, op);
8345 #ifdef RE_TRACK_PATTERN_OFFSETS
8346 if (RExC_offsets) { /* MJD */
8347 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
8348 "reg_node", __LINE__,
8350 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
8351 ? "Overwriting end of array!\n" : "OK",
8352 (UV)(RExC_emit - RExC_emit_start),
8353 (UV)(RExC_parse - RExC_start),
8354 (UV)RExC_offsets[0]));
8355 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
8363 - reganode - emit a node with an argument
8365 STATIC regnode * /* Location. */
8366 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
8369 register regnode *ptr;
8370 regnode * const ret = RExC_emit;
8371 GET_RE_DEBUG_FLAGS_DECL;
8374 SIZE_ALIGN(RExC_size);
8379 assert(2==regarglen[op]+1);
8381 Anything larger than this has to allocate the extra amount.
8382 If we changed this to be:
8384 RExC_size += (1 + regarglen[op]);
8386 then it wouldn't matter. Its not clear what side effect
8387 might come from that so its not done so far.
8392 if (RExC_emit >= RExC_emit_bound)
8393 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8395 NODE_ALIGN_FILL(ret);
8397 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
8398 #ifdef RE_TRACK_PATTERN_OFFSETS
8399 if (RExC_offsets) { /* MJD */
8400 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
8404 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
8405 "Overwriting end of array!\n" : "OK",
8406 (UV)(RExC_emit - RExC_emit_start),
8407 (UV)(RExC_parse - RExC_start),
8408 (UV)RExC_offsets[0]));
8409 Set_Cur_Node_Offset;
8417 - reguni - emit (if appropriate) a Unicode character
8420 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
8423 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
8427 - reginsert - insert an operator in front of already-emitted operand
8429 * Means relocating the operand.
8432 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
8435 register regnode *src;
8436 register regnode *dst;
8437 register regnode *place;
8438 const int offset = regarglen[(U8)op];
8439 const int size = NODE_STEP_REGNODE + offset;
8440 GET_RE_DEBUG_FLAGS_DECL;
8441 PERL_UNUSED_ARG(depth);
8442 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
8443 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
8452 if (RExC_open_parens) {
8454 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
8455 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
8456 if ( RExC_open_parens[paren] >= opnd ) {
8457 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
8458 RExC_open_parens[paren] += size;
8460 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
8462 if ( RExC_close_parens[paren] >= opnd ) {
8463 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
8464 RExC_close_parens[paren] += size;
8466 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
8471 while (src > opnd) {
8472 StructCopy(--src, --dst, regnode);
8473 #ifdef RE_TRACK_PATTERN_OFFSETS
8474 if (RExC_offsets) { /* MJD 20010112 */
8475 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
8479 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
8480 ? "Overwriting end of array!\n" : "OK",
8481 (UV)(src - RExC_emit_start),
8482 (UV)(dst - RExC_emit_start),
8483 (UV)RExC_offsets[0]));
8484 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
8485 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
8491 place = opnd; /* Op node, where operand used to be. */
8492 #ifdef RE_TRACK_PATTERN_OFFSETS
8493 if (RExC_offsets) { /* MJD */
8494 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
8498 (UV)(place - RExC_emit_start) > RExC_offsets[0]
8499 ? "Overwriting end of array!\n" : "OK",
8500 (UV)(place - RExC_emit_start),
8501 (UV)(RExC_parse - RExC_start),
8502 (UV)RExC_offsets[0]));
8503 Set_Node_Offset(place, RExC_parse);
8504 Set_Node_Length(place, 1);
8507 src = NEXTOPER(place);
8508 FILL_ADVANCE_NODE(place, op);
8509 Zero(src, offset, regnode);
8513 - regtail - set the next-pointer at the end of a node chain of p to val.
8514 - SEE ALSO: regtail_study
8516 /* TODO: All three parms should be const */
8518 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
8521 register regnode *scan;
8522 GET_RE_DEBUG_FLAGS_DECL;
8524 PERL_UNUSED_ARG(depth);
8530 /* Find last node. */
8533 regnode * const temp = regnext(scan);
8535 SV * const mysv=sv_newmortal();
8536 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
8537 regprop(RExC_rx, mysv, scan);
8538 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
8539 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
8540 (temp == NULL ? "->" : ""),
8541 (temp == NULL ? PL_reg_name[OP(val)] : "")
8549 if (reg_off_by_arg[OP(scan)]) {
8550 ARG_SET(scan, val - scan);
8553 NEXT_OFF(scan) = val - scan;
8559 - regtail_study - set the next-pointer at the end of a node chain of p to val.
8560 - Look for optimizable sequences at the same time.
8561 - currently only looks for EXACT chains.
8563 This is expermental code. The idea is to use this routine to perform
8564 in place optimizations on branches and groups as they are constructed,
8565 with the long term intention of removing optimization from study_chunk so
8566 that it is purely analytical.
8568 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
8569 to control which is which.
8572 /* TODO: All four parms should be const */
8575 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
8578 register regnode *scan;
8580 #ifdef EXPERIMENTAL_INPLACESCAN
8584 GET_RE_DEBUG_FLAGS_DECL;
8590 /* Find last node. */
8594 regnode * const temp = regnext(scan);
8595 #ifdef EXPERIMENTAL_INPLACESCAN
8596 if (PL_regkind[OP(scan)] == EXACT)
8597 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
8605 if( exact == PSEUDO )
8607 else if ( exact != OP(scan) )
8616 SV * const mysv=sv_newmortal();
8617 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
8618 regprop(RExC_rx, mysv, scan);
8619 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
8620 SvPV_nolen_const(mysv),
8622 PL_reg_name[exact]);
8629 SV * const mysv_val=sv_newmortal();
8630 DEBUG_PARSE_MSG("");
8631 regprop(RExC_rx, mysv_val, val);
8632 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
8633 SvPV_nolen_const(mysv_val),
8634 (IV)REG_NODE_NUM(val),
8638 if (reg_off_by_arg[OP(scan)]) {
8639 ARG_SET(scan, val - scan);
8642 NEXT_OFF(scan) = val - scan;
8650 - regcurly - a little FSA that accepts {\d+,?\d*}
8653 S_regcurly(register const char *s)
8672 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
8676 S_regdump_extflags(pTHX_ const char *lead, const U32 flags) {
8679 for (bit=0; bit<32; bit++) {
8680 if (flags & (1<<bit)) {
8682 PerlIO_printf(Perl_debug_log, "%s",lead);
8683 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
8688 PerlIO_printf(Perl_debug_log, "\n");
8690 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
8696 Perl_regdump(pTHX_ const regexp *r)
8700 SV * const sv = sv_newmortal();
8701 SV *dsv= sv_newmortal();
8703 GET_RE_DEBUG_FLAGS_DECL;
8705 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
8707 /* Header fields of interest. */
8708 if (r->anchored_substr) {
8709 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
8710 RE_SV_DUMPLEN(r->anchored_substr), 30);
8711 PerlIO_printf(Perl_debug_log,
8712 "anchored %s%s at %"IVdf" ",
8713 s, RE_SV_TAIL(r->anchored_substr),
8714 (IV)r->anchored_offset);
8715 } else if (r->anchored_utf8) {
8716 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
8717 RE_SV_DUMPLEN(r->anchored_utf8), 30);
8718 PerlIO_printf(Perl_debug_log,
8719 "anchored utf8 %s%s at %"IVdf" ",
8720 s, RE_SV_TAIL(r->anchored_utf8),
8721 (IV)r->anchored_offset);
8723 if (r->float_substr) {
8724 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
8725 RE_SV_DUMPLEN(r->float_substr), 30);
8726 PerlIO_printf(Perl_debug_log,
8727 "floating %s%s at %"IVdf"..%"UVuf" ",
8728 s, RE_SV_TAIL(r->float_substr),
8729 (IV)r->float_min_offset, (UV)r->float_max_offset);
8730 } else if (r->float_utf8) {
8731 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
8732 RE_SV_DUMPLEN(r->float_utf8), 30);
8733 PerlIO_printf(Perl_debug_log,
8734 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
8735 s, RE_SV_TAIL(r->float_utf8),
8736 (IV)r->float_min_offset, (UV)r->float_max_offset);
8738 if (r->check_substr || r->check_utf8)
8739 PerlIO_printf(Perl_debug_log,
8741 (r->check_substr == r->float_substr
8742 && r->check_utf8 == r->float_utf8
8743 ? "(checking floating" : "(checking anchored"));
8744 if (r->extflags & RXf_NOSCAN)
8745 PerlIO_printf(Perl_debug_log, " noscan");
8746 if (r->extflags & RXf_CHECK_ALL)
8747 PerlIO_printf(Perl_debug_log, " isall");
8748 if (r->check_substr || r->check_utf8)
8749 PerlIO_printf(Perl_debug_log, ") ");
8751 if (ri->regstclass) {
8752 regprop(r, sv, ri->regstclass);
8753 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
8755 if (r->extflags & RXf_ANCH) {
8756 PerlIO_printf(Perl_debug_log, "anchored");
8757 if (r->extflags & RXf_ANCH_BOL)
8758 PerlIO_printf(Perl_debug_log, "(BOL)");
8759 if (r->extflags & RXf_ANCH_MBOL)
8760 PerlIO_printf(Perl_debug_log, "(MBOL)");
8761 if (r->extflags & RXf_ANCH_SBOL)
8762 PerlIO_printf(Perl_debug_log, "(SBOL)");
8763 if (r->extflags & RXf_ANCH_GPOS)
8764 PerlIO_printf(Perl_debug_log, "(GPOS)");
8765 PerlIO_putc(Perl_debug_log, ' ');
8767 if (r->extflags & RXf_GPOS_SEEN)
8768 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
8769 if (r->intflags & PREGf_SKIP)
8770 PerlIO_printf(Perl_debug_log, "plus ");
8771 if (r->intflags & PREGf_IMPLICIT)
8772 PerlIO_printf(Perl_debug_log, "implicit ");
8773 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
8774 if (r->extflags & RXf_EVAL_SEEN)
8775 PerlIO_printf(Perl_debug_log, "with eval ");
8776 PerlIO_printf(Perl_debug_log, "\n");
8777 DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
8779 PERL_UNUSED_CONTEXT;
8781 #endif /* DEBUGGING */
8785 - regprop - printable representation of opcode
8788 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
8793 RXi_GET_DECL(prog,progi);
8794 GET_RE_DEBUG_FLAGS_DECL;
8797 sv_setpvn(sv, "", 0);
8799 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
8800 /* It would be nice to FAIL() here, but this may be called from
8801 regexec.c, and it would be hard to supply pRExC_state. */
8802 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
8803 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
8805 k = PL_regkind[OP(o)];
8809 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
8810 * is a crude hack but it may be the best for now since
8811 * we have no flag "this EXACTish node was UTF-8"
8813 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
8814 PERL_PV_ESCAPE_UNI_DETECT |
8815 PERL_PV_PRETTY_ELLIPSES |
8816 PERL_PV_PRETTY_LTGT |
8817 PERL_PV_PRETTY_NOCLEAR
8819 } else if (k == TRIE) {
8820 /* print the details of the trie in dumpuntil instead, as
8821 * progi->data isn't available here */
8822 const char op = OP(o);
8823 const U32 n = ARG(o);
8824 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
8825 (reg_ac_data *)progi->data->data[n] :
8827 const reg_trie_data * const trie
8828 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
8830 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
8831 DEBUG_TRIE_COMPILE_r(
8832 Perl_sv_catpvf(aTHX_ sv,
8833 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
8834 (UV)trie->startstate,
8835 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
8836 (UV)trie->wordcount,
8839 (UV)TRIE_CHARCOUNT(trie),
8840 (UV)trie->uniquecharcount
8843 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
8845 int rangestart = -1;
8846 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
8848 for (i = 0; i <= 256; i++) {
8849 if (i < 256 && BITMAP_TEST(bitmap,i)) {
8850 if (rangestart == -1)
8852 } else if (rangestart != -1) {
8853 if (i <= rangestart + 3)
8854 for (; rangestart < i; rangestart++)
8855 put_byte(sv, rangestart);
8857 put_byte(sv, rangestart);
8859 put_byte(sv, i - 1);
8867 } else if (k == CURLY) {
8868 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
8869 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
8870 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
8872 else if (k == WHILEM && o->flags) /* Ordinal/of */
8873 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
8874 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
8875 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
8876 if ( prog->paren_names ) {
8877 if ( k != REF || OP(o) < NREF) {
8878 AV *list= (AV *)progi->data->data[progi->name_list_idx];
8879 SV **name= av_fetch(list, ARG(o), 0 );
8881 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
8884 AV *list= (AV *)progi->data->data[ progi->name_list_idx ];
8885 SV *sv_dat=(SV*)progi->data->data[ ARG( o ) ];
8886 I32 *nums=(I32*)SvPVX(sv_dat);
8887 SV **name= av_fetch(list, nums[0], 0 );
8890 for ( n=0; n<SvIVX(sv_dat); n++ ) {
8891 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
8892 (n ? "," : ""), (IV)nums[n]);
8894 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
8898 } else if (k == GOSUB)
8899 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
8900 else if (k == VERB) {
8902 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
8903 SVfARG((SV*)progi->data->data[ ARG( o ) ]));
8904 } else if (k == LOGICAL)
8905 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
8906 else if (k == FOLDCHAR)
8907 Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
8908 else if (k == ANYOF) {
8909 int i, rangestart = -1;
8910 const U8 flags = ANYOF_FLAGS(o);
8912 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
8913 static const char * const anyofs[] = {
8946 if (flags & ANYOF_LOCALE)
8947 sv_catpvs(sv, "{loc}");
8948 if (flags & ANYOF_FOLD)
8949 sv_catpvs(sv, "{i}");
8950 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
8951 if (flags & ANYOF_INVERT)
8953 for (i = 0; i <= 256; i++) {
8954 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
8955 if (rangestart == -1)
8957 } else if (rangestart != -1) {
8958 if (i <= rangestart + 3)
8959 for (; rangestart < i; rangestart++)
8960 put_byte(sv, rangestart);
8962 put_byte(sv, rangestart);
8964 put_byte(sv, i - 1);
8970 if (o->flags & ANYOF_CLASS)
8971 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
8972 if (ANYOF_CLASS_TEST(o,i))
8973 sv_catpv(sv, anyofs[i]);
8975 if (flags & ANYOF_UNICODE)
8976 sv_catpvs(sv, "{unicode}");
8977 else if (flags & ANYOF_UNICODE_ALL)
8978 sv_catpvs(sv, "{unicode_all}");
8982 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
8986 U8 s[UTF8_MAXBYTES_CASE+1];
8988 for (i = 0; i <= 256; i++) { /* just the first 256 */
8989 uvchr_to_utf8(s, i);
8991 if (i < 256 && swash_fetch(sw, s, TRUE)) {
8992 if (rangestart == -1)
8994 } else if (rangestart != -1) {
8995 if (i <= rangestart + 3)
8996 for (; rangestart < i; rangestart++) {
8997 const U8 * const e = uvchr_to_utf8(s,rangestart);
8999 for(p = s; p < e; p++)
9003 const U8 *e = uvchr_to_utf8(s,rangestart);
9005 for (p = s; p < e; p++)
9008 e = uvchr_to_utf8(s, i-1);
9009 for (p = s; p < e; p++)
9016 sv_catpvs(sv, "..."); /* et cetera */
9020 char *s = savesvpv(lv);
9021 char * const origs = s;
9023 while (*s && *s != '\n')
9027 const char * const t = ++s;
9045 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
9047 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
9048 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
9050 PERL_UNUSED_CONTEXT;
9051 PERL_UNUSED_ARG(sv);
9053 PERL_UNUSED_ARG(prog);
9054 #endif /* DEBUGGING */
9058 Perl_re_intuit_string(pTHX_ REGEXP * const prog)
9059 { /* Assume that RE_INTUIT is set */
9061 GET_RE_DEBUG_FLAGS_DECL;
9062 PERL_UNUSED_CONTEXT;
9066 const char * const s = SvPV_nolen_const(prog->check_substr
9067 ? prog->check_substr : prog->check_utf8);
9069 if (!PL_colorset) reginitcolors();
9070 PerlIO_printf(Perl_debug_log,
9071 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
9073 prog->check_substr ? "" : "utf8 ",
9074 PL_colors[5],PL_colors[0],
9077 (strlen(s) > 60 ? "..." : ""));
9080 return prog->check_substr ? prog->check_substr : prog->check_utf8;
9086 handles refcounting and freeing the perl core regexp structure. When
9087 it is necessary to actually free the structure the first thing it
9088 does is call the 'free' method of the regexp_engine associated to to
9089 the regexp, allowing the handling of the void *pprivate; member
9090 first. (This routine is not overridable by extensions, which is why
9091 the extensions free is called first.)
9093 See regdupe and regdupe_internal if you change anything here.
9095 #ifndef PERL_IN_XSUB_RE
9097 Perl_pregfree(pTHX_ struct regexp *r)
9100 GET_RE_DEBUG_FLAGS_DECL;
9102 if (!r || (--r->refcnt > 0))
9105 ReREFCNT_dec(r->mother_re);
9107 CALLREGFREE_PVT(r); /* free the private data */
9109 SvREFCNT_dec(r->paren_names);
9110 Safefree(r->wrapped);
9113 if (r->anchored_substr)
9114 SvREFCNT_dec(r->anchored_substr);
9115 if (r->anchored_utf8)
9116 SvREFCNT_dec(r->anchored_utf8);
9117 if (r->float_substr)
9118 SvREFCNT_dec(r->float_substr);
9120 SvREFCNT_dec(r->float_utf8);
9121 Safefree(r->substrs);
9123 RX_MATCH_COPY_FREE(r);
9124 #ifdef PERL_OLD_COPY_ON_WRITE
9126 SvREFCNT_dec(r->saved_copy);
9135 This is a hacky workaround to the structural issue of match results
9136 being stored in the regexp structure which is in turn stored in
9137 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
9138 could be PL_curpm in multiple contexts, and could require multiple
9139 result sets being associated with the pattern simultaneously, such
9140 as when doing a recursive match with (??{$qr})
9142 The solution is to make a lightweight copy of the regexp structure
9143 when a qr// is returned from the code executed by (??{$qr}) this
9144 lightweight copy doesnt actually own any of its data except for
9145 the starp/end and the actual regexp structure itself.
9151 Perl_reg_temp_copy (pTHX_ struct regexp *r) {
9153 register const I32 npar = r->nparens+1;
9154 (void)ReREFCNT_inc(r);
9155 Newx(ret, 1, regexp);
9156 StructCopy(r, ret, regexp);
9157 Newx(ret->offs, npar, regexp_paren_pair);
9158 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
9161 Newx(ret->substrs, 1, struct reg_substr_data);
9162 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9164 SvREFCNT_inc_void(ret->anchored_substr);
9165 SvREFCNT_inc_void(ret->anchored_utf8);
9166 SvREFCNT_inc_void(ret->float_substr);
9167 SvREFCNT_inc_void(ret->float_utf8);
9169 /* check_substr and check_utf8, if non-NULL, point to either their
9170 anchored or float namesakes, and don't hold a second reference. */
9172 RX_MATCH_COPIED_off(ret);
9173 #ifdef PERL_OLD_COPY_ON_WRITE
9174 ret->saved_copy = NULL;
9183 /* regfree_internal()
9185 Free the private data in a regexp. This is overloadable by
9186 extensions. Perl takes care of the regexp structure in pregfree(),
9187 this covers the *pprivate pointer which technically perldoesnt
9188 know about, however of course we have to handle the
9189 regexp_internal structure when no extension is in use.
9191 Note this is called before freeing anything in the regexp
9196 Perl_regfree_internal(pTHX_ REGEXP * const r)
9200 GET_RE_DEBUG_FLAGS_DECL;
9206 SV *dsv= sv_newmortal();
9207 RE_PV_QUOTED_DECL(s, (r->extflags & RXf_UTF8),
9208 dsv, r->precomp, r->prelen, 60);
9209 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
9210 PL_colors[4],PL_colors[5],s);
9213 #ifdef RE_TRACK_PATTERN_OFFSETS
9215 Safefree(ri->u.offsets); /* 20010421 MJD */
9218 int n = ri->data->count;
9219 PAD* new_comppad = NULL;
9224 /* If you add a ->what type here, update the comment in regcomp.h */
9225 switch (ri->data->what[n]) {
9229 SvREFCNT_dec((SV*)ri->data->data[n]);
9232 Safefree(ri->data->data[n]);
9235 new_comppad = (AV*)ri->data->data[n];
9238 if (new_comppad == NULL)
9239 Perl_croak(aTHX_ "panic: pregfree comppad");
9240 PAD_SAVE_LOCAL(old_comppad,
9241 /* Watch out for global destruction's random ordering. */
9242 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
9245 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
9248 op_free((OP_4tree*)ri->data->data[n]);
9250 PAD_RESTORE_LOCAL(old_comppad);
9251 SvREFCNT_dec((SV*)new_comppad);
9257 { /* Aho Corasick add-on structure for a trie node.
9258 Used in stclass optimization only */
9260 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
9262 refcount = --aho->refcount;
9265 PerlMemShared_free(aho->states);
9266 PerlMemShared_free(aho->fail);
9267 /* do this last!!!! */
9268 PerlMemShared_free(ri->data->data[n]);
9269 PerlMemShared_free(ri->regstclass);
9275 /* trie structure. */
9277 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
9279 refcount = --trie->refcount;
9282 PerlMemShared_free(trie->charmap);
9283 PerlMemShared_free(trie->states);
9284 PerlMemShared_free(trie->trans);
9286 PerlMemShared_free(trie->bitmap);
9288 PerlMemShared_free(trie->wordlen);
9290 PerlMemShared_free(trie->jump);
9292 PerlMemShared_free(trie->nextword);
9293 /* do this last!!!! */
9294 PerlMemShared_free(ri->data->data[n]);
9299 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
9302 Safefree(ri->data->what);
9309 #define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t))
9310 #define av_dup_inc(s,t) (AV*)SvREFCNT_inc(sv_dup((SV*)s,t))
9311 #define hv_dup_inc(s,t) (HV*)SvREFCNT_inc(sv_dup((SV*)s,t))
9312 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
9315 re_dup - duplicate a regexp.
9317 This routine is expected to clone a given regexp structure. It is not
9318 compiler under USE_ITHREADS.
9320 After all of the core data stored in struct regexp is duplicated
9321 the regexp_engine.dupe method is used to copy any private data
9322 stored in the *pprivate pointer. This allows extensions to handle
9323 any duplication it needs to do.
9325 See pregfree() and regfree_internal() if you change anything here.
9327 #if defined(USE_ITHREADS)
9328 #ifndef PERL_IN_XSUB_RE
9330 Perl_re_dup(pTHX_ const regexp *r, CLONE_PARAMS *param)
9337 return (REGEXP *)NULL;
9339 if ((ret = (REGEXP *)ptr_table_fetch(PL_ptr_table, r)))
9343 npar = r->nparens+1;
9344 Newx(ret, 1, regexp);
9345 StructCopy(r, ret, regexp);
9346 Newx(ret->offs, npar, regexp_paren_pair);
9347 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
9349 /* no need to copy these */
9350 Newx(ret->swap, npar, regexp_paren_pair);
9354 /* Do it this way to avoid reading from *r after the StructCopy().
9355 That way, if any of the sv_dup_inc()s dislodge *r from the L1
9356 cache, it doesn't matter. */
9357 const bool anchored = r->check_substr == r->anchored_substr;
9358 Newx(ret->substrs, 1, struct reg_substr_data);
9359 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9361 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
9362 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
9363 ret->float_substr = sv_dup_inc(ret->float_substr, param);
9364 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
9366 /* check_substr and check_utf8, if non-NULL, point to either their
9367 anchored or float namesakes, and don't hold a second reference. */
9369 if (ret->check_substr) {
9371 assert(r->check_utf8 == r->anchored_utf8);
9372 ret->check_substr = ret->anchored_substr;
9373 ret->check_utf8 = ret->anchored_utf8;
9375 assert(r->check_substr == r->float_substr);
9376 assert(r->check_utf8 == r->float_utf8);
9377 ret->check_substr = ret->float_substr;
9378 ret->check_utf8 = ret->float_utf8;
9383 ret->wrapped = SAVEPVN(ret->wrapped, ret->wraplen+1);
9384 ret->precomp = ret->wrapped + (ret->precomp - ret->wrapped);
9385 ret->paren_names = hv_dup_inc(ret->paren_names, param);
9388 RXi_SET(ret,CALLREGDUPE_PVT(ret,param));
9390 if (RX_MATCH_COPIED(ret))
9391 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
9394 #ifdef PERL_OLD_COPY_ON_WRITE
9395 ret->saved_copy = NULL;
9398 ret->mother_re = NULL;
9400 ret->seen_evals = 0;
9402 ptr_table_store(PL_ptr_table, r, ret);
9405 #endif /* PERL_IN_XSUB_RE */
9410 This is the internal complement to regdupe() which is used to copy
9411 the structure pointed to by the *pprivate pointer in the regexp.
9412 This is the core version of the extension overridable cloning hook.
9413 The regexp structure being duplicated will be copied by perl prior
9414 to this and will be provided as the regexp *r argument, however
9415 with the /old/ structures pprivate pointer value. Thus this routine
9416 may override any copying normally done by perl.
9418 It returns a pointer to the new regexp_internal structure.
9422 Perl_regdupe_internal(pTHX_ REGEXP * const r, CLONE_PARAMS *param)
9425 regexp_internal *reti;
9429 npar = r->nparens+1;
9432 Newxc(reti, sizeof(regexp_internal) + (len+1)*sizeof(regnode), char, regexp_internal);
9433 Copy(ri->program, reti->program, len+1, regnode);
9436 reti->regstclass = NULL;
9440 const int count = ri->data->count;
9443 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
9444 char, struct reg_data);
9445 Newx(d->what, count, U8);
9448 for (i = 0; i < count; i++) {
9449 d->what[i] = ri->data->what[i];
9450 switch (d->what[i]) {
9451 /* legal options are one of: sSfpontTu
9452 see also regcomp.h and pregfree() */
9455 case 'p': /* actually an AV, but the dup function is identical. */
9456 case 'u': /* actually an HV, but the dup function is identical. */
9457 d->data[i] = sv_dup_inc((SV *)ri->data->data[i], param);
9460 /* This is cheating. */
9461 Newx(d->data[i], 1, struct regnode_charclass_class);
9462 StructCopy(ri->data->data[i], d->data[i],
9463 struct regnode_charclass_class);
9464 reti->regstclass = (regnode*)d->data[i];
9467 /* Compiled op trees are readonly and in shared memory,
9468 and can thus be shared without duplication. */
9470 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
9474 /* Trie stclasses are readonly and can thus be shared
9475 * without duplication. We free the stclass in pregfree
9476 * when the corresponding reg_ac_data struct is freed.
9478 reti->regstclass= ri->regstclass;
9482 ((reg_trie_data*)ri->data->data[i])->refcount++;
9486 d->data[i] = ri->data->data[i];
9489 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
9498 reti->name_list_idx = ri->name_list_idx;
9500 #ifdef RE_TRACK_PATTERN_OFFSETS
9501 if (ri->u.offsets) {
9502 Newx(reti->u.offsets, 2*len+1, U32);
9503 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
9506 SetProgLen(reti,len);
9512 #endif /* USE_ITHREADS */
9517 converts a regexp embedded in a MAGIC struct to its stringified form,
9518 caching the converted form in the struct and returns the cached
9521 If lp is nonnull then it is used to return the length of the
9524 If flags is nonnull and the returned string contains UTF8 then
9525 (*flags & 1) will be true.
9527 If haseval is nonnull then it is used to return whether the pattern
9530 Normally called via macro:
9532 CALLREG_STRINGIFY(mg,&len,&utf8);
9536 CALLREG_AS_STR(mg,&lp,&flags,&haseval)
9538 See sv_2pv_flags() in sv.c for an example of internal usage.
9541 #ifndef PERL_IN_XSUB_RE
9544 Perl_reg_stringify(pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval ) {
9546 const regexp * const re = (regexp *)mg->mg_obj;
9548 *haseval = re->seen_evals;
9550 *flags = ((re->extflags & RXf_UTF8) ? 1 : 0);
9557 - regnext - dig the "next" pointer out of a node
9560 Perl_regnext(pTHX_ register regnode *p)
9563 register I32 offset;
9568 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
9577 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
9580 STRLEN l1 = strlen(pat1);
9581 STRLEN l2 = strlen(pat2);
9584 const char *message;
9590 Copy(pat1, buf, l1 , char);
9591 Copy(pat2, buf + l1, l2 , char);
9592 buf[l1 + l2] = '\n';
9593 buf[l1 + l2 + 1] = '\0';
9595 /* ANSI variant takes additional second argument */
9596 va_start(args, pat2);
9600 msv = vmess(buf, &args);
9602 message = SvPV_const(msv,l1);
9605 Copy(message, buf, l1 , char);
9606 buf[l1-1] = '\0'; /* Overwrite \n */
9607 Perl_croak(aTHX_ "%s", buf);
9610 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
9612 #ifndef PERL_IN_XSUB_RE
9614 Perl_save_re_context(pTHX)
9618 struct re_save_state *state;
9620 SAVEVPTR(PL_curcop);
9621 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
9623 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
9624 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
9625 SSPUSHINT(SAVEt_RE_STATE);
9627 Copy(&PL_reg_state, state, 1, struct re_save_state);
9629 PL_reg_start_tmp = 0;
9630 PL_reg_start_tmpl = 0;
9631 PL_reg_oldsaved = NULL;
9632 PL_reg_oldsavedlen = 0;
9634 PL_reg_leftiter = 0;
9635 PL_reg_poscache = NULL;
9636 PL_reg_poscache_size = 0;
9637 #ifdef PERL_OLD_COPY_ON_WRITE
9641 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
9643 const REGEXP * const rx = PM_GETRE(PL_curpm);
9646 for (i = 1; i <= rx->nparens; i++) {
9647 char digits[TYPE_CHARS(long)];
9648 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
9649 GV *const *const gvp
9650 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
9653 GV * const gv = *gvp;
9654 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
9664 clear_re(pTHX_ void *r)
9667 ReREFCNT_dec((regexp *)r);
9673 S_put_byte(pTHX_ SV *sv, int c)
9675 /* Our definition of isPRINT() ignores locales, so only bytes that are
9676 not part of UTF-8 are considered printable. I assume that the same
9677 holds for UTF-EBCDIC.
9678 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
9679 which Wikipedia says:
9681 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
9682 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
9683 identical, to the ASCII delete (DEL) or rubout control character.
9684 ) So the old condition can be simplified to !isPRINT(c) */
9686 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
9688 const char string = c;
9689 if (c == '-' || c == ']' || c == '\\' || c == '^')
9690 sv_catpvs(sv, "\\");
9691 sv_catpvn(sv, &string, 1);
9696 #define CLEAR_OPTSTART \
9697 if (optstart) STMT_START { \
9698 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
9702 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
9704 STATIC const regnode *
9705 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
9706 const regnode *last, const regnode *plast,
9707 SV* sv, I32 indent, U32 depth)
9710 register U8 op = PSEUDO; /* Arbitrary non-END op. */
9711 register const regnode *next;
9712 const regnode *optstart= NULL;
9715 GET_RE_DEBUG_FLAGS_DECL;
9717 #ifdef DEBUG_DUMPUNTIL
9718 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
9719 last ? last-start : 0,plast ? plast-start : 0);
9722 if (plast && plast < last)
9725 while (PL_regkind[op] != END && (!last || node < last)) {
9726 /* While that wasn't END last time... */
9729 if (op == CLOSE || op == WHILEM)
9731 next = regnext((regnode *)node);
9734 if (OP(node) == OPTIMIZED) {
9735 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
9742 regprop(r, sv, node);
9743 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
9744 (int)(2*indent + 1), "", SvPVX_const(sv));
9746 if (OP(node) != OPTIMIZED) {
9747 if (next == NULL) /* Next ptr. */
9748 PerlIO_printf(Perl_debug_log, " (0)");
9749 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
9750 PerlIO_printf(Perl_debug_log, " (FAIL)");
9752 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
9753 (void)PerlIO_putc(Perl_debug_log, '\n');
9757 if (PL_regkind[(U8)op] == BRANCHJ) {
9760 register const regnode *nnode = (OP(next) == LONGJMP
9761 ? regnext((regnode *)next)
9763 if (last && nnode > last)
9765 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
9768 else if (PL_regkind[(U8)op] == BRANCH) {
9770 DUMPUNTIL(NEXTOPER(node), next);
9772 else if ( PL_regkind[(U8)op] == TRIE ) {
9773 const regnode *this_trie = node;
9774 const char op = OP(node);
9775 const U32 n = ARG(node);
9776 const reg_ac_data * const ac = op>=AHOCORASICK ?
9777 (reg_ac_data *)ri->data->data[n] :
9779 const reg_trie_data * const trie =
9780 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
9782 AV *const trie_words = (AV *) ri->data->data[n + TRIE_WORDS_OFFSET];
9784 const regnode *nextbranch= NULL;
9786 sv_setpvn(sv, "", 0);
9787 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
9788 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
9790 PerlIO_printf(Perl_debug_log, "%*s%s ",
9791 (int)(2*(indent+3)), "",
9792 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
9793 PL_colors[0], PL_colors[1],
9794 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
9795 PERL_PV_PRETTY_ELLIPSES |
9801 U16 dist= trie->jump[word_idx+1];
9802 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
9803 (UV)((dist ? this_trie + dist : next) - start));
9806 nextbranch= this_trie + trie->jump[0];
9807 DUMPUNTIL(this_trie + dist, nextbranch);
9809 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
9810 nextbranch= regnext((regnode *)nextbranch);
9812 PerlIO_printf(Perl_debug_log, "\n");
9815 if (last && next > last)
9820 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
9821 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
9822 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
9824 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
9826 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
9828 else if ( op == PLUS || op == STAR) {
9829 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
9831 else if (op == ANYOF) {
9832 /* arglen 1 + class block */
9833 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
9834 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
9835 node = NEXTOPER(node);
9837 else if (PL_regkind[(U8)op] == EXACT) {
9838 /* Literal string, where present. */
9839 node += NODE_SZ_STR(node) - 1;
9840 node = NEXTOPER(node);
9843 node = NEXTOPER(node);
9844 node += regarglen[(U8)op];
9846 if (op == CURLYX || op == OPEN)
9850 #ifdef DEBUG_DUMPUNTIL
9851 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
9856 #endif /* DEBUGGING */
9860 * c-indentation-style: bsd
9862 * indent-tabs-mode: t
9865 * ex: set ts=8 sts=4 sw=4 noet: