5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
8 /* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
37 * pregcomp and pregexec -- regsub and regerror are not used in perl
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
57 **** Alterations to Henry's code are...
59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007 by Larry Wall and others
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
71 #define PERL_IN_REGCOMP_C
74 #ifndef PERL_IN_XSUB_RE
79 #ifdef PERL_IN_XSUB_RE
90 # if defined(BUGGY_MSC6)
91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
92 # pragma optimize("a",off)
93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
94 # pragma optimize("w",on )
95 # endif /* BUGGY_MSC6 */
102 typedef struct RExC_state_t {
103 U32 flags; /* are we folding, multilining? */
104 char *precomp; /* uncompiled string. */
105 regexp *rx; /* perl core regexp structure */
106 regexp_internal *rxi; /* internal data for regexp object pprivate field */
107 char *start; /* Start of input for compile */
108 char *end; /* End of input for compile */
109 char *parse; /* Input-scan pointer. */
110 I32 whilem_seen; /* number of WHILEM in this expr */
111 regnode *emit_start; /* Start of emitted-code area */
112 regnode *emit_bound; /* First regnode outside of the allocated space */
113 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
114 I32 naughty; /* How bad is this pattern? */
115 I32 sawback; /* Did we see \1, ...? */
117 I32 size; /* Code size. */
118 I32 npar; /* Capture buffer count, (OPEN). */
119 I32 cpar; /* Capture buffer count, (CLOSE). */
120 I32 nestroot; /* root parens we are in - used by accept */
124 regnode **open_parens; /* pointers to open parens */
125 regnode **close_parens; /* pointers to close parens */
126 regnode *opend; /* END node in program */
127 I32 utf8; /* whether the pattern is utf8 or not */
128 I32 orig_utf8; /* whether the pattern was originally in utf8 */
129 /* XXX use this for future optimisation of case
130 * where pattern must be upgraded to utf8. */
131 HV *charnames; /* cache of named sequences */
132 HV *paren_names; /* Paren names */
134 regnode **recurse; /* Recurse regops */
135 I32 recurse_count; /* Number of recurse regops */
137 char *starttry; /* -Dr: where regtry was called. */
138 #define RExC_starttry (pRExC_state->starttry)
141 const char *lastparse;
143 AV *paren_name_list; /* idx -> name */
144 #define RExC_lastparse (pRExC_state->lastparse)
145 #define RExC_lastnum (pRExC_state->lastnum)
146 #define RExC_paren_name_list (pRExC_state->paren_name_list)
150 #define RExC_flags (pRExC_state->flags)
151 #define RExC_precomp (pRExC_state->precomp)
152 #define RExC_rx (pRExC_state->rx)
153 #define RExC_rxi (pRExC_state->rxi)
154 #define RExC_start (pRExC_state->start)
155 #define RExC_end (pRExC_state->end)
156 #define RExC_parse (pRExC_state->parse)
157 #define RExC_whilem_seen (pRExC_state->whilem_seen)
158 #ifdef RE_TRACK_PATTERN_OFFSETS
159 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
161 #define RExC_emit (pRExC_state->emit)
162 #define RExC_emit_start (pRExC_state->emit_start)
163 #define RExC_emit_bound (pRExC_state->emit_bound)
164 #define RExC_naughty (pRExC_state->naughty)
165 #define RExC_sawback (pRExC_state->sawback)
166 #define RExC_seen (pRExC_state->seen)
167 #define RExC_size (pRExC_state->size)
168 #define RExC_npar (pRExC_state->npar)
169 #define RExC_nestroot (pRExC_state->nestroot)
170 #define RExC_extralen (pRExC_state->extralen)
171 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
172 #define RExC_seen_evals (pRExC_state->seen_evals)
173 #define RExC_utf8 (pRExC_state->utf8)
174 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
175 #define RExC_charnames (pRExC_state->charnames)
176 #define RExC_open_parens (pRExC_state->open_parens)
177 #define RExC_close_parens (pRExC_state->close_parens)
178 #define RExC_opend (pRExC_state->opend)
179 #define RExC_paren_names (pRExC_state->paren_names)
180 #define RExC_recurse (pRExC_state->recurse)
181 #define RExC_recurse_count (pRExC_state->recurse_count)
184 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
185 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
186 ((*s) == '{' && regcurly(s)))
189 #undef SPSTART /* dratted cpp namespace... */
192 * Flags to be passed up and down.
194 #define WORST 0 /* Worst case. */
195 #define HASWIDTH 0x01 /* Known to match non-null strings. */
196 #define SIMPLE 0x02 /* Simple enough to be STAR/PLUS operand. */
197 #define SPSTART 0x04 /* Starts with * or +. */
198 #define TRYAGAIN 0x08 /* Weeded out a declaration. */
199 #define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
201 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
203 /* whether trie related optimizations are enabled */
204 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
205 #define TRIE_STUDY_OPT
206 #define FULL_TRIE_STUDY
212 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
213 #define PBITVAL(paren) (1 << ((paren) & 7))
214 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
215 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
216 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
219 /* About scan_data_t.
221 During optimisation we recurse through the regexp program performing
222 various inplace (keyhole style) optimisations. In addition study_chunk
223 and scan_commit populate this data structure with information about
224 what strings MUST appear in the pattern. We look for the longest
225 string that must appear for at a fixed location, and we look for the
226 longest string that may appear at a floating location. So for instance
231 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
232 strings (because they follow a .* construct). study_chunk will identify
233 both FOO and BAR as being the longest fixed and floating strings respectively.
235 The strings can be composites, for instance
239 will result in a composite fixed substring 'foo'.
241 For each string some basic information is maintained:
243 - offset or min_offset
244 This is the position the string must appear at, or not before.
245 It also implicitly (when combined with minlenp) tells us how many
246 character must match before the string we are searching.
247 Likewise when combined with minlenp and the length of the string
248 tells us how many characters must appear after the string we have
252 Only used for floating strings. This is the rightmost point that
253 the string can appear at. Ifset to I32 max it indicates that the
254 string can occur infinitely far to the right.
257 A pointer to the minimum length of the pattern that the string
258 was found inside. This is important as in the case of positive
259 lookahead or positive lookbehind we can have multiple patterns
264 The minimum length of the pattern overall is 3, the minimum length
265 of the lookahead part is 3, but the minimum length of the part that
266 will actually match is 1. So 'FOO's minimum length is 3, but the
267 minimum length for the F is 1. This is important as the minimum length
268 is used to determine offsets in front of and behind the string being
269 looked for. Since strings can be composites this is the length of the
270 pattern at the time it was commited with a scan_commit. Note that
271 the length is calculated by study_chunk, so that the minimum lengths
272 are not known until the full pattern has been compiled, thus the
273 pointer to the value.
277 In the case of lookbehind the string being searched for can be
278 offset past the start point of the final matching string.
279 If this value was just blithely removed from the min_offset it would
280 invalidate some of the calculations for how many chars must match
281 before or after (as they are derived from min_offset and minlen and
282 the length of the string being searched for).
283 When the final pattern is compiled and the data is moved from the
284 scan_data_t structure into the regexp structure the information
285 about lookbehind is factored in, with the information that would
286 have been lost precalculated in the end_shift field for the
289 The fields pos_min and pos_delta are used to store the minimum offset
290 and the delta to the maximum offset at the current point in the pattern.
294 typedef struct scan_data_t {
295 /*I32 len_min; unused */
296 /*I32 len_delta; unused */
300 I32 last_end; /* min value, <0 unless valid. */
303 SV **longest; /* Either &l_fixed, or &l_float. */
304 SV *longest_fixed; /* longest fixed string found in pattern */
305 I32 offset_fixed; /* offset where it starts */
306 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
307 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
308 SV *longest_float; /* longest floating string found in pattern */
309 I32 offset_float_min; /* earliest point in string it can appear */
310 I32 offset_float_max; /* latest point in string it can appear */
311 I32 *minlen_float; /* pointer to the minlen relevent to the string */
312 I32 lookbehind_float; /* is the position of the string modified by LB */
316 struct regnode_charclass_class *start_class;
320 * Forward declarations for pregcomp()'s friends.
323 static const scan_data_t zero_scan_data =
324 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
326 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
327 #define SF_BEFORE_SEOL 0x0001
328 #define SF_BEFORE_MEOL 0x0002
329 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
330 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
333 # define SF_FIX_SHIFT_EOL (0+2)
334 # define SF_FL_SHIFT_EOL (0+4)
336 # define SF_FIX_SHIFT_EOL (+2)
337 # define SF_FL_SHIFT_EOL (+4)
340 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
341 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
343 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
344 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
345 #define SF_IS_INF 0x0040
346 #define SF_HAS_PAR 0x0080
347 #define SF_IN_PAR 0x0100
348 #define SF_HAS_EVAL 0x0200
349 #define SCF_DO_SUBSTR 0x0400
350 #define SCF_DO_STCLASS_AND 0x0800
351 #define SCF_DO_STCLASS_OR 0x1000
352 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
353 #define SCF_WHILEM_VISITED_POS 0x2000
355 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
356 #define SCF_SEEN_ACCEPT 0x8000
358 #define UTF (RExC_utf8 != 0)
359 #define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0)
360 #define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0)
362 #define OOB_UNICODE 12345678
363 #define OOB_NAMEDCLASS -1
365 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
366 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
369 /* length of regex to show in messages that don't mark a position within */
370 #define RegexLengthToShowInErrorMessages 127
373 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
374 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
375 * op/pragma/warn/regcomp.
377 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
378 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
380 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
383 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
384 * arg. Show regex, up to a maximum length. If it's too long, chop and add
387 #define _FAIL(code) STMT_START { \
388 const char *ellipses = ""; \
389 IV len = RExC_end - RExC_precomp; \
392 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
393 if (len > RegexLengthToShowInErrorMessages) { \
394 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
395 len = RegexLengthToShowInErrorMessages - 10; \
401 #define FAIL(msg) _FAIL( \
402 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
403 msg, (int)len, RExC_precomp, ellipses))
405 #define FAIL2(msg,arg) _FAIL( \
406 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
407 arg, (int)len, RExC_precomp, ellipses))
410 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
412 #define Simple_vFAIL(m) STMT_START { \
413 const IV offset = RExC_parse - RExC_precomp; \
414 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
415 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
419 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
421 #define vFAIL(m) STMT_START { \
423 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
428 * Like Simple_vFAIL(), but accepts two arguments.
430 #define Simple_vFAIL2(m,a1) STMT_START { \
431 const IV offset = RExC_parse - RExC_precomp; \
432 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
433 (int)offset, RExC_precomp, RExC_precomp + offset); \
437 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
439 #define vFAIL2(m,a1) STMT_START { \
441 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
442 Simple_vFAIL2(m, a1); \
447 * Like Simple_vFAIL(), but accepts three arguments.
449 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
450 const IV offset = RExC_parse - RExC_precomp; \
451 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
452 (int)offset, RExC_precomp, RExC_precomp + offset); \
456 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
458 #define vFAIL3(m,a1,a2) STMT_START { \
460 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
461 Simple_vFAIL3(m, a1, a2); \
465 * Like Simple_vFAIL(), but accepts four arguments.
467 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
468 const IV offset = RExC_parse - RExC_precomp; \
469 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
470 (int)offset, RExC_precomp, RExC_precomp + offset); \
473 #define vWARN(loc,m) STMT_START { \
474 const IV offset = loc - RExC_precomp; \
475 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
476 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
479 #define vWARNdep(loc,m) STMT_START { \
480 const IV offset = loc - RExC_precomp; \
481 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
482 "%s" REPORT_LOCATION, \
483 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
487 #define vWARN2(loc, m, a1) STMT_START { \
488 const IV offset = loc - RExC_precomp; \
489 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
490 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
493 #define vWARN3(loc, m, a1, a2) STMT_START { \
494 const IV offset = loc - RExC_precomp; \
495 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
496 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
499 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
500 const IV offset = loc - RExC_precomp; \
501 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
502 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
505 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
506 const IV offset = loc - RExC_precomp; \
507 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
508 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
512 /* Allow for side effects in s */
513 #define REGC(c,s) STMT_START { \
514 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
517 /* Macros for recording node offsets. 20001227 mjd@plover.com
518 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
519 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
520 * Element 0 holds the number n.
521 * Position is 1 indexed.
523 #ifndef RE_TRACK_PATTERN_OFFSETS
524 #define Set_Node_Offset_To_R(node,byte)
525 #define Set_Node_Offset(node,byte)
526 #define Set_Cur_Node_Offset
527 #define Set_Node_Length_To_R(node,len)
528 #define Set_Node_Length(node,len)
529 #define Set_Node_Cur_Length(node)
530 #define Node_Offset(n)
531 #define Node_Length(n)
532 #define Set_Node_Offset_Length(node,offset,len)
533 #define ProgLen(ri) ri->u.proglen
534 #define SetProgLen(ri,x) ri->u.proglen = x
536 #define ProgLen(ri) ri->u.offsets[0]
537 #define SetProgLen(ri,x) ri->u.offsets[0] = x
538 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
540 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
541 __LINE__, (int)(node), (int)(byte))); \
543 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
545 RExC_offsets[2*(node)-1] = (byte); \
550 #define Set_Node_Offset(node,byte) \
551 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
552 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
554 #define Set_Node_Length_To_R(node,len) STMT_START { \
556 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
557 __LINE__, (int)(node), (int)(len))); \
559 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
561 RExC_offsets[2*(node)] = (len); \
566 #define Set_Node_Length(node,len) \
567 Set_Node_Length_To_R((node)-RExC_emit_start, len)
568 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
569 #define Set_Node_Cur_Length(node) \
570 Set_Node_Length(node, RExC_parse - parse_start)
572 /* Get offsets and lengths */
573 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
574 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
576 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
577 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
578 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
582 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
583 #define EXPERIMENTAL_INPLACESCAN
584 #endif /*RE_TRACK_PATTERN_OFFSETS*/
586 #define DEBUG_STUDYDATA(str,data,depth) \
587 DEBUG_OPTIMISE_MORE_r(if(data){ \
588 PerlIO_printf(Perl_debug_log, \
589 "%*s" str "Pos:%"IVdf"/%"IVdf \
590 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
591 (int)(depth)*2, "", \
592 (IV)((data)->pos_min), \
593 (IV)((data)->pos_delta), \
594 (UV)((data)->flags), \
595 (IV)((data)->whilem_c), \
596 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
597 is_inf ? "INF " : "" \
599 if ((data)->last_found) \
600 PerlIO_printf(Perl_debug_log, \
601 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
602 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
603 SvPVX_const((data)->last_found), \
604 (IV)((data)->last_end), \
605 (IV)((data)->last_start_min), \
606 (IV)((data)->last_start_max), \
607 ((data)->longest && \
608 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
609 SvPVX_const((data)->longest_fixed), \
610 (IV)((data)->offset_fixed), \
611 ((data)->longest && \
612 (data)->longest==&((data)->longest_float)) ? "*" : "", \
613 SvPVX_const((data)->longest_float), \
614 (IV)((data)->offset_float_min), \
615 (IV)((data)->offset_float_max) \
617 PerlIO_printf(Perl_debug_log,"\n"); \
620 static void clear_re(pTHX_ void *r);
622 /* Mark that we cannot extend a found fixed substring at this point.
623 Update the longest found anchored substring and the longest found
624 floating substrings if needed. */
627 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
629 const STRLEN l = CHR_SVLEN(data->last_found);
630 const STRLEN old_l = CHR_SVLEN(*data->longest);
631 GET_RE_DEBUG_FLAGS_DECL;
633 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
634 SvSetMagicSV(*data->longest, data->last_found);
635 if (*data->longest == data->longest_fixed) {
636 data->offset_fixed = l ? data->last_start_min : data->pos_min;
637 if (data->flags & SF_BEFORE_EOL)
639 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
641 data->flags &= ~SF_FIX_BEFORE_EOL;
642 data->minlen_fixed=minlenp;
643 data->lookbehind_fixed=0;
645 else { /* *data->longest == data->longest_float */
646 data->offset_float_min = l ? data->last_start_min : data->pos_min;
647 data->offset_float_max = (l
648 ? data->last_start_max
649 : data->pos_min + data->pos_delta);
650 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
651 data->offset_float_max = I32_MAX;
652 if (data->flags & SF_BEFORE_EOL)
654 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
656 data->flags &= ~SF_FL_BEFORE_EOL;
657 data->minlen_float=minlenp;
658 data->lookbehind_float=0;
661 SvCUR_set(data->last_found, 0);
663 SV * const sv = data->last_found;
664 if (SvUTF8(sv) && SvMAGICAL(sv)) {
665 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
671 data->flags &= ~SF_BEFORE_EOL;
672 DEBUG_STUDYDATA("commit: ",data,0);
675 /* Can match anything (initialization) */
677 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
679 ANYOF_CLASS_ZERO(cl);
680 ANYOF_BITMAP_SETALL(cl);
681 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
683 cl->flags |= ANYOF_LOCALE;
686 /* Can match anything (initialization) */
688 S_cl_is_anything(const struct regnode_charclass_class *cl)
692 for (value = 0; value <= ANYOF_MAX; value += 2)
693 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
695 if (!(cl->flags & ANYOF_UNICODE_ALL))
697 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
702 /* Can match anything (initialization) */
704 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
706 Zero(cl, 1, struct regnode_charclass_class);
708 cl_anything(pRExC_state, cl);
712 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
714 Zero(cl, 1, struct regnode_charclass_class);
716 cl_anything(pRExC_state, cl);
718 cl->flags |= ANYOF_LOCALE;
721 /* 'And' a given class with another one. Can create false positives */
722 /* We assume that cl is not inverted */
724 S_cl_and(struct regnode_charclass_class *cl,
725 const struct regnode_charclass_class *and_with)
728 assert(and_with->type == ANYOF);
729 if (!(and_with->flags & ANYOF_CLASS)
730 && !(cl->flags & ANYOF_CLASS)
731 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
732 && !(and_with->flags & ANYOF_FOLD)
733 && !(cl->flags & ANYOF_FOLD)) {
736 if (and_with->flags & ANYOF_INVERT)
737 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
738 cl->bitmap[i] &= ~and_with->bitmap[i];
740 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
741 cl->bitmap[i] &= and_with->bitmap[i];
742 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
743 if (!(and_with->flags & ANYOF_EOS))
744 cl->flags &= ~ANYOF_EOS;
746 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
747 !(and_with->flags & ANYOF_INVERT)) {
748 cl->flags &= ~ANYOF_UNICODE_ALL;
749 cl->flags |= ANYOF_UNICODE;
750 ARG_SET(cl, ARG(and_with));
752 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
753 !(and_with->flags & ANYOF_INVERT))
754 cl->flags &= ~ANYOF_UNICODE_ALL;
755 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
756 !(and_with->flags & ANYOF_INVERT))
757 cl->flags &= ~ANYOF_UNICODE;
760 /* 'OR' a given class with another one. Can create false positives */
761 /* We assume that cl is not inverted */
763 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
765 if (or_with->flags & ANYOF_INVERT) {
767 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
768 * <= (B1 | !B2) | (CL1 | !CL2)
769 * which is wasteful if CL2 is small, but we ignore CL2:
770 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
771 * XXXX Can we handle case-fold? Unclear:
772 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
773 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
775 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
776 && !(or_with->flags & ANYOF_FOLD)
777 && !(cl->flags & ANYOF_FOLD) ) {
780 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
781 cl->bitmap[i] |= ~or_with->bitmap[i];
782 } /* XXXX: logic is complicated otherwise */
784 cl_anything(pRExC_state, cl);
787 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
788 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
789 && (!(or_with->flags & ANYOF_FOLD)
790 || (cl->flags & ANYOF_FOLD)) ) {
793 /* OR char bitmap and class bitmap separately */
794 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
795 cl->bitmap[i] |= or_with->bitmap[i];
796 if (or_with->flags & ANYOF_CLASS) {
797 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
798 cl->classflags[i] |= or_with->classflags[i];
799 cl->flags |= ANYOF_CLASS;
802 else { /* XXXX: logic is complicated, leave it along for a moment. */
803 cl_anything(pRExC_state, cl);
806 if (or_with->flags & ANYOF_EOS)
807 cl->flags |= ANYOF_EOS;
809 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
810 ARG(cl) != ARG(or_with)) {
811 cl->flags |= ANYOF_UNICODE_ALL;
812 cl->flags &= ~ANYOF_UNICODE;
814 if (or_with->flags & ANYOF_UNICODE_ALL) {
815 cl->flags |= ANYOF_UNICODE_ALL;
816 cl->flags &= ~ANYOF_UNICODE;
820 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
821 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
822 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
823 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
828 dump_trie(trie,widecharmap,revcharmap)
829 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
830 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
832 These routines dump out a trie in a somewhat readable format.
833 The _interim_ variants are used for debugging the interim
834 tables that are used to generate the final compressed
835 representation which is what dump_trie expects.
837 Part of the reason for their existance is to provide a form
838 of documentation as to how the different representations function.
843 Dumps the final compressed table form of the trie to Perl_debug_log.
844 Used for debugging make_trie().
848 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
849 AV *revcharmap, U32 depth)
852 SV *sv=sv_newmortal();
853 int colwidth= widecharmap ? 6 : 4;
854 GET_RE_DEBUG_FLAGS_DECL;
857 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
858 (int)depth * 2 + 2,"",
859 "Match","Base","Ofs" );
861 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
862 SV ** const tmp = av_fetch( revcharmap, state, 0);
864 PerlIO_printf( Perl_debug_log, "%*s",
866 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
867 PL_colors[0], PL_colors[1],
868 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
869 PERL_PV_ESCAPE_FIRSTCHAR
874 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
875 (int)depth * 2 + 2,"");
877 for( state = 0 ; state < trie->uniquecharcount ; state++ )
878 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
879 PerlIO_printf( Perl_debug_log, "\n");
881 for( state = 1 ; state < trie->statecount ; state++ ) {
882 const U32 base = trie->states[ state ].trans.base;
884 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
886 if ( trie->states[ state ].wordnum ) {
887 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
889 PerlIO_printf( Perl_debug_log, "%6s", "" );
892 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
897 while( ( base + ofs < trie->uniquecharcount ) ||
898 ( base + ofs - trie->uniquecharcount < trie->lasttrans
899 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
902 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
904 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
905 if ( ( base + ofs >= trie->uniquecharcount ) &&
906 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
907 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
909 PerlIO_printf( Perl_debug_log, "%*"UVXf,
911 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
913 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
917 PerlIO_printf( Perl_debug_log, "]");
920 PerlIO_printf( Perl_debug_log, "\n" );
924 Dumps a fully constructed but uncompressed trie in list form.
925 List tries normally only are used for construction when the number of
926 possible chars (trie->uniquecharcount) is very high.
927 Used for debugging make_trie().
930 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
931 HV *widecharmap, AV *revcharmap, U32 next_alloc,
935 SV *sv=sv_newmortal();
936 int colwidth= widecharmap ? 6 : 4;
937 GET_RE_DEBUG_FLAGS_DECL;
938 /* print out the table precompression. */
939 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
940 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
941 "------:-----+-----------------\n" );
943 for( state=1 ; state < next_alloc ; state ++ ) {
946 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
947 (int)depth * 2 + 2,"", (UV)state );
948 if ( ! trie->states[ state ].wordnum ) {
949 PerlIO_printf( Perl_debug_log, "%5s| ","");
951 PerlIO_printf( Perl_debug_log, "W%4x| ",
952 trie->states[ state ].wordnum
955 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
956 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
958 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
960 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
961 PL_colors[0], PL_colors[1],
962 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
963 PERL_PV_ESCAPE_FIRSTCHAR
965 TRIE_LIST_ITEM(state,charid).forid,
966 (UV)TRIE_LIST_ITEM(state,charid).newstate
969 PerlIO_printf(Perl_debug_log, "\n%*s| ",
970 (int)((depth * 2) + 14), "");
973 PerlIO_printf( Perl_debug_log, "\n");
978 Dumps a fully constructed but uncompressed trie in table form.
979 This is the normal DFA style state transition table, with a few
980 twists to facilitate compression later.
981 Used for debugging make_trie().
984 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
985 HV *widecharmap, AV *revcharmap, U32 next_alloc,
990 SV *sv=sv_newmortal();
991 int colwidth= widecharmap ? 6 : 4;
992 GET_RE_DEBUG_FLAGS_DECL;
995 print out the table precompression so that we can do a visual check
996 that they are identical.
999 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1001 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1002 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1004 PerlIO_printf( Perl_debug_log, "%*s",
1006 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1007 PL_colors[0], PL_colors[1],
1008 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1009 PERL_PV_ESCAPE_FIRSTCHAR
1015 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1017 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1018 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1021 PerlIO_printf( Perl_debug_log, "\n" );
1023 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1025 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1026 (int)depth * 2 + 2,"",
1027 (UV)TRIE_NODENUM( state ) );
1029 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1030 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1032 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1034 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1036 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1037 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1039 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1040 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1047 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1048 startbranch: the first branch in the whole branch sequence
1049 first : start branch of sequence of branch-exact nodes.
1050 May be the same as startbranch
1051 last : Thing following the last branch.
1052 May be the same as tail.
1053 tail : item following the branch sequence
1054 count : words in the sequence
1055 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1056 depth : indent depth
1058 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1060 A trie is an N'ary tree where the branches are determined by digital
1061 decomposition of the key. IE, at the root node you look up the 1st character and
1062 follow that branch repeat until you find the end of the branches. Nodes can be
1063 marked as "accepting" meaning they represent a complete word. Eg:
1067 would convert into the following structure. Numbers represent states, letters
1068 following numbers represent valid transitions on the letter from that state, if
1069 the number is in square brackets it represents an accepting state, otherwise it
1070 will be in parenthesis.
1072 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1076 (1) +-i->(6)-+-s->[7]
1078 +-s->(3)-+-h->(4)-+-e->[5]
1080 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1082 This shows that when matching against the string 'hers' we will begin at state 1
1083 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1084 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1085 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1086 single traverse. We store a mapping from accepting to state to which word was
1087 matched, and then when we have multiple possibilities we try to complete the
1088 rest of the regex in the order in which they occured in the alternation.
1090 The only prior NFA like behaviour that would be changed by the TRIE support is
1091 the silent ignoring of duplicate alternations which are of the form:
1093 / (DUPE|DUPE) X? (?{ ... }) Y /x
1095 Thus EVAL blocks follwing a trie may be called a different number of times with
1096 and without the optimisation. With the optimisations dupes will be silently
1097 ignored. This inconsistant behaviour of EVAL type nodes is well established as
1098 the following demonstrates:
1100 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1102 which prints out 'word' three times, but
1104 'words'=~/(word|word|word)(?{ print $1 })S/
1106 which doesnt print it out at all. This is due to other optimisations kicking in.
1108 Example of what happens on a structural level:
1110 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1112 1: CURLYM[1] {1,32767}(18)
1123 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1124 and should turn into:
1126 1: CURLYM[1] {1,32767}(18)
1128 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1136 Cases where tail != last would be like /(?foo|bar)baz/:
1146 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1147 and would end up looking like:
1150 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1157 d = uvuni_to_utf8_flags(d, uv, 0);
1159 is the recommended Unicode-aware way of saying
1164 #define TRIE_STORE_REVCHAR \
1167 SV *zlopp = newSV(2); \
1168 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1169 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
1170 SvCUR_set(zlopp, kapow - flrbbbbb); \
1173 av_push(revcharmap, zlopp); \
1175 char ooooff = (char)uvc; \
1176 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1180 #define TRIE_READ_CHAR STMT_START { \
1184 if ( foldlen > 0 ) { \
1185 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1190 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1191 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1192 foldlen -= UNISKIP( uvc ); \
1193 scan = foldbuf + UNISKIP( uvc ); \
1196 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1206 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1207 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1208 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1209 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1211 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1212 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1213 TRIE_LIST_CUR( state )++; \
1216 #define TRIE_LIST_NEW(state) STMT_START { \
1217 Newxz( trie->states[ state ].trans.list, \
1218 4, reg_trie_trans_le ); \
1219 TRIE_LIST_CUR( state ) = 1; \
1220 TRIE_LIST_LEN( state ) = 4; \
1223 #define TRIE_HANDLE_WORD(state) STMT_START { \
1224 U16 dupe= trie->states[ state ].wordnum; \
1225 regnode * const noper_next = regnext( noper ); \
1227 if (trie->wordlen) \
1228 trie->wordlen[ curword ] = wordlen; \
1230 /* store the word for dumping */ \
1232 if (OP(noper) != NOTHING) \
1233 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
1235 tmp = newSVpvn( "", 0 ); \
1236 if ( UTF ) SvUTF8_on( tmp ); \
1237 av_push( trie_words, tmp ); \
1242 if ( noper_next < tail ) { \
1244 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1245 trie->jump[curword] = (U16)(noper_next - convert); \
1247 jumper = noper_next; \
1249 nextbranch= regnext(cur); \
1253 /* So it's a dupe. This means we need to maintain a */\
1254 /* linked-list from the first to the next. */\
1255 /* we only allocate the nextword buffer when there */\
1256 /* a dupe, so first time we have to do the allocation */\
1257 if (!trie->nextword) \
1258 trie->nextword = (U16 *) \
1259 PerlMemShared_calloc( word_count + 1, sizeof(U16)); \
1260 while ( trie->nextword[dupe] ) \
1261 dupe= trie->nextword[dupe]; \
1262 trie->nextword[dupe]= curword; \
1264 /* we haven't inserted this word yet. */ \
1265 trie->states[ state ].wordnum = curword; \
1270 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1271 ( ( base + charid >= ucharcount \
1272 && base + charid < ubound \
1273 && state == trie->trans[ base - ucharcount + charid ].check \
1274 && trie->trans[ base - ucharcount + charid ].next ) \
1275 ? trie->trans[ base - ucharcount + charid ].next \
1276 : ( state==1 ? special : 0 ) \
1280 #define MADE_JUMP_TRIE 2
1281 #define MADE_EXACT_TRIE 4
1284 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1287 /* first pass, loop through and scan words */
1288 reg_trie_data *trie;
1289 HV *widecharmap = NULL;
1290 AV *revcharmap = newAV();
1292 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1297 regnode *jumper = NULL;
1298 regnode *nextbranch = NULL;
1299 regnode *convert = NULL;
1300 /* we just use folder as a flag in utf8 */
1301 const U8 * const folder = ( flags == EXACTF
1303 : ( flags == EXACTFL
1310 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1311 AV *trie_words = NULL;
1312 /* along with revcharmap, this only used during construction but both are
1313 * useful during debugging so we store them in the struct when debugging.
1316 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1317 STRLEN trie_charcount=0;
1319 SV *re_trie_maxbuff;
1320 GET_RE_DEBUG_FLAGS_DECL;
1322 PERL_UNUSED_ARG(depth);
1325 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1327 trie->startstate = 1;
1328 trie->wordcount = word_count;
1329 RExC_rxi->data->data[ data_slot ] = (void*)trie;
1330 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1331 if (!(UTF && folder))
1332 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1334 trie_words = newAV();
1337 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1338 if (!SvIOK(re_trie_maxbuff)) {
1339 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1342 PerlIO_printf( Perl_debug_log,
1343 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1344 (int)depth * 2 + 2, "",
1345 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1346 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1350 /* Find the node we are going to overwrite */
1351 if ( first == startbranch && OP( last ) != BRANCH ) {
1352 /* whole branch chain */
1355 /* branch sub-chain */
1356 convert = NEXTOPER( first );
1359 /* -- First loop and Setup --
1361 We first traverse the branches and scan each word to determine if it
1362 contains widechars, and how many unique chars there are, this is
1363 important as we have to build a table with at least as many columns as we
1366 We use an array of integers to represent the character codes 0..255
1367 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1368 native representation of the character value as the key and IV's for the
1371 *TODO* If we keep track of how many times each character is used we can
1372 remap the columns so that the table compression later on is more
1373 efficient in terms of memory by ensuring most common value is in the
1374 middle and the least common are on the outside. IMO this would be better
1375 than a most to least common mapping as theres a decent chance the most
1376 common letter will share a node with the least common, meaning the node
1377 will not be compressable. With a middle is most common approach the worst
1378 case is when we have the least common nodes twice.
1382 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1383 regnode * const noper = NEXTOPER( cur );
1384 const U8 *uc = (U8*)STRING( noper );
1385 const U8 * const e = uc + STR_LEN( noper );
1387 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1388 const U8 *scan = (U8*)NULL;
1389 U32 wordlen = 0; /* required init */
1391 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1393 if (OP(noper) == NOTHING) {
1397 if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1398 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1399 regardless of encoding */
1401 for ( ; uc < e ; uc += len ) {
1402 TRIE_CHARCOUNT(trie)++;
1406 if ( !trie->charmap[ uvc ] ) {
1407 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1409 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1413 /* store the codepoint in the bitmap, and if its ascii
1414 also store its folded equivelent. */
1415 TRIE_BITMAP_SET(trie,uvc);
1417 /* store the folded codepoint */
1418 if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1421 /* store first byte of utf8 representation of
1422 codepoints in the 127 < uvc < 256 range */
1423 if (127 < uvc && uvc < 192) {
1424 TRIE_BITMAP_SET(trie,194);
1425 } else if (191 < uvc ) {
1426 TRIE_BITMAP_SET(trie,195);
1427 /* && uvc < 256 -- we know uvc is < 256 already */
1430 set_bit = 0; /* We've done our bit :-) */
1435 widecharmap = newHV();
1437 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1440 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1442 if ( !SvTRUE( *svpp ) ) {
1443 sv_setiv( *svpp, ++trie->uniquecharcount );
1448 if( cur == first ) {
1451 } else if (chars < trie->minlen) {
1453 } else if (chars > trie->maxlen) {
1457 } /* end first pass */
1458 DEBUG_TRIE_COMPILE_r(
1459 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1460 (int)depth * 2 + 2,"",
1461 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1462 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1463 (int)trie->minlen, (int)trie->maxlen )
1465 trie->wordlen = (U32 *) PerlMemShared_calloc( word_count, sizeof(U32) );
1468 We now know what we are dealing with in terms of unique chars and
1469 string sizes so we can calculate how much memory a naive
1470 representation using a flat table will take. If it's over a reasonable
1471 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1472 conservative but potentially much slower representation using an array
1475 At the end we convert both representations into the same compressed
1476 form that will be used in regexec.c for matching with. The latter
1477 is a form that cannot be used to construct with but has memory
1478 properties similar to the list form and access properties similar
1479 to the table form making it both suitable for fast searches and
1480 small enough that its feasable to store for the duration of a program.
1482 See the comment in the code where the compressed table is produced
1483 inplace from the flat tabe representation for an explanation of how
1484 the compression works.
1489 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1491 Second Pass -- Array Of Lists Representation
1493 Each state will be represented by a list of charid:state records
1494 (reg_trie_trans_le) the first such element holds the CUR and LEN
1495 points of the allocated array. (See defines above).
1497 We build the initial structure using the lists, and then convert
1498 it into the compressed table form which allows faster lookups
1499 (but cant be modified once converted).
1502 STRLEN transcount = 1;
1504 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1505 "%*sCompiling trie using list compiler\n",
1506 (int)depth * 2 + 2, ""));
1508 trie->states = (reg_trie_state *)
1509 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1510 sizeof(reg_trie_state) );
1514 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1516 regnode * const noper = NEXTOPER( cur );
1517 U8 *uc = (U8*)STRING( noper );
1518 const U8 * const e = uc + STR_LEN( noper );
1519 U32 state = 1; /* required init */
1520 U16 charid = 0; /* sanity init */
1521 U8 *scan = (U8*)NULL; /* sanity init */
1522 STRLEN foldlen = 0; /* required init */
1523 U32 wordlen = 0; /* required init */
1524 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1526 if (OP(noper) != NOTHING) {
1527 for ( ; uc < e ; uc += len ) {
1532 charid = trie->charmap[ uvc ];
1534 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1538 charid=(U16)SvIV( *svpp );
1541 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1548 if ( !trie->states[ state ].trans.list ) {
1549 TRIE_LIST_NEW( state );
1551 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1552 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1553 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1558 newstate = next_alloc++;
1559 TRIE_LIST_PUSH( state, charid, newstate );
1564 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1568 TRIE_HANDLE_WORD(state);
1570 } /* end second pass */
1572 /* next alloc is the NEXT state to be allocated */
1573 trie->statecount = next_alloc;
1574 trie->states = (reg_trie_state *)
1575 PerlMemShared_realloc( trie->states,
1577 * sizeof(reg_trie_state) );
1579 /* and now dump it out before we compress it */
1580 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1581 revcharmap, next_alloc,
1585 trie->trans = (reg_trie_trans *)
1586 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1593 for( state=1 ; state < next_alloc ; state ++ ) {
1597 DEBUG_TRIE_COMPILE_MORE_r(
1598 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1602 if (trie->states[state].trans.list) {
1603 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1607 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1608 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1609 if ( forid < minid ) {
1611 } else if ( forid > maxid ) {
1615 if ( transcount < tp + maxid - minid + 1) {
1617 trie->trans = (reg_trie_trans *)
1618 PerlMemShared_realloc( trie->trans,
1620 * sizeof(reg_trie_trans) );
1621 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1623 base = trie->uniquecharcount + tp - minid;
1624 if ( maxid == minid ) {
1626 for ( ; zp < tp ; zp++ ) {
1627 if ( ! trie->trans[ zp ].next ) {
1628 base = trie->uniquecharcount + zp - minid;
1629 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1630 trie->trans[ zp ].check = state;
1636 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1637 trie->trans[ tp ].check = state;
1642 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1643 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1644 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1645 trie->trans[ tid ].check = state;
1647 tp += ( maxid - minid + 1 );
1649 Safefree(trie->states[ state ].trans.list);
1652 DEBUG_TRIE_COMPILE_MORE_r(
1653 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1656 trie->states[ state ].trans.base=base;
1658 trie->lasttrans = tp + 1;
1662 Second Pass -- Flat Table Representation.
1664 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1665 We know that we will need Charcount+1 trans at most to store the data
1666 (one row per char at worst case) So we preallocate both structures
1667 assuming worst case.
1669 We then construct the trie using only the .next slots of the entry
1672 We use the .check field of the first entry of the node temporarily to
1673 make compression both faster and easier by keeping track of how many non
1674 zero fields are in the node.
1676 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1679 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1680 number representing the first entry of the node, and state as a
1681 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1682 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1683 are 2 entrys per node. eg:
1691 The table is internally in the right hand, idx form. However as we also
1692 have to deal with the states array which is indexed by nodenum we have to
1693 use TRIE_NODENUM() to convert.
1696 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1697 "%*sCompiling trie using table compiler\n",
1698 (int)depth * 2 + 2, ""));
1700 trie->trans = (reg_trie_trans *)
1701 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1702 * trie->uniquecharcount + 1,
1703 sizeof(reg_trie_trans) );
1704 trie->states = (reg_trie_state *)
1705 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1706 sizeof(reg_trie_state) );
1707 next_alloc = trie->uniquecharcount + 1;
1710 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1712 regnode * const noper = NEXTOPER( cur );
1713 const U8 *uc = (U8*)STRING( noper );
1714 const U8 * const e = uc + STR_LEN( noper );
1716 U32 state = 1; /* required init */
1718 U16 charid = 0; /* sanity init */
1719 U32 accept_state = 0; /* sanity init */
1720 U8 *scan = (U8*)NULL; /* sanity init */
1722 STRLEN foldlen = 0; /* required init */
1723 U32 wordlen = 0; /* required init */
1724 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1726 if ( OP(noper) != NOTHING ) {
1727 for ( ; uc < e ; uc += len ) {
1732 charid = trie->charmap[ uvc ];
1734 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1735 charid = svpp ? (U16)SvIV(*svpp) : 0;
1739 if ( !trie->trans[ state + charid ].next ) {
1740 trie->trans[ state + charid ].next = next_alloc;
1741 trie->trans[ state ].check++;
1742 next_alloc += trie->uniquecharcount;
1744 state = trie->trans[ state + charid ].next;
1746 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1748 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1751 accept_state = TRIE_NODENUM( state );
1752 TRIE_HANDLE_WORD(accept_state);
1754 } /* end second pass */
1756 /* and now dump it out before we compress it */
1757 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1759 next_alloc, depth+1));
1763 * Inplace compress the table.*
1765 For sparse data sets the table constructed by the trie algorithm will
1766 be mostly 0/FAIL transitions or to put it another way mostly empty.
1767 (Note that leaf nodes will not contain any transitions.)
1769 This algorithm compresses the tables by eliminating most such
1770 transitions, at the cost of a modest bit of extra work during lookup:
1772 - Each states[] entry contains a .base field which indicates the
1773 index in the state[] array wheres its transition data is stored.
1775 - If .base is 0 there are no valid transitions from that node.
1777 - If .base is nonzero then charid is added to it to find an entry in
1780 -If trans[states[state].base+charid].check!=state then the
1781 transition is taken to be a 0/Fail transition. Thus if there are fail
1782 transitions at the front of the node then the .base offset will point
1783 somewhere inside the previous nodes data (or maybe even into a node
1784 even earlier), but the .check field determines if the transition is
1788 The following process inplace converts the table to the compressed
1789 table: We first do not compress the root node 1,and mark its all its
1790 .check pointers as 1 and set its .base pointer as 1 as well. This
1791 allows to do a DFA construction from the compressed table later, and
1792 ensures that any .base pointers we calculate later are greater than
1795 - We set 'pos' to indicate the first entry of the second node.
1797 - We then iterate over the columns of the node, finding the first and
1798 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1799 and set the .check pointers accordingly, and advance pos
1800 appropriately and repreat for the next node. Note that when we copy
1801 the next pointers we have to convert them from the original
1802 NODEIDX form to NODENUM form as the former is not valid post
1805 - If a node has no transitions used we mark its base as 0 and do not
1806 advance the pos pointer.
1808 - If a node only has one transition we use a second pointer into the
1809 structure to fill in allocated fail transitions from other states.
1810 This pointer is independent of the main pointer and scans forward
1811 looking for null transitions that are allocated to a state. When it
1812 finds one it writes the single transition into the "hole". If the
1813 pointer doesnt find one the single transition is appended as normal.
1815 - Once compressed we can Renew/realloc the structures to release the
1818 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1819 specifically Fig 3.47 and the associated pseudocode.
1823 const U32 laststate = TRIE_NODENUM( next_alloc );
1826 trie->statecount = laststate;
1828 for ( state = 1 ; state < laststate ; state++ ) {
1830 const U32 stateidx = TRIE_NODEIDX( state );
1831 const U32 o_used = trie->trans[ stateidx ].check;
1832 U32 used = trie->trans[ stateidx ].check;
1833 trie->trans[ stateidx ].check = 0;
1835 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1836 if ( flag || trie->trans[ stateidx + charid ].next ) {
1837 if ( trie->trans[ stateidx + charid ].next ) {
1839 for ( ; zp < pos ; zp++ ) {
1840 if ( ! trie->trans[ zp ].next ) {
1844 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1845 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1846 trie->trans[ zp ].check = state;
1847 if ( ++zp > pos ) pos = zp;
1854 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1856 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1857 trie->trans[ pos ].check = state;
1862 trie->lasttrans = pos + 1;
1863 trie->states = (reg_trie_state *)
1864 PerlMemShared_realloc( trie->states, laststate
1865 * sizeof(reg_trie_state) );
1866 DEBUG_TRIE_COMPILE_MORE_r(
1867 PerlIO_printf( Perl_debug_log,
1868 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1869 (int)depth * 2 + 2,"",
1870 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1873 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1876 } /* end table compress */
1878 DEBUG_TRIE_COMPILE_MORE_r(
1879 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1880 (int)depth * 2 + 2, "",
1881 (UV)trie->statecount,
1882 (UV)trie->lasttrans)
1884 /* resize the trans array to remove unused space */
1885 trie->trans = (reg_trie_trans *)
1886 PerlMemShared_realloc( trie->trans, trie->lasttrans
1887 * sizeof(reg_trie_trans) );
1889 /* and now dump out the compressed format */
1890 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
1892 { /* Modify the program and insert the new TRIE node*/
1893 U8 nodetype =(U8)(flags & 0xFF);
1897 regnode *optimize = NULL;
1898 #ifdef RE_TRACK_PATTERN_OFFSETS
1901 U32 mjd_nodelen = 0;
1902 #endif /* RE_TRACK_PATTERN_OFFSETS */
1903 #endif /* DEBUGGING */
1905 This means we convert either the first branch or the first Exact,
1906 depending on whether the thing following (in 'last') is a branch
1907 or not and whther first is the startbranch (ie is it a sub part of
1908 the alternation or is it the whole thing.)
1909 Assuming its a sub part we conver the EXACT otherwise we convert
1910 the whole branch sequence, including the first.
1912 /* Find the node we are going to overwrite */
1913 if ( first != startbranch || OP( last ) == BRANCH ) {
1914 /* branch sub-chain */
1915 NEXT_OFF( first ) = (U16)(last - first);
1916 #ifdef RE_TRACK_PATTERN_OFFSETS
1918 mjd_offset= Node_Offset((convert));
1919 mjd_nodelen= Node_Length((convert));
1922 /* whole branch chain */
1924 #ifdef RE_TRACK_PATTERN_OFFSETS
1927 const regnode *nop = NEXTOPER( convert );
1928 mjd_offset= Node_Offset((nop));
1929 mjd_nodelen= Node_Length((nop));
1933 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1934 (int)depth * 2 + 2, "",
1935 (UV)mjd_offset, (UV)mjd_nodelen)
1938 /* But first we check to see if there is a common prefix we can
1939 split out as an EXACT and put in front of the TRIE node. */
1940 trie->startstate= 1;
1941 if ( trie->bitmap && !widecharmap && !trie->jump ) {
1943 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
1947 const U32 base = trie->states[ state ].trans.base;
1949 if ( trie->states[state].wordnum )
1952 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1953 if ( ( base + ofs >= trie->uniquecharcount ) &&
1954 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1955 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1957 if ( ++count > 1 ) {
1958 SV **tmp = av_fetch( revcharmap, ofs, 0);
1959 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
1960 if ( state == 1 ) break;
1962 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1964 PerlIO_printf(Perl_debug_log,
1965 "%*sNew Start State=%"UVuf" Class: [",
1966 (int)depth * 2 + 2, "",
1969 SV ** const tmp = av_fetch( revcharmap, idx, 0);
1970 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
1972 TRIE_BITMAP_SET(trie,*ch);
1974 TRIE_BITMAP_SET(trie, folder[ *ch ]);
1976 PerlIO_printf(Perl_debug_log, (char*)ch)
1980 TRIE_BITMAP_SET(trie,*ch);
1982 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1983 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1989 SV **tmp = av_fetch( revcharmap, idx, 0);
1991 char *ch = SvPV( *tmp, len );
1993 SV *sv=sv_newmortal();
1994 PerlIO_printf( Perl_debug_log,
1995 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1996 (int)depth * 2 + 2, "",
1998 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
1999 PL_colors[0], PL_colors[1],
2000 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2001 PERL_PV_ESCAPE_FIRSTCHAR
2006 OP( convert ) = nodetype;
2007 str=STRING(convert);
2010 STR_LEN(convert) += len;
2016 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2022 regnode *n = convert+NODE_SZ_STR(convert);
2023 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2024 trie->startstate = state;
2025 trie->minlen -= (state - 1);
2026 trie->maxlen -= (state - 1);
2028 /* At least the UNICOS C compiler choked on this
2029 * being argument to DEBUG_r(), so let's just have
2032 #ifdef PERL_EXT_RE_BUILD
2038 regnode *fix = convert;
2039 U32 word = trie->wordcount;
2041 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2042 while( ++fix < n ) {
2043 Set_Node_Offset_Length(fix, 0, 0);
2046 SV ** const tmp = av_fetch( trie_words, word, 0 );
2048 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2049 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2051 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2059 NEXT_OFF(convert) = (U16)(tail - convert);
2060 DEBUG_r(optimize= n);
2066 if ( trie->maxlen ) {
2067 NEXT_OFF( convert ) = (U16)(tail - convert);
2068 ARG_SET( convert, data_slot );
2069 /* Store the offset to the first unabsorbed branch in
2070 jump[0], which is otherwise unused by the jump logic.
2071 We use this when dumping a trie and during optimisation. */
2073 trie->jump[0] = (U16)(nextbranch - convert);
2076 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
2077 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2079 OP( convert ) = TRIEC;
2080 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2081 PerlMemShared_free(trie->bitmap);
2084 OP( convert ) = TRIE;
2086 /* store the type in the flags */
2087 convert->flags = nodetype;
2091 + regarglen[ OP( convert ) ];
2093 /* XXX We really should free up the resource in trie now,
2094 as we won't use them - (which resources?) dmq */
2096 /* needed for dumping*/
2097 DEBUG_r(if (optimize) {
2098 regnode *opt = convert;
2100 while ( ++opt < optimize) {
2101 Set_Node_Offset_Length(opt,0,0);
2104 Try to clean up some of the debris left after the
2107 while( optimize < jumper ) {
2108 mjd_nodelen += Node_Length((optimize));
2109 OP( optimize ) = OPTIMIZED;
2110 Set_Node_Offset_Length(optimize,0,0);
2113 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2115 } /* end node insert */
2116 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2118 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2119 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2121 SvREFCNT_dec(revcharmap);
2125 : trie->startstate>1
2131 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2133 /* The Trie is constructed and compressed now so we can build a fail array now if its needed
2135 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2136 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2139 We find the fail state for each state in the trie, this state is the longest proper
2140 suffix of the current states 'word' that is also a proper prefix of another word in our
2141 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2142 the DFA not to have to restart after its tried and failed a word at a given point, it
2143 simply continues as though it had been matching the other word in the first place.
2145 'abcdgu'=~/abcdefg|cdgu/
2146 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2147 fail, which would bring use to the state representing 'd' in the second word where we would
2148 try 'g' and succeed, prodceding to match 'cdgu'.
2150 /* add a fail transition */
2151 const U32 trie_offset = ARG(source);
2152 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2154 const U32 ucharcount = trie->uniquecharcount;
2155 const U32 numstates = trie->statecount;
2156 const U32 ubound = trie->lasttrans + ucharcount;
2160 U32 base = trie->states[ 1 ].trans.base;
2163 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2164 GET_RE_DEBUG_FLAGS_DECL;
2166 PERL_UNUSED_ARG(depth);
2170 ARG_SET( stclass, data_slot );
2171 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2172 RExC_rxi->data->data[ data_slot ] = (void*)aho;
2173 aho->trie=trie_offset;
2174 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2175 Copy( trie->states, aho->states, numstates, reg_trie_state );
2176 Newxz( q, numstates, U32);
2177 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2180 /* initialize fail[0..1] to be 1 so that we always have
2181 a valid final fail state */
2182 fail[ 0 ] = fail[ 1 ] = 1;
2184 for ( charid = 0; charid < ucharcount ; charid++ ) {
2185 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2187 q[ q_write ] = newstate;
2188 /* set to point at the root */
2189 fail[ q[ q_write++ ] ]=1;
2192 while ( q_read < q_write) {
2193 const U32 cur = q[ q_read++ % numstates ];
2194 base = trie->states[ cur ].trans.base;
2196 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2197 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2199 U32 fail_state = cur;
2202 fail_state = fail[ fail_state ];
2203 fail_base = aho->states[ fail_state ].trans.base;
2204 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2206 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2207 fail[ ch_state ] = fail_state;
2208 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2210 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2212 q[ q_write++ % numstates] = ch_state;
2216 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2217 when we fail in state 1, this allows us to use the
2218 charclass scan to find a valid start char. This is based on the principle
2219 that theres a good chance the string being searched contains lots of stuff
2220 that cant be a start char.
2222 fail[ 0 ] = fail[ 1 ] = 0;
2223 DEBUG_TRIE_COMPILE_r({
2224 PerlIO_printf(Perl_debug_log,
2225 "%*sStclass Failtable (%"UVuf" states): 0",
2226 (int)(depth * 2), "", (UV)numstates
2228 for( q_read=1; q_read<numstates; q_read++ ) {
2229 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2231 PerlIO_printf(Perl_debug_log, "\n");
2234 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2239 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2240 * These need to be revisited when a newer toolchain becomes available.
2242 #if defined(__sparc64__) && defined(__GNUC__)
2243 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2244 # undef SPARC64_GCC_WORKAROUND
2245 # define SPARC64_GCC_WORKAROUND 1
2249 #define DEBUG_PEEP(str,scan,depth) \
2250 DEBUG_OPTIMISE_r({if (scan){ \
2251 SV * const mysv=sv_newmortal(); \
2252 regnode *Next = regnext(scan); \
2253 regprop(RExC_rx, mysv, scan); \
2254 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2255 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2256 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2263 #define JOIN_EXACT(scan,min,flags) \
2264 if (PL_regkind[OP(scan)] == EXACT) \
2265 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2268 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2269 /* Merge several consecutive EXACTish nodes into one. */
2270 regnode *n = regnext(scan);
2272 regnode *next = scan + NODE_SZ_STR(scan);
2276 regnode *stop = scan;
2277 GET_RE_DEBUG_FLAGS_DECL;
2279 PERL_UNUSED_ARG(depth);
2281 #ifndef EXPERIMENTAL_INPLACESCAN
2282 PERL_UNUSED_ARG(flags);
2283 PERL_UNUSED_ARG(val);
2285 DEBUG_PEEP("join",scan,depth);
2287 /* Skip NOTHING, merge EXACT*. */
2289 ( PL_regkind[OP(n)] == NOTHING ||
2290 (stringok && (OP(n) == OP(scan))))
2292 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2294 if (OP(n) == TAIL || n > next)
2296 if (PL_regkind[OP(n)] == NOTHING) {
2297 DEBUG_PEEP("skip:",n,depth);
2298 NEXT_OFF(scan) += NEXT_OFF(n);
2299 next = n + NODE_STEP_REGNODE;
2306 else if (stringok) {
2307 const unsigned int oldl = STR_LEN(scan);
2308 regnode * const nnext = regnext(n);
2310 DEBUG_PEEP("merg",n,depth);
2313 if (oldl + STR_LEN(n) > U8_MAX)
2315 NEXT_OFF(scan) += NEXT_OFF(n);
2316 STR_LEN(scan) += STR_LEN(n);
2317 next = n + NODE_SZ_STR(n);
2318 /* Now we can overwrite *n : */
2319 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2327 #ifdef EXPERIMENTAL_INPLACESCAN
2328 if (flags && !NEXT_OFF(n)) {
2329 DEBUG_PEEP("atch", val, depth);
2330 if (reg_off_by_arg[OP(n)]) {
2331 ARG_SET(n, val - n);
2334 NEXT_OFF(n) = val - n;
2341 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2343 Two problematic code points in Unicode casefolding of EXACT nodes:
2345 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2346 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2352 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2353 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2355 This means that in case-insensitive matching (or "loose matching",
2356 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2357 length of the above casefolded versions) can match a target string
2358 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2359 This would rather mess up the minimum length computation.
2361 What we'll do is to look for the tail four bytes, and then peek
2362 at the preceding two bytes to see whether we need to decrease
2363 the minimum length by four (six minus two).
2365 Thanks to the design of UTF-8, there cannot be false matches:
2366 A sequence of valid UTF-8 bytes cannot be a subsequence of
2367 another valid sequence of UTF-8 bytes.
2370 char * const s0 = STRING(scan), *s, *t;
2371 char * const s1 = s0 + STR_LEN(scan) - 1;
2372 char * const s2 = s1 - 4;
2373 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2374 const char t0[] = "\xaf\x49\xaf\x42";
2376 const char t0[] = "\xcc\x88\xcc\x81";
2378 const char * const t1 = t0 + 3;
2381 s < s2 && (t = ninstr(s, s1, t0, t1));
2384 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2385 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2387 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2388 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2396 n = scan + NODE_SZ_STR(scan);
2398 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2405 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2409 /* REx optimizer. Converts nodes into quickier variants "in place".
2410 Finds fixed substrings. */
2412 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2413 to the position after last scanned or to NULL. */
2415 #define INIT_AND_WITHP \
2416 assert(!and_withp); \
2417 Newx(and_withp,1,struct regnode_charclass_class); \
2418 SAVEFREEPV(and_withp)
2420 /* this is a chain of data about sub patterns we are processing that
2421 need to be handled seperately/specially in study_chunk. Its so
2422 we can simulate recursion without losing state. */
2424 typedef struct scan_frame {
2425 regnode *last; /* last node to process in this frame */
2426 regnode *next; /* next node to process when last is reached */
2427 struct scan_frame *prev; /*previous frame*/
2428 I32 stop; /* what stopparen do we use */
2432 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2434 #define CASE_SYNST_FNC(nAmE) \
2436 if (flags & SCF_DO_STCLASS_AND) { \
2437 for (value = 0; value < 256; value++) \
2438 if (!is_ ## nAmE ## _cp(value)) \
2439 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2442 for (value = 0; value < 256; value++) \
2443 if (is_ ## nAmE ## _cp(value)) \
2444 ANYOF_BITMAP_SET(data->start_class, value); \
2448 if (flags & SCF_DO_STCLASS_AND) { \
2449 for (value = 0; value < 256; value++) \
2450 if (is_ ## nAmE ## _cp(value)) \
2451 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2454 for (value = 0; value < 256; value++) \
2455 if (!is_ ## nAmE ## _cp(value)) \
2456 ANYOF_BITMAP_SET(data->start_class, value); \
2463 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2464 I32 *minlenp, I32 *deltap,
2469 struct regnode_charclass_class *and_withp,
2470 U32 flags, U32 depth)
2471 /* scanp: Start here (read-write). */
2472 /* deltap: Write maxlen-minlen here. */
2473 /* last: Stop before this one. */
2474 /* data: string data about the pattern */
2475 /* stopparen: treat close N as END */
2476 /* recursed: which subroutines have we recursed into */
2477 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2480 I32 min = 0, pars = 0, code;
2481 regnode *scan = *scanp, *next;
2483 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2484 int is_inf_internal = 0; /* The studied chunk is infinite */
2485 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2486 scan_data_t data_fake;
2487 SV *re_trie_maxbuff = NULL;
2488 regnode *first_non_open = scan;
2489 I32 stopmin = I32_MAX;
2490 scan_frame *frame = NULL;
2492 GET_RE_DEBUG_FLAGS_DECL;
2495 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2499 while (first_non_open && OP(first_non_open) == OPEN)
2500 first_non_open=regnext(first_non_open);
2505 while ( scan && OP(scan) != END && scan < last ){
2506 /* Peephole optimizer: */
2507 DEBUG_STUDYDATA("Peep:", data,depth);
2508 DEBUG_PEEP("Peep",scan,depth);
2509 JOIN_EXACT(scan,&min,0);
2511 /* Follow the next-chain of the current node and optimize
2512 away all the NOTHINGs from it. */
2513 if (OP(scan) != CURLYX) {
2514 const int max = (reg_off_by_arg[OP(scan)]
2516 /* I32 may be smaller than U16 on CRAYs! */
2517 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2518 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2522 /* Skip NOTHING and LONGJMP. */
2523 while ((n = regnext(n))
2524 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2525 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2526 && off + noff < max)
2528 if (reg_off_by_arg[OP(scan)])
2531 NEXT_OFF(scan) = off;
2536 /* The principal pseudo-switch. Cannot be a switch, since we
2537 look into several different things. */
2538 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2539 || OP(scan) == IFTHEN) {
2540 next = regnext(scan);
2542 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2544 if (OP(next) == code || code == IFTHEN) {
2545 /* NOTE - There is similar code to this block below for handling
2546 TRIE nodes on a re-study. If you change stuff here check there
2548 I32 max1 = 0, min1 = I32_MAX, num = 0;
2549 struct regnode_charclass_class accum;
2550 regnode * const startbranch=scan;
2552 if (flags & SCF_DO_SUBSTR)
2553 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2554 if (flags & SCF_DO_STCLASS)
2555 cl_init_zero(pRExC_state, &accum);
2557 while (OP(scan) == code) {
2558 I32 deltanext, minnext, f = 0, fake;
2559 struct regnode_charclass_class this_class;
2562 data_fake.flags = 0;
2564 data_fake.whilem_c = data->whilem_c;
2565 data_fake.last_closep = data->last_closep;
2568 data_fake.last_closep = &fake;
2570 data_fake.pos_delta = delta;
2571 next = regnext(scan);
2572 scan = NEXTOPER(scan);
2574 scan = NEXTOPER(scan);
2575 if (flags & SCF_DO_STCLASS) {
2576 cl_init(pRExC_state, &this_class);
2577 data_fake.start_class = &this_class;
2578 f = SCF_DO_STCLASS_AND;
2580 if (flags & SCF_WHILEM_VISITED_POS)
2581 f |= SCF_WHILEM_VISITED_POS;
2583 /* we suppose the run is continuous, last=next...*/
2584 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2586 stopparen, recursed, NULL, f,depth+1);
2589 if (max1 < minnext + deltanext)
2590 max1 = minnext + deltanext;
2591 if (deltanext == I32_MAX)
2592 is_inf = is_inf_internal = 1;
2594 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2596 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2597 if ( stopmin > minnext)
2598 stopmin = min + min1;
2599 flags &= ~SCF_DO_SUBSTR;
2601 data->flags |= SCF_SEEN_ACCEPT;
2604 if (data_fake.flags & SF_HAS_EVAL)
2605 data->flags |= SF_HAS_EVAL;
2606 data->whilem_c = data_fake.whilem_c;
2608 if (flags & SCF_DO_STCLASS)
2609 cl_or(pRExC_state, &accum, &this_class);
2611 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2613 if (flags & SCF_DO_SUBSTR) {
2614 data->pos_min += min1;
2615 data->pos_delta += max1 - min1;
2616 if (max1 != min1 || is_inf)
2617 data->longest = &(data->longest_float);
2620 delta += max1 - min1;
2621 if (flags & SCF_DO_STCLASS_OR) {
2622 cl_or(pRExC_state, data->start_class, &accum);
2624 cl_and(data->start_class, and_withp);
2625 flags &= ~SCF_DO_STCLASS;
2628 else if (flags & SCF_DO_STCLASS_AND) {
2630 cl_and(data->start_class, &accum);
2631 flags &= ~SCF_DO_STCLASS;
2634 /* Switch to OR mode: cache the old value of
2635 * data->start_class */
2637 StructCopy(data->start_class, and_withp,
2638 struct regnode_charclass_class);
2639 flags &= ~SCF_DO_STCLASS_AND;
2640 StructCopy(&accum, data->start_class,
2641 struct regnode_charclass_class);
2642 flags |= SCF_DO_STCLASS_OR;
2643 data->start_class->flags |= ANYOF_EOS;
2647 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2650 Assuming this was/is a branch we are dealing with: 'scan' now
2651 points at the item that follows the branch sequence, whatever
2652 it is. We now start at the beginning of the sequence and look
2659 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2661 If we can find such a subseqence we need to turn the first
2662 element into a trie and then add the subsequent branch exact
2663 strings to the trie.
2667 1. patterns where the whole set of branch can be converted.
2669 2. patterns where only a subset can be converted.
2671 In case 1 we can replace the whole set with a single regop
2672 for the trie. In case 2 we need to keep the start and end
2675 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2676 becomes BRANCH TRIE; BRANCH X;
2678 There is an additional case, that being where there is a
2679 common prefix, which gets split out into an EXACT like node
2680 preceding the TRIE node.
2682 If x(1..n)==tail then we can do a simple trie, if not we make
2683 a "jump" trie, such that when we match the appropriate word
2684 we "jump" to the appopriate tail node. Essentailly we turn
2685 a nested if into a case structure of sorts.
2690 if (!re_trie_maxbuff) {
2691 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2692 if (!SvIOK(re_trie_maxbuff))
2693 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2695 if ( SvIV(re_trie_maxbuff)>=0 ) {
2697 regnode *first = (regnode *)NULL;
2698 regnode *last = (regnode *)NULL;
2699 regnode *tail = scan;
2704 SV * const mysv = sv_newmortal(); /* for dumping */
2706 /* var tail is used because there may be a TAIL
2707 regop in the way. Ie, the exacts will point to the
2708 thing following the TAIL, but the last branch will
2709 point at the TAIL. So we advance tail. If we
2710 have nested (?:) we may have to move through several
2714 while ( OP( tail ) == TAIL ) {
2715 /* this is the TAIL generated by (?:) */
2716 tail = regnext( tail );
2721 regprop(RExC_rx, mysv, tail );
2722 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2723 (int)depth * 2 + 2, "",
2724 "Looking for TRIE'able sequences. Tail node is: ",
2725 SvPV_nolen_const( mysv )
2731 step through the branches, cur represents each
2732 branch, noper is the first thing to be matched
2733 as part of that branch and noper_next is the
2734 regnext() of that node. if noper is an EXACT
2735 and noper_next is the same as scan (our current
2736 position in the regex) then the EXACT branch is
2737 a possible optimization target. Once we have
2738 two or more consequetive such branches we can
2739 create a trie of the EXACT's contents and stich
2740 it in place. If the sequence represents all of
2741 the branches we eliminate the whole thing and
2742 replace it with a single TRIE. If it is a
2743 subsequence then we need to stitch it in. This
2744 means the first branch has to remain, and needs
2745 to be repointed at the item on the branch chain
2746 following the last branch optimized. This could
2747 be either a BRANCH, in which case the
2748 subsequence is internal, or it could be the
2749 item following the branch sequence in which
2750 case the subsequence is at the end.
2754 /* dont use tail as the end marker for this traverse */
2755 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2756 regnode * const noper = NEXTOPER( cur );
2757 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
2758 regnode * const noper_next = regnext( noper );
2762 regprop(RExC_rx, mysv, cur);
2763 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2764 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2766 regprop(RExC_rx, mysv, noper);
2767 PerlIO_printf( Perl_debug_log, " -> %s",
2768 SvPV_nolen_const(mysv));
2771 regprop(RExC_rx, mysv, noper_next );
2772 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2773 SvPV_nolen_const(mysv));
2775 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2776 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2778 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2779 : PL_regkind[ OP( noper ) ] == EXACT )
2780 || OP(noper) == NOTHING )
2782 && noper_next == tail
2787 if ( !first || optype == NOTHING ) {
2788 if (!first) first = cur;
2789 optype = OP( noper );
2795 Currently we assume that the trie can handle unicode and ascii
2796 matches fold cased matches. If this proves true then the following
2797 define will prevent tries in this situation.
2799 #define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
2801 #define TRIE_TYPE_IS_SAFE 1
2802 if ( last && TRIE_TYPE_IS_SAFE ) {
2803 make_trie( pRExC_state,
2804 startbranch, first, cur, tail, count,
2807 if ( PL_regkind[ OP( noper ) ] == EXACT
2809 && noper_next == tail
2814 optype = OP( noper );
2824 regprop(RExC_rx, mysv, cur);
2825 PerlIO_printf( Perl_debug_log,
2826 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2827 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2831 if ( last && TRIE_TYPE_IS_SAFE ) {
2832 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
2833 #ifdef TRIE_STUDY_OPT
2834 if ( ((made == MADE_EXACT_TRIE &&
2835 startbranch == first)
2836 || ( first_non_open == first )) &&
2838 flags |= SCF_TRIE_RESTUDY;
2839 if ( startbranch == first
2842 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2852 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2853 scan = NEXTOPER(NEXTOPER(scan));
2854 } else /* single branch is optimized. */
2855 scan = NEXTOPER(scan);
2857 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2858 scan_frame *newframe = NULL;
2863 if (OP(scan) != SUSPEND) {
2864 /* set the pointer */
2865 if (OP(scan) == GOSUB) {
2867 RExC_recurse[ARG2L(scan)] = scan;
2868 start = RExC_open_parens[paren-1];
2869 end = RExC_close_parens[paren-1];
2872 start = RExC_rxi->program + 1;
2876 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2877 SAVEFREEPV(recursed);
2879 if (!PAREN_TEST(recursed,paren+1)) {
2880 PAREN_SET(recursed,paren+1);
2881 Newx(newframe,1,scan_frame);
2883 if (flags & SCF_DO_SUBSTR) {
2884 SCAN_COMMIT(pRExC_state,data,minlenp);
2885 data->longest = &(data->longest_float);
2887 is_inf = is_inf_internal = 1;
2888 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2889 cl_anything(pRExC_state, data->start_class);
2890 flags &= ~SCF_DO_STCLASS;
2893 Newx(newframe,1,scan_frame);
2896 end = regnext(scan);
2901 SAVEFREEPV(newframe);
2902 newframe->next = regnext(scan);
2903 newframe->last = last;
2904 newframe->stop = stopparen;
2905 newframe->prev = frame;
2915 else if (OP(scan) == EXACT) {
2916 I32 l = STR_LEN(scan);
2919 const U8 * const s = (U8*)STRING(scan);
2920 l = utf8_length(s, s + l);
2921 uc = utf8_to_uvchr(s, NULL);
2923 uc = *((U8*)STRING(scan));
2926 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2927 /* The code below prefers earlier match for fixed
2928 offset, later match for variable offset. */
2929 if (data->last_end == -1) { /* Update the start info. */
2930 data->last_start_min = data->pos_min;
2931 data->last_start_max = is_inf
2932 ? I32_MAX : data->pos_min + data->pos_delta;
2934 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2936 SvUTF8_on(data->last_found);
2938 SV * const sv = data->last_found;
2939 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2940 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2941 if (mg && mg->mg_len >= 0)
2942 mg->mg_len += utf8_length((U8*)STRING(scan),
2943 (U8*)STRING(scan)+STR_LEN(scan));
2945 data->last_end = data->pos_min + l;
2946 data->pos_min += l; /* As in the first entry. */
2947 data->flags &= ~SF_BEFORE_EOL;
2949 if (flags & SCF_DO_STCLASS_AND) {
2950 /* Check whether it is compatible with what we know already! */
2954 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2955 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2956 && (!(data->start_class->flags & ANYOF_FOLD)
2957 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2960 ANYOF_CLASS_ZERO(data->start_class);
2961 ANYOF_BITMAP_ZERO(data->start_class);
2963 ANYOF_BITMAP_SET(data->start_class, uc);
2964 data->start_class->flags &= ~ANYOF_EOS;
2966 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2968 else if (flags & SCF_DO_STCLASS_OR) {
2969 /* false positive possible if the class is case-folded */
2971 ANYOF_BITMAP_SET(data->start_class, uc);
2973 data->start_class->flags |= ANYOF_UNICODE_ALL;
2974 data->start_class->flags &= ~ANYOF_EOS;
2975 cl_and(data->start_class, and_withp);
2977 flags &= ~SCF_DO_STCLASS;
2979 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
2980 I32 l = STR_LEN(scan);
2981 UV uc = *((U8*)STRING(scan));
2983 /* Search for fixed substrings supports EXACT only. */
2984 if (flags & SCF_DO_SUBSTR) {
2986 SCAN_COMMIT(pRExC_state, data, minlenp);
2989 const U8 * const s = (U8 *)STRING(scan);
2990 l = utf8_length(s, s + l);
2991 uc = utf8_to_uvchr(s, NULL);
2994 if (flags & SCF_DO_SUBSTR)
2996 if (flags & SCF_DO_STCLASS_AND) {
2997 /* Check whether it is compatible with what we know already! */
3001 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3002 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3003 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3005 ANYOF_CLASS_ZERO(data->start_class);
3006 ANYOF_BITMAP_ZERO(data->start_class);
3008 ANYOF_BITMAP_SET(data->start_class, uc);
3009 data->start_class->flags &= ~ANYOF_EOS;
3010 data->start_class->flags |= ANYOF_FOLD;
3011 if (OP(scan) == EXACTFL)
3012 data->start_class->flags |= ANYOF_LOCALE;
3015 else if (flags & SCF_DO_STCLASS_OR) {
3016 if (data->start_class->flags & ANYOF_FOLD) {
3017 /* false positive possible if the class is case-folded.
3018 Assume that the locale settings are the same... */
3020 ANYOF_BITMAP_SET(data->start_class, uc);
3021 data->start_class->flags &= ~ANYOF_EOS;
3023 cl_and(data->start_class, and_withp);
3025 flags &= ~SCF_DO_STCLASS;
3027 else if (strchr((const char*)PL_varies,OP(scan))) {
3028 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3029 I32 f = flags, pos_before = 0;
3030 regnode * const oscan = scan;
3031 struct regnode_charclass_class this_class;
3032 struct regnode_charclass_class *oclass = NULL;
3033 I32 next_is_eval = 0;
3035 switch (PL_regkind[OP(scan)]) {
3036 case WHILEM: /* End of (?:...)* . */
3037 scan = NEXTOPER(scan);
3040 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3041 next = NEXTOPER(scan);
3042 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3044 maxcount = REG_INFTY;
3045 next = regnext(scan);
3046 scan = NEXTOPER(scan);
3050 if (flags & SCF_DO_SUBSTR)
3055 if (flags & SCF_DO_STCLASS) {
3057 maxcount = REG_INFTY;
3058 next = regnext(scan);
3059 scan = NEXTOPER(scan);
3062 is_inf = is_inf_internal = 1;
3063 scan = regnext(scan);
3064 if (flags & SCF_DO_SUBSTR) {
3065 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3066 data->longest = &(data->longest_float);
3068 goto optimize_curly_tail;
3070 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3071 && (scan->flags == stopparen))
3076 mincount = ARG1(scan);
3077 maxcount = ARG2(scan);
3079 next = regnext(scan);
3080 if (OP(scan) == CURLYX) {
3081 I32 lp = (data ? *(data->last_closep) : 0);
3082 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3084 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3085 next_is_eval = (OP(scan) == EVAL);
3087 if (flags & SCF_DO_SUBSTR) {
3088 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3089 pos_before = data->pos_min;
3093 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3095 data->flags |= SF_IS_INF;
3097 if (flags & SCF_DO_STCLASS) {
3098 cl_init(pRExC_state, &this_class);
3099 oclass = data->start_class;
3100 data->start_class = &this_class;
3101 f |= SCF_DO_STCLASS_AND;
3102 f &= ~SCF_DO_STCLASS_OR;
3104 /* These are the cases when once a subexpression
3105 fails at a particular position, it cannot succeed
3106 even after backtracking at the enclosing scope.
3108 XXXX what if minimal match and we are at the
3109 initial run of {n,m}? */
3110 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
3111 f &= ~SCF_WHILEM_VISITED_POS;
3113 /* This will finish on WHILEM, setting scan, or on NULL: */
3114 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3115 last, data, stopparen, recursed, NULL,
3117 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3119 if (flags & SCF_DO_STCLASS)
3120 data->start_class = oclass;
3121 if (mincount == 0 || minnext == 0) {
3122 if (flags & SCF_DO_STCLASS_OR) {
3123 cl_or(pRExC_state, data->start_class, &this_class);
3125 else if (flags & SCF_DO_STCLASS_AND) {
3126 /* Switch to OR mode: cache the old value of
3127 * data->start_class */
3129 StructCopy(data->start_class, and_withp,
3130 struct regnode_charclass_class);
3131 flags &= ~SCF_DO_STCLASS_AND;
3132 StructCopy(&this_class, data->start_class,
3133 struct regnode_charclass_class);
3134 flags |= SCF_DO_STCLASS_OR;
3135 data->start_class->flags |= ANYOF_EOS;
3137 } else { /* Non-zero len */
3138 if (flags & SCF_DO_STCLASS_OR) {
3139 cl_or(pRExC_state, data->start_class, &this_class);
3140 cl_and(data->start_class, and_withp);
3142 else if (flags & SCF_DO_STCLASS_AND)
3143 cl_and(data->start_class, &this_class);
3144 flags &= ~SCF_DO_STCLASS;
3146 if (!scan) /* It was not CURLYX, but CURLY. */
3148 if ( /* ? quantifier ok, except for (?{ ... }) */
3149 (next_is_eval || !(mincount == 0 && maxcount == 1))
3150 && (minnext == 0) && (deltanext == 0)
3151 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3152 && maxcount <= REG_INFTY/3 /* Complement check for big count */
3153 && ckWARN(WARN_REGEXP))
3156 "Quantifier unexpected on zero-length expression");
3159 min += minnext * mincount;
3160 is_inf_internal |= ((maxcount == REG_INFTY
3161 && (minnext + deltanext) > 0)
3162 || deltanext == I32_MAX);
3163 is_inf |= is_inf_internal;
3164 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3166 /* Try powerful optimization CURLYX => CURLYN. */
3167 if ( OP(oscan) == CURLYX && data
3168 && data->flags & SF_IN_PAR
3169 && !(data->flags & SF_HAS_EVAL)
3170 && !deltanext && minnext == 1 ) {
3171 /* Try to optimize to CURLYN. */
3172 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3173 regnode * const nxt1 = nxt;
3180 if (!strchr((const char*)PL_simple,OP(nxt))
3181 && !(PL_regkind[OP(nxt)] == EXACT
3182 && STR_LEN(nxt) == 1))
3188 if (OP(nxt) != CLOSE)
3190 if (RExC_open_parens) {
3191 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3192 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3194 /* Now we know that nxt2 is the only contents: */
3195 oscan->flags = (U8)ARG(nxt);
3197 OP(nxt1) = NOTHING; /* was OPEN. */
3200 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3201 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
3202 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
3203 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3204 OP(nxt + 1) = OPTIMIZED; /* was count. */
3205 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
3210 /* Try optimization CURLYX => CURLYM. */
3211 if ( OP(oscan) == CURLYX && data
3212 && !(data->flags & SF_HAS_PAR)
3213 && !(data->flags & SF_HAS_EVAL)
3214 && !deltanext /* atom is fixed width */
3215 && minnext != 0 /* CURLYM can't handle zero width */
3217 /* XXXX How to optimize if data == 0? */
3218 /* Optimize to a simpler form. */
3219 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3223 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3224 && (OP(nxt2) != WHILEM))
3226 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3227 /* Need to optimize away parenths. */
3228 if (data->flags & SF_IN_PAR) {
3229 /* Set the parenth number. */
3230 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3232 if (OP(nxt) != CLOSE)
3233 FAIL("Panic opt close");
3234 oscan->flags = (U8)ARG(nxt);
3235 if (RExC_open_parens) {
3236 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3237 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3239 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3240 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3243 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3244 OP(nxt + 1) = OPTIMIZED; /* was count. */
3245 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3246 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
3249 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3250 regnode *nnxt = regnext(nxt1);
3253 if (reg_off_by_arg[OP(nxt1)])
3254 ARG_SET(nxt1, nxt2 - nxt1);
3255 else if (nxt2 - nxt1 < U16_MAX)
3256 NEXT_OFF(nxt1) = nxt2 - nxt1;
3258 OP(nxt) = NOTHING; /* Cannot beautify */
3263 /* Optimize again: */
3264 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3265 NULL, stopparen, recursed, NULL, 0,depth+1);
3270 else if ((OP(oscan) == CURLYX)
3271 && (flags & SCF_WHILEM_VISITED_POS)
3272 /* See the comment on a similar expression above.
3273 However, this time it not a subexpression
3274 we care about, but the expression itself. */
3275 && (maxcount == REG_INFTY)
3276 && data && ++data->whilem_c < 16) {
3277 /* This stays as CURLYX, we can put the count/of pair. */
3278 /* Find WHILEM (as in regexec.c) */
3279 regnode *nxt = oscan + NEXT_OFF(oscan);
3281 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3283 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3284 | (RExC_whilem_seen << 4)); /* On WHILEM */
3286 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3288 if (flags & SCF_DO_SUBSTR) {
3289 SV *last_str = NULL;
3290 int counted = mincount != 0;
3292 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3293 #if defined(SPARC64_GCC_WORKAROUND)
3296 const char *s = NULL;
3299 if (pos_before >= data->last_start_min)
3302 b = data->last_start_min;
3305 s = SvPV_const(data->last_found, l);
3306 old = b - data->last_start_min;
3309 I32 b = pos_before >= data->last_start_min
3310 ? pos_before : data->last_start_min;
3312 const char * const s = SvPV_const(data->last_found, l);
3313 I32 old = b - data->last_start_min;
3317 old = utf8_hop((U8*)s, old) - (U8*)s;
3320 /* Get the added string: */
3321 last_str = newSVpvn(s + old, l);
3323 SvUTF8_on(last_str);
3324 if (deltanext == 0 && pos_before == b) {
3325 /* What was added is a constant string */
3327 SvGROW(last_str, (mincount * l) + 1);
3328 repeatcpy(SvPVX(last_str) + l,
3329 SvPVX_const(last_str), l, mincount - 1);
3330 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3331 /* Add additional parts. */
3332 SvCUR_set(data->last_found,
3333 SvCUR(data->last_found) - l);
3334 sv_catsv(data->last_found, last_str);
3336 SV * sv = data->last_found;
3338 SvUTF8(sv) && SvMAGICAL(sv) ?
3339 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3340 if (mg && mg->mg_len >= 0)
3341 mg->mg_len += CHR_SVLEN(last_str) - l;
3343 data->last_end += l * (mincount - 1);
3346 /* start offset must point into the last copy */
3347 data->last_start_min += minnext * (mincount - 1);
3348 data->last_start_max += is_inf ? I32_MAX
3349 : (maxcount - 1) * (minnext + data->pos_delta);
3352 /* It is counted once already... */
3353 data->pos_min += minnext * (mincount - counted);
3354 data->pos_delta += - counted * deltanext +
3355 (minnext + deltanext) * maxcount - minnext * mincount;
3356 if (mincount != maxcount) {
3357 /* Cannot extend fixed substrings found inside
3359 SCAN_COMMIT(pRExC_state,data,minlenp);
3360 if (mincount && last_str) {
3361 SV * const sv = data->last_found;
3362 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3363 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3367 sv_setsv(sv, last_str);
3368 data->last_end = data->pos_min;
3369 data->last_start_min =
3370 data->pos_min - CHR_SVLEN(last_str);
3371 data->last_start_max = is_inf
3373 : data->pos_min + data->pos_delta
3374 - CHR_SVLEN(last_str);
3376 data->longest = &(data->longest_float);
3378 SvREFCNT_dec(last_str);
3380 if (data && (fl & SF_HAS_EVAL))
3381 data->flags |= SF_HAS_EVAL;
3382 optimize_curly_tail:
3383 if (OP(oscan) != CURLYX) {
3384 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3386 NEXT_OFF(oscan) += NEXT_OFF(next);
3389 default: /* REF and CLUMP only? */
3390 if (flags & SCF_DO_SUBSTR) {
3391 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3392 data->longest = &(data->longest_float);
3394 is_inf = is_inf_internal = 1;
3395 if (flags & SCF_DO_STCLASS_OR)
3396 cl_anything(pRExC_state, data->start_class);
3397 flags &= ~SCF_DO_STCLASS;
3401 else if (OP(scan) == LNBREAK) {
3402 if (flags & SCF_DO_STCLASS) {
3404 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3405 if (flags & SCF_DO_STCLASS_AND) {
3406 for (value = 0; value < 256; value++)
3407 if (!is_VERTWS_cp(value))
3408 ANYOF_BITMAP_CLEAR(data->start_class, value);
3411 for (value = 0; value < 256; value++)
3412 if (is_VERTWS_cp(value))
3413 ANYOF_BITMAP_SET(data->start_class, value);
3415 if (flags & SCF_DO_STCLASS_OR)
3416 cl_and(data->start_class, and_withp);
3417 flags &= ~SCF_DO_STCLASS;
3421 if (flags & SCF_DO_SUBSTR) {
3422 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3424 data->pos_delta += 1;
3425 data->longest = &(data->longest_float);
3429 else if (OP(scan) == FOLDCHAR) {
3430 int d = ARG(scan)==0xDF ? 1 : 2;
3431 flags &= ~SCF_DO_STCLASS;
3434 if (flags & SCF_DO_SUBSTR) {
3435 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3437 data->pos_delta += d;
3438 data->longest = &(data->longest_float);
3441 else if (strchr((const char*)PL_simple,OP(scan))) {
3444 if (flags & SCF_DO_SUBSTR) {
3445 SCAN_COMMIT(pRExC_state,data,minlenp);
3449 if (flags & SCF_DO_STCLASS) {
3450 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3452 /* Some of the logic below assumes that switching
3453 locale on will only add false positives. */
3454 switch (PL_regkind[OP(scan)]) {
3458 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3459 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3460 cl_anything(pRExC_state, data->start_class);
3463 if (OP(scan) == SANY)
3465 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3466 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3467 || (data->start_class->flags & ANYOF_CLASS));
3468 cl_anything(pRExC_state, data->start_class);
3470 if (flags & SCF_DO_STCLASS_AND || !value)
3471 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3474 if (flags & SCF_DO_STCLASS_AND)
3475 cl_and(data->start_class,
3476 (struct regnode_charclass_class*)scan);
3478 cl_or(pRExC_state, data->start_class,
3479 (struct regnode_charclass_class*)scan);
3482 if (flags & SCF_DO_STCLASS_AND) {
3483 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3484 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3485 for (value = 0; value < 256; value++)
3486 if (!isALNUM(value))
3487 ANYOF_BITMAP_CLEAR(data->start_class, value);
3491 if (data->start_class->flags & ANYOF_LOCALE)
3492 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3494 for (value = 0; value < 256; value++)
3496 ANYOF_BITMAP_SET(data->start_class, value);
3501 if (flags & SCF_DO_STCLASS_AND) {
3502 if (data->start_class->flags & ANYOF_LOCALE)
3503 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3506 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3507 data->start_class->flags |= ANYOF_LOCALE;
3511 if (flags & SCF_DO_STCLASS_AND) {
3512 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3513 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3514 for (value = 0; value < 256; value++)
3516 ANYOF_BITMAP_CLEAR(data->start_class, value);
3520 if (data->start_class->flags & ANYOF_LOCALE)
3521 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3523 for (value = 0; value < 256; value++)
3524 if (!isALNUM(value))
3525 ANYOF_BITMAP_SET(data->start_class, value);
3530 if (flags & SCF_DO_STCLASS_AND) {
3531 if (data->start_class->flags & ANYOF_LOCALE)
3532 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3535 data->start_class->flags |= ANYOF_LOCALE;
3536 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3540 if (flags & SCF_DO_STCLASS_AND) {
3541 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3542 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3543 for (value = 0; value < 256; value++)
3544 if (!isSPACE(value))
3545 ANYOF_BITMAP_CLEAR(data->start_class, value);
3549 if (data->start_class->flags & ANYOF_LOCALE)
3550 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3552 for (value = 0; value < 256; value++)
3554 ANYOF_BITMAP_SET(data->start_class, value);
3559 if (flags & SCF_DO_STCLASS_AND) {
3560 if (data->start_class->flags & ANYOF_LOCALE)
3561 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3564 data->start_class->flags |= ANYOF_LOCALE;
3565 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3569 if (flags & SCF_DO_STCLASS_AND) {
3570 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3571 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3572 for (value = 0; value < 256; value++)
3574 ANYOF_BITMAP_CLEAR(data->start_class, value);
3578 if (data->start_class->flags & ANYOF_LOCALE)
3579 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3581 for (value = 0; value < 256; value++)
3582 if (!isSPACE(value))
3583 ANYOF_BITMAP_SET(data->start_class, value);
3588 if (flags & SCF_DO_STCLASS_AND) {
3589 if (data->start_class->flags & ANYOF_LOCALE) {
3590 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3591 for (value = 0; value < 256; value++)
3592 if (!isSPACE(value))
3593 ANYOF_BITMAP_CLEAR(data->start_class, value);
3597 data->start_class->flags |= ANYOF_LOCALE;
3598 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3602 if (flags & SCF_DO_STCLASS_AND) {
3603 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3604 for (value = 0; value < 256; value++)
3605 if (!isDIGIT(value))
3606 ANYOF_BITMAP_CLEAR(data->start_class, value);
3609 if (data->start_class->flags & ANYOF_LOCALE)
3610 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3612 for (value = 0; value < 256; value++)
3614 ANYOF_BITMAP_SET(data->start_class, value);
3619 if (flags & SCF_DO_STCLASS_AND) {
3620 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3621 for (value = 0; value < 256; value++)
3623 ANYOF_BITMAP_CLEAR(data->start_class, value);
3626 if (data->start_class->flags & ANYOF_LOCALE)
3627 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3629 for (value = 0; value < 256; value++)
3630 if (!isDIGIT(value))
3631 ANYOF_BITMAP_SET(data->start_class, value);
3635 CASE_SYNST_FNC(VERTWS);
3636 CASE_SYNST_FNC(HORIZWS);
3639 if (flags & SCF_DO_STCLASS_OR)
3640 cl_and(data->start_class, and_withp);
3641 flags &= ~SCF_DO_STCLASS;
3644 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3645 data->flags |= (OP(scan) == MEOL
3649 else if ( PL_regkind[OP(scan)] == BRANCHJ
3650 /* Lookbehind, or need to calculate parens/evals/stclass: */
3651 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3652 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3653 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3654 || OP(scan) == UNLESSM )
3656 /* Negative Lookahead/lookbehind
3657 In this case we can't do fixed string optimisation.
3660 I32 deltanext, minnext, fake = 0;
3662 struct regnode_charclass_class intrnl;
3665 data_fake.flags = 0;
3667 data_fake.whilem_c = data->whilem_c;
3668 data_fake.last_closep = data->last_closep;
3671 data_fake.last_closep = &fake;
3672 data_fake.pos_delta = delta;
3673 if ( flags & SCF_DO_STCLASS && !scan->flags
3674 && OP(scan) == IFMATCH ) { /* Lookahead */
3675 cl_init(pRExC_state, &intrnl);
3676 data_fake.start_class = &intrnl;
3677 f |= SCF_DO_STCLASS_AND;
3679 if (flags & SCF_WHILEM_VISITED_POS)
3680 f |= SCF_WHILEM_VISITED_POS;
3681 next = regnext(scan);
3682 nscan = NEXTOPER(NEXTOPER(scan));
3683 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3684 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3687 FAIL("Variable length lookbehind not implemented");
3689 else if (minnext > (I32)U8_MAX) {
3690 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3692 scan->flags = (U8)minnext;
3695 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3697 if (data_fake.flags & SF_HAS_EVAL)
3698 data->flags |= SF_HAS_EVAL;
3699 data->whilem_c = data_fake.whilem_c;
3701 if (f & SCF_DO_STCLASS_AND) {
3702 const int was = (data->start_class->flags & ANYOF_EOS);
3704 cl_and(data->start_class, &intrnl);
3706 data->start_class->flags |= ANYOF_EOS;
3709 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3711 /* Positive Lookahead/lookbehind
3712 In this case we can do fixed string optimisation,
3713 but we must be careful about it. Note in the case of
3714 lookbehind the positions will be offset by the minimum
3715 length of the pattern, something we won't know about
3716 until after the recurse.
3718 I32 deltanext, fake = 0;
3720 struct regnode_charclass_class intrnl;
3722 /* We use SAVEFREEPV so that when the full compile
3723 is finished perl will clean up the allocated
3724 minlens when its all done. This was we don't
3725 have to worry about freeing them when we know
3726 they wont be used, which would be a pain.
3729 Newx( minnextp, 1, I32 );
3730 SAVEFREEPV(minnextp);
3733 StructCopy(data, &data_fake, scan_data_t);
3734 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3737 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
3738 data_fake.last_found=newSVsv(data->last_found);
3742 data_fake.last_closep = &fake;
3743 data_fake.flags = 0;
3744 data_fake.pos_delta = delta;
3746 data_fake.flags |= SF_IS_INF;
3747 if ( flags & SCF_DO_STCLASS && !scan->flags
3748 && OP(scan) == IFMATCH ) { /* Lookahead */
3749 cl_init(pRExC_state, &intrnl);
3750 data_fake.start_class = &intrnl;
3751 f |= SCF_DO_STCLASS_AND;
3753 if (flags & SCF_WHILEM_VISITED_POS)
3754 f |= SCF_WHILEM_VISITED_POS;
3755 next = regnext(scan);
3756 nscan = NEXTOPER(NEXTOPER(scan));
3758 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3759 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3762 FAIL("Variable length lookbehind not implemented");
3764 else if (*minnextp > (I32)U8_MAX) {
3765 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3767 scan->flags = (U8)*minnextp;
3772 if (f & SCF_DO_STCLASS_AND) {
3773 const int was = (data->start_class->flags & ANYOF_EOS);
3775 cl_and(data->start_class, &intrnl);
3777 data->start_class->flags |= ANYOF_EOS;
3780 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3782 if (data_fake.flags & SF_HAS_EVAL)
3783 data->flags |= SF_HAS_EVAL;
3784 data->whilem_c = data_fake.whilem_c;
3785 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3786 if (RExC_rx->minlen<*minnextp)
3787 RExC_rx->minlen=*minnextp;
3788 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
3789 SvREFCNT_dec(data_fake.last_found);
3791 if ( data_fake.minlen_fixed != minlenp )
3793 data->offset_fixed= data_fake.offset_fixed;
3794 data->minlen_fixed= data_fake.minlen_fixed;
3795 data->lookbehind_fixed+= scan->flags;
3797 if ( data_fake.minlen_float != minlenp )
3799 data->minlen_float= data_fake.minlen_float;
3800 data->offset_float_min=data_fake.offset_float_min;
3801 data->offset_float_max=data_fake.offset_float_max;
3802 data->lookbehind_float+= scan->flags;
3811 else if (OP(scan) == OPEN) {
3812 if (stopparen != (I32)ARG(scan))
3815 else if (OP(scan) == CLOSE) {
3816 if (stopparen == (I32)ARG(scan)) {
3819 if ((I32)ARG(scan) == is_par) {
3820 next = regnext(scan);
3822 if ( next && (OP(next) != WHILEM) && next < last)
3823 is_par = 0; /* Disable optimization */
3826 *(data->last_closep) = ARG(scan);
3828 else if (OP(scan) == EVAL) {
3830 data->flags |= SF_HAS_EVAL;
3832 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3833 if (flags & SCF_DO_SUBSTR) {
3834 SCAN_COMMIT(pRExC_state,data,minlenp);
3835 flags &= ~SCF_DO_SUBSTR;
3837 if (data && OP(scan)==ACCEPT) {
3838 data->flags |= SCF_SEEN_ACCEPT;
3843 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3845 if (flags & SCF_DO_SUBSTR) {
3846 SCAN_COMMIT(pRExC_state,data,minlenp);
3847 data->longest = &(data->longest_float);
3849 is_inf = is_inf_internal = 1;
3850 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3851 cl_anything(pRExC_state, data->start_class);
3852 flags &= ~SCF_DO_STCLASS;
3854 else if (OP(scan) == GPOS) {
3855 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
3856 !(delta || is_inf || (data && data->pos_delta)))
3858 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
3859 RExC_rx->extflags |= RXf_ANCH_GPOS;
3860 if (RExC_rx->gofs < (U32)min)
3861 RExC_rx->gofs = min;
3863 RExC_rx->extflags |= RXf_GPOS_FLOAT;
3867 #ifdef TRIE_STUDY_OPT
3868 #ifdef FULL_TRIE_STUDY
3869 else if (PL_regkind[OP(scan)] == TRIE) {
3870 /* NOTE - There is similar code to this block above for handling
3871 BRANCH nodes on the initial study. If you change stuff here
3873 regnode *trie_node= scan;
3874 regnode *tail= regnext(scan);
3875 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
3876 I32 max1 = 0, min1 = I32_MAX;
3877 struct regnode_charclass_class accum;
3879 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
3880 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
3881 if (flags & SCF_DO_STCLASS)
3882 cl_init_zero(pRExC_state, &accum);
3888 const regnode *nextbranch= NULL;
3891 for ( word=1 ; word <= trie->wordcount ; word++)
3893 I32 deltanext=0, minnext=0, f = 0, fake;
3894 struct regnode_charclass_class this_class;
3896 data_fake.flags = 0;
3898 data_fake.whilem_c = data->whilem_c;
3899 data_fake.last_closep = data->last_closep;
3902 data_fake.last_closep = &fake;
3903 data_fake.pos_delta = delta;
3904 if (flags & SCF_DO_STCLASS) {
3905 cl_init(pRExC_state, &this_class);
3906 data_fake.start_class = &this_class;
3907 f = SCF_DO_STCLASS_AND;
3909 if (flags & SCF_WHILEM_VISITED_POS)
3910 f |= SCF_WHILEM_VISITED_POS;
3912 if (trie->jump[word]) {
3914 nextbranch = trie_node + trie->jump[0];
3915 scan= trie_node + trie->jump[word];
3916 /* We go from the jump point to the branch that follows
3917 it. Note this means we need the vestigal unused branches
3918 even though they arent otherwise used.
3920 minnext = study_chunk(pRExC_state, &scan, minlenp,
3921 &deltanext, (regnode *)nextbranch, &data_fake,
3922 stopparen, recursed, NULL, f,depth+1);
3924 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3925 nextbranch= regnext((regnode*)nextbranch);
3927 if (min1 > (I32)(minnext + trie->minlen))
3928 min1 = minnext + trie->minlen;
3929 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3930 max1 = minnext + deltanext + trie->maxlen;
3931 if (deltanext == I32_MAX)
3932 is_inf = is_inf_internal = 1;
3934 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3936 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3937 if ( stopmin > min + min1)
3938 stopmin = min + min1;
3939 flags &= ~SCF_DO_SUBSTR;
3941 data->flags |= SCF_SEEN_ACCEPT;
3944 if (data_fake.flags & SF_HAS_EVAL)
3945 data->flags |= SF_HAS_EVAL;
3946 data->whilem_c = data_fake.whilem_c;
3948 if (flags & SCF_DO_STCLASS)
3949 cl_or(pRExC_state, &accum, &this_class);
3952 if (flags & SCF_DO_SUBSTR) {
3953 data->pos_min += min1;
3954 data->pos_delta += max1 - min1;
3955 if (max1 != min1 || is_inf)
3956 data->longest = &(data->longest_float);
3959 delta += max1 - min1;
3960 if (flags & SCF_DO_STCLASS_OR) {
3961 cl_or(pRExC_state, data->start_class, &accum);
3963 cl_and(data->start_class, and_withp);
3964 flags &= ~SCF_DO_STCLASS;
3967 else if (flags & SCF_DO_STCLASS_AND) {
3969 cl_and(data->start_class, &accum);
3970 flags &= ~SCF_DO_STCLASS;
3973 /* Switch to OR mode: cache the old value of
3974 * data->start_class */
3976 StructCopy(data->start_class, and_withp,
3977 struct regnode_charclass_class);
3978 flags &= ~SCF_DO_STCLASS_AND;
3979 StructCopy(&accum, data->start_class,
3980 struct regnode_charclass_class);
3981 flags |= SCF_DO_STCLASS_OR;
3982 data->start_class->flags |= ANYOF_EOS;
3989 else if (PL_regkind[OP(scan)] == TRIE) {
3990 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
3993 min += trie->minlen;
3994 delta += (trie->maxlen - trie->minlen);
3995 flags &= ~SCF_DO_STCLASS; /* xxx */
3996 if (flags & SCF_DO_SUBSTR) {
3997 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3998 data->pos_min += trie->minlen;
3999 data->pos_delta += (trie->maxlen - trie->minlen);
4000 if (trie->maxlen != trie->minlen)
4001 data->longest = &(data->longest_float);
4003 if (trie->jump) /* no more substrings -- for now /grr*/
4004 flags &= ~SCF_DO_SUBSTR;
4006 #endif /* old or new */
4007 #endif /* TRIE_STUDY_OPT */
4009 /* Else: zero-length, ignore. */
4010 scan = regnext(scan);
4015 stopparen = frame->stop;
4016 frame = frame->prev;
4017 goto fake_study_recurse;
4022 DEBUG_STUDYDATA("pre-fin:",data,depth);
4025 *deltap = is_inf_internal ? I32_MAX : delta;
4026 if (flags & SCF_DO_SUBSTR && is_inf)
4027 data->pos_delta = I32_MAX - data->pos_min;
4028 if (is_par > (I32)U8_MAX)
4030 if (is_par && pars==1 && data) {
4031 data->flags |= SF_IN_PAR;
4032 data->flags &= ~SF_HAS_PAR;
4034 else if (pars && data) {
4035 data->flags |= SF_HAS_PAR;
4036 data->flags &= ~SF_IN_PAR;
4038 if (flags & SCF_DO_STCLASS_OR)
4039 cl_and(data->start_class, and_withp);
4040 if (flags & SCF_TRIE_RESTUDY)
4041 data->flags |= SCF_TRIE_RESTUDY;
4043 DEBUG_STUDYDATA("post-fin:",data,depth);
4045 return min < stopmin ? min : stopmin;
4049 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4051 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4053 Renewc(RExC_rxi->data,
4054 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4055 char, struct reg_data);
4057 Renew(RExC_rxi->data->what, count + n, U8);
4059 Newx(RExC_rxi->data->what, n, U8);
4060 RExC_rxi->data->count = count + n;
4061 Copy(s, RExC_rxi->data->what + count, n, U8);
4065 /*XXX: todo make this not included in a non debugging perl */
4066 #ifndef PERL_IN_XSUB_RE
4068 Perl_reginitcolors(pTHX)
4071 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4073 char *t = savepv(s);
4077 t = strchr(t, '\t');
4083 PL_colors[i] = t = (char *)"";
4088 PL_colors[i++] = (char *)"";
4095 #ifdef TRIE_STUDY_OPT
4096 #define CHECK_RESTUDY_GOTO \
4098 (data.flags & SCF_TRIE_RESTUDY) \
4102 #define CHECK_RESTUDY_GOTO
4106 - pregcomp - compile a regular expression into internal code
4108 * We can't allocate space until we know how big the compiled form will be,
4109 * but we can't compile it (and thus know how big it is) until we've got a
4110 * place to put the code. So we cheat: we compile it twice, once with code
4111 * generation turned off and size counting turned on, and once "for real".
4112 * This also means that we don't allocate space until we are sure that the
4113 * thing really will compile successfully, and we never have to move the
4114 * code and thus invalidate pointers into it. (Note that it has to be in
4115 * one piece because free() must be able to free it all.) [NB: not true in perl]
4117 * Beware that the optimization-preparation code in here knows about some
4118 * of the structure of the compiled regexp. [I'll say.]
4123 #ifndef PERL_IN_XSUB_RE
4124 #define RE_ENGINE_PTR &PL_core_reg_engine
4126 extern const struct regexp_engine my_reg_engine;
4127 #define RE_ENGINE_PTR &my_reg_engine
4130 #ifndef PERL_IN_XSUB_RE
4132 Perl_pregcomp(pTHX_ const SV * const pattern, const U32 flags)
4135 HV * const table = GvHV(PL_hintgv);
4136 /* Dispatch a request to compile a regexp to correct
4139 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
4140 GET_RE_DEBUG_FLAGS_DECL;
4141 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
4142 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4144 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4147 return CALLREGCOMP_ENG(eng, pattern, flags);
4150 return Perl_re_compile(aTHX_ pattern, flags);
4155 Perl_re_compile(pTHX_ const SV * const pattern, const U32 pm_flags)
4159 register regexp_internal *ri;
4161 char* exp = SvPV((SV*)pattern, plen);
4162 char* xend = exp + plen;
4169 RExC_state_t RExC_state;
4170 RExC_state_t * const pRExC_state = &RExC_state;
4171 #ifdef TRIE_STUDY_OPT
4173 RExC_state_t copyRExC_state;
4175 GET_RE_DEBUG_FLAGS_DECL;
4176 DEBUG_r(if (!PL_colorset) reginitcolors());
4178 RExC_utf8 = RExC_orig_utf8 = pm_flags & RXf_UTF8;
4181 SV *dsv= sv_newmortal();
4182 RE_PV_QUOTED_DECL(s, RExC_utf8,
4183 dsv, exp, plen, 60);
4184 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4185 PL_colors[4],PL_colors[5],s);
4190 RExC_flags = pm_flags;
4194 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4195 RExC_seen_evals = 0;
4198 /* First pass: determine size, legality. */
4206 RExC_emit = &PL_regdummy;
4207 RExC_whilem_seen = 0;
4208 RExC_charnames = NULL;
4209 RExC_open_parens = NULL;
4210 RExC_close_parens = NULL;
4212 RExC_paren_names = NULL;
4214 RExC_paren_name_list = NULL;
4216 RExC_recurse = NULL;
4217 RExC_recurse_count = 0;
4219 #if 0 /* REGC() is (currently) a NOP at the first pass.
4220 * Clever compilers notice this and complain. --jhi */
4221 REGC((U8)REG_MAGIC, (char*)RExC_emit);
4223 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4224 if (reg(pRExC_state, 0, &flags,1) == NULL) {
4225 RExC_precomp = NULL;
4228 if (RExC_utf8 && !RExC_orig_utf8) {
4229 /* It's possible to write a regexp in ascii that represents Unicode
4230 codepoints outside of the byte range, such as via \x{100}. If we
4231 detect such a sequence we have to convert the entire pattern to utf8
4232 and then recompile, as our sizing calculation will have been based
4233 on 1 byte == 1 character, but we will need to use utf8 to encode
4234 at least some part of the pattern, and therefore must convert the whole
4236 XXX: somehow figure out how to make this less expensive...
4239 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4240 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4241 exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)exp, &len);
4243 RExC_orig_utf8 = RExC_utf8;
4245 goto redo_first_pass;
4248 PerlIO_printf(Perl_debug_log,
4249 "Required size %"IVdf" nodes\n"
4250 "Starting second pass (creation)\n",
4253 RExC_lastparse=NULL;
4255 /* Small enough for pointer-storage convention?
4256 If extralen==0, this means that we will not need long jumps. */
4257 if (RExC_size >= 0x10000L && RExC_extralen)
4258 RExC_size += RExC_extralen;
4261 if (RExC_whilem_seen > 15)
4262 RExC_whilem_seen = 15;
4264 /* Allocate space and zero-initialize. Note, the two step process
4265 of zeroing when in debug mode, thus anything assigned has to
4266 happen after that */
4267 Newxz(r, 1, regexp);
4268 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4269 char, regexp_internal);
4270 if ( r == NULL || ri == NULL )
4271 FAIL("Regexp out of space");
4273 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
4274 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
4276 /* bulk initialize base fields with 0. */
4277 Zero(ri, sizeof(regexp_internal), char);
4280 /* non-zero initialization begins here */
4282 r->engine= RE_ENGINE_PTR;
4284 r->extflags = pm_flags;
4286 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
4287 bool has_minus = ((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD);
4288 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
4289 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
4290 >> RXf_PMf_STD_PMMOD_SHIFT);
4291 const char *fptr = STD_PAT_MODS; /*"msix"*/
4293 RXp_WRAPLEN(r) = plen + has_minus + has_p + has_runon
4294 + (sizeof(STD_PAT_MODS) - 1)
4295 + (sizeof("(?:)") - 1);
4297 Newx(RXp_WRAPPED(r), RXp_WRAPLEN(r) + 1, char );
4301 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
4303 char *r = p + (sizeof(STD_PAT_MODS) - 1) + has_minus - 1;
4304 char *colon = r + 1;
4307 while((ch = *fptr++)) {
4321 Copy(RExC_precomp, p, plen, char);
4322 assert ((RXp_WRAPPED(r) - p) < 16);
4323 r->pre_prefix = p - RXp_WRAPPED(r);
4332 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4334 if (RExC_seen & REG_SEEN_RECURSE) {
4335 Newxz(RExC_open_parens, RExC_npar,regnode *);
4336 SAVEFREEPV(RExC_open_parens);
4337 Newxz(RExC_close_parens,RExC_npar,regnode *);
4338 SAVEFREEPV(RExC_close_parens);
4341 /* Useful during FAIL. */
4342 #ifdef RE_TRACK_PATTERN_OFFSETS
4343 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4344 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
4345 "%s %"UVuf" bytes for offset annotations.\n",
4346 ri->u.offsets ? "Got" : "Couldn't get",
4347 (UV)((2*RExC_size+1) * sizeof(U32))));
4349 SetProgLen(ri,RExC_size);
4353 /* Second pass: emit code. */
4354 RExC_flags = pm_flags; /* don't let top level (?i) bleed */
4359 RExC_emit_start = ri->program;
4360 RExC_emit = ri->program;
4361 RExC_emit_bound = ri->program + RExC_size + 1;
4363 /* Store the count of eval-groups for security checks: */
4364 RExC_rx->seen_evals = RExC_seen_evals;
4365 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
4366 if (reg(pRExC_state, 0, &flags,1) == NULL) {
4370 /* XXXX To minimize changes to RE engine we always allocate
4371 3-units-long substrs field. */
4372 Newx(r->substrs, 1, struct reg_substr_data);
4373 if (RExC_recurse_count) {
4374 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4375 SAVEFREEPV(RExC_recurse);
4379 r->minlen = minlen = sawplus = sawopen = 0;
4380 Zero(r->substrs, 1, struct reg_substr_data);
4382 #ifdef TRIE_STUDY_OPT
4385 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
4387 RExC_state = copyRExC_state;
4388 if (seen & REG_TOP_LEVEL_BRANCHES)
4389 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4391 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
4392 if (data.last_found) {
4393 SvREFCNT_dec(data.longest_fixed);
4394 SvREFCNT_dec(data.longest_float);
4395 SvREFCNT_dec(data.last_found);
4397 StructCopy(&zero_scan_data, &data, scan_data_t);
4399 StructCopy(&zero_scan_data, &data, scan_data_t);
4400 copyRExC_state = RExC_state;
4403 StructCopy(&zero_scan_data, &data, scan_data_t);
4406 /* Dig out information for optimizations. */
4407 r->extflags = RExC_flags; /* was pm_op */
4408 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
4411 r->extflags |= RXf_UTF8; /* Unicode in it? */
4412 ri->regstclass = NULL;
4413 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
4414 r->intflags |= PREGf_NAUGHTY;
4415 scan = ri->program + 1; /* First BRANCH. */
4417 /* testing for BRANCH here tells us whether there is "must appear"
4418 data in the pattern. If there is then we can use it for optimisations */
4419 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
4421 STRLEN longest_float_length, longest_fixed_length;
4422 struct regnode_charclass_class ch_class; /* pointed to by data */
4424 I32 last_close = 0; /* pointed to by data */
4425 regnode *first= scan;
4426 regnode *first_next= regnext(first);
4428 /* Skip introductions and multiplicators >= 1. */
4429 while ((OP(first) == OPEN && (sawopen = 1)) ||
4430 /* An OR of *one* alternative - should not happen now. */
4431 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
4432 /* for now we can't handle lookbehind IFMATCH*/
4433 (OP(first) == IFMATCH && !first->flags) ||
4434 (OP(first) == PLUS) ||
4435 (OP(first) == MINMOD) ||
4436 /* An {n,m} with n>0 */
4437 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
4438 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
4441 if (OP(first) == PLUS)
4444 first += regarglen[OP(first)];
4445 if (OP(first) == IFMATCH) {
4446 first = NEXTOPER(first);
4447 first += EXTRA_STEP_2ARGS;
4448 } else /* XXX possible optimisation for /(?=)/ */
4449 first = NEXTOPER(first);
4450 first_next= regnext(first);
4453 /* Starting-point info. */
4455 DEBUG_PEEP("first:",first,0);
4456 /* Ignore EXACT as we deal with it later. */
4457 if (PL_regkind[OP(first)] == EXACT) {
4458 if (OP(first) == EXACT)
4459 NOOP; /* Empty, get anchored substr later. */
4460 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
4461 ri->regstclass = first;
4464 else if (PL_regkind[OP(first)] == TRIE &&
4465 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
4468 /* this can happen only on restudy */
4469 if ( OP(first) == TRIE ) {
4470 struct regnode_1 *trieop = (struct regnode_1 *)
4471 PerlMemShared_calloc(1, sizeof(struct regnode_1));
4472 StructCopy(first,trieop,struct regnode_1);
4473 trie_op=(regnode *)trieop;
4475 struct regnode_charclass *trieop = (struct regnode_charclass *)
4476 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
4477 StructCopy(first,trieop,struct regnode_charclass);
4478 trie_op=(regnode *)trieop;
4481 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4482 ri->regstclass = trie_op;
4485 else if (strchr((const char*)PL_simple,OP(first)))
4486 ri->regstclass = first;
4487 else if (PL_regkind[OP(first)] == BOUND ||
4488 PL_regkind[OP(first)] == NBOUND)
4489 ri->regstclass = first;
4490 else if (PL_regkind[OP(first)] == BOL) {
4491 r->extflags |= (OP(first) == MBOL
4493 : (OP(first) == SBOL
4496 first = NEXTOPER(first);
4499 else if (OP(first) == GPOS) {
4500 r->extflags |= RXf_ANCH_GPOS;
4501 first = NEXTOPER(first);
4504 else if ((!sawopen || !RExC_sawback) &&
4505 (OP(first) == STAR &&
4506 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
4507 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
4509 /* turn .* into ^.* with an implied $*=1 */
4511 (OP(NEXTOPER(first)) == REG_ANY)
4514 r->extflags |= type;
4515 r->intflags |= PREGf_IMPLICIT;
4516 first = NEXTOPER(first);
4519 if (sawplus && (!sawopen || !RExC_sawback)
4520 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
4521 /* x+ must match at the 1st pos of run of x's */
4522 r->intflags |= PREGf_SKIP;
4524 /* Scan is after the zeroth branch, first is atomic matcher. */
4525 #ifdef TRIE_STUDY_OPT
4528 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4529 (IV)(first - scan + 1))
4533 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4534 (IV)(first - scan + 1))
4540 * If there's something expensive in the r.e., find the
4541 * longest literal string that must appear and make it the
4542 * regmust. Resolve ties in favor of later strings, since
4543 * the regstart check works with the beginning of the r.e.
4544 * and avoiding duplication strengthens checking. Not a
4545 * strong reason, but sufficient in the absence of others.
4546 * [Now we resolve ties in favor of the earlier string if
4547 * it happens that c_offset_min has been invalidated, since the
4548 * earlier string may buy us something the later one won't.]
4551 data.longest_fixed = newSVpvs("");
4552 data.longest_float = newSVpvs("");
4553 data.last_found = newSVpvs("");
4554 data.longest = &(data.longest_fixed);
4556 if (!ri->regstclass) {
4557 cl_init(pRExC_state, &ch_class);
4558 data.start_class = &ch_class;
4559 stclass_flag = SCF_DO_STCLASS_AND;
4560 } else /* XXXX Check for BOUND? */
4562 data.last_closep = &last_close;
4564 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
4565 &data, -1, NULL, NULL,
4566 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
4572 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
4573 && data.last_start_min == 0 && data.last_end > 0
4574 && !RExC_seen_zerolen
4575 && !(RExC_seen & REG_SEEN_VERBARG)
4576 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4577 r->extflags |= RXf_CHECK_ALL;
4578 scan_commit(pRExC_state, &data,&minlen,0);
4579 SvREFCNT_dec(data.last_found);
4581 /* Note that code very similar to this but for anchored string
4582 follows immediately below, changes may need to be made to both.
4585 longest_float_length = CHR_SVLEN(data.longest_float);
4586 if (longest_float_length
4587 || (data.flags & SF_FL_BEFORE_EOL
4588 && (!(data.flags & SF_FL_BEFORE_MEOL)
4589 || (RExC_flags & RXf_PMf_MULTILINE))))
4593 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
4594 && data.offset_fixed == data.offset_float_min
4595 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4596 goto remove_float; /* As in (a)+. */
4598 /* copy the information about the longest float from the reg_scan_data
4599 over to the program. */
4600 if (SvUTF8(data.longest_float)) {
4601 r->float_utf8 = data.longest_float;
4602 r->float_substr = NULL;
4604 r->float_substr = data.longest_float;
4605 r->float_utf8 = NULL;
4607 /* float_end_shift is how many chars that must be matched that
4608 follow this item. We calculate it ahead of time as once the
4609 lookbehind offset is added in we lose the ability to correctly
4611 ml = data.minlen_float ? *(data.minlen_float)
4612 : (I32)longest_float_length;
4613 r->float_end_shift = ml - data.offset_float_min
4614 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4615 + data.lookbehind_float;
4616 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
4617 r->float_max_offset = data.offset_float_max;
4618 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
4619 r->float_max_offset -= data.lookbehind_float;
4621 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4622 && (!(data.flags & SF_FL_BEFORE_MEOL)
4623 || (RExC_flags & RXf_PMf_MULTILINE)));
4624 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
4628 r->float_substr = r->float_utf8 = NULL;
4629 SvREFCNT_dec(data.longest_float);
4630 longest_float_length = 0;
4633 /* Note that code very similar to this but for floating string
4634 is immediately above, changes may need to be made to both.
4637 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
4638 if (longest_fixed_length
4639 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4640 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4641 || (RExC_flags & RXf_PMf_MULTILINE))))
4645 /* copy the information about the longest fixed
4646 from the reg_scan_data over to the program. */
4647 if (SvUTF8(data.longest_fixed)) {
4648 r->anchored_utf8 = data.longest_fixed;
4649 r->anchored_substr = NULL;
4651 r->anchored_substr = data.longest_fixed;
4652 r->anchored_utf8 = NULL;
4654 /* fixed_end_shift is how many chars that must be matched that
4655 follow this item. We calculate it ahead of time as once the
4656 lookbehind offset is added in we lose the ability to correctly
4658 ml = data.minlen_fixed ? *(data.minlen_fixed)
4659 : (I32)longest_fixed_length;
4660 r->anchored_end_shift = ml - data.offset_fixed
4661 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4662 + data.lookbehind_fixed;
4663 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4665 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4666 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4667 || (RExC_flags & RXf_PMf_MULTILINE)));
4668 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
4671 r->anchored_substr = r->anchored_utf8 = NULL;
4672 SvREFCNT_dec(data.longest_fixed);
4673 longest_fixed_length = 0;
4676 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4677 ri->regstclass = NULL;
4678 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4680 && !(data.start_class->flags & ANYOF_EOS)
4681 && !cl_is_anything(data.start_class))
4683 const U32 n = add_data(pRExC_state, 1, "f");
4685 Newx(RExC_rxi->data->data[n], 1,
4686 struct regnode_charclass_class);
4687 StructCopy(data.start_class,
4688 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4689 struct regnode_charclass_class);
4690 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
4691 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
4692 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
4693 regprop(r, sv, (regnode*)data.start_class);
4694 PerlIO_printf(Perl_debug_log,
4695 "synthetic stclass \"%s\".\n",
4696 SvPVX_const(sv));});
4699 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
4700 if (longest_fixed_length > longest_float_length) {
4701 r->check_end_shift = r->anchored_end_shift;
4702 r->check_substr = r->anchored_substr;
4703 r->check_utf8 = r->anchored_utf8;
4704 r->check_offset_min = r->check_offset_max = r->anchored_offset;
4705 if (r->extflags & RXf_ANCH_SINGLE)
4706 r->extflags |= RXf_NOSCAN;
4709 r->check_end_shift = r->float_end_shift;
4710 r->check_substr = r->float_substr;
4711 r->check_utf8 = r->float_utf8;
4712 r->check_offset_min = r->float_min_offset;
4713 r->check_offset_max = r->float_max_offset;
4715 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4716 This should be changed ASAP! */
4717 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
4718 r->extflags |= RXf_USE_INTUIT;
4719 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
4720 r->extflags |= RXf_INTUIT_TAIL;
4722 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4723 if ( (STRLEN)minlen < longest_float_length )
4724 minlen= longest_float_length;
4725 if ( (STRLEN)minlen < longest_fixed_length )
4726 minlen= longest_fixed_length;
4730 /* Several toplevels. Best we can is to set minlen. */
4732 struct regnode_charclass_class ch_class;
4735 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
4737 scan = ri->program + 1;
4738 cl_init(pRExC_state, &ch_class);
4739 data.start_class = &ch_class;
4740 data.last_closep = &last_close;
4743 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
4744 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
4748 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
4749 = r->float_substr = r->float_utf8 = NULL;
4750 if (!(data.start_class->flags & ANYOF_EOS)
4751 && !cl_is_anything(data.start_class))
4753 const U32 n = add_data(pRExC_state, 1, "f");
4755 Newx(RExC_rxi->data->data[n], 1,
4756 struct regnode_charclass_class);
4757 StructCopy(data.start_class,
4758 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4759 struct regnode_charclass_class);
4760 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
4761 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
4762 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
4763 regprop(r, sv, (regnode*)data.start_class);
4764 PerlIO_printf(Perl_debug_log,
4765 "synthetic stclass \"%s\".\n",
4766 SvPVX_const(sv));});
4770 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4771 the "real" pattern. */
4773 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
4774 (IV)minlen, (IV)r->minlen);
4776 r->minlenret = minlen;
4777 if (r->minlen < minlen)
4780 if (RExC_seen & REG_SEEN_GPOS)
4781 r->extflags |= RXf_GPOS_SEEN;
4782 if (RExC_seen & REG_SEEN_LOOKBEHIND)
4783 r->extflags |= RXf_LOOKBEHIND_SEEN;
4784 if (RExC_seen & REG_SEEN_EVAL)
4785 r->extflags |= RXf_EVAL_SEEN;
4786 if (RExC_seen & REG_SEEN_CANY)
4787 r->extflags |= RXf_CANY_SEEN;
4788 if (RExC_seen & REG_SEEN_VERBARG)
4789 r->intflags |= PREGf_VERBARG_SEEN;
4790 if (RExC_seen & REG_SEEN_CUTGROUP)
4791 r->intflags |= PREGf_CUTGROUP_SEEN;
4792 if (RExC_paren_names)
4793 r->paren_names = (HV*)SvREFCNT_inc(RExC_paren_names);
4795 r->paren_names = NULL;
4797 #ifdef STUPID_PATTERN_CHECKS
4798 if (RX_PRELEN(r) == 0)
4799 r->extflags |= RXf_NULL;
4800 if (r->extflags & RXf_SPLIT && RX_PRELEN(r) == 1 && RXp_PRECOMP(r)[0] == ' ')
4801 /* XXX: this should happen BEFORE we compile */
4802 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
4803 else if (RX_PRELEN(r) == 3 && memEQ("\\s+", RXp_PRECOMP(r), 3))
4804 r->extflags |= RXf_WHITE;
4805 else if (RX_PRELEN(r) == 1 && RXp_PRECOMP(r)[0] == '^')
4806 r->extflags |= RXf_START_ONLY;
4808 if (r->extflags & RXf_SPLIT && RXp_PRELEN(r) == 1 && RXp_PRECOMP(r)[0] == ' ')
4809 /* XXX: this should happen BEFORE we compile */
4810 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
4812 regnode *first = ri->program + 1;
4814 U8 nop = OP(NEXTOPER(first));
4816 if (PL_regkind[fop] == NOTHING && nop == END)
4817 r->extflags |= RXf_NULL;
4818 else if (PL_regkind[fop] == BOL && nop == END)
4819 r->extflags |= RXf_START_ONLY;
4820 else if (fop == PLUS && nop ==SPACE && OP(regnext(first))==END)
4821 r->extflags |= RXf_WHITE;
4825 if (RExC_paren_names) {
4826 ri->name_list_idx = add_data( pRExC_state, 1, "p" );
4827 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
4830 ri->name_list_idx = 0;
4832 if (RExC_recurse_count) {
4833 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
4834 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
4835 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
4838 Newxz(r->offs, RExC_npar, regexp_paren_pair);
4839 /* assume we don't need to swap parens around before we match */
4842 PerlIO_printf(Perl_debug_log,"Final program:\n");
4845 #ifdef RE_TRACK_PATTERN_OFFSETS
4846 DEBUG_OFFSETS_r(if (ri->u.offsets) {
4847 const U32 len = ri->u.offsets[0];
4849 GET_RE_DEBUG_FLAGS_DECL;
4850 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
4851 for (i = 1; i <= len; i++) {
4852 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
4853 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
4854 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
4856 PerlIO_printf(Perl_debug_log, "\n");
4862 #undef RE_ENGINE_PTR
4866 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
4869 PERL_UNUSED_ARG(value);
4871 if (flags & RXapif_FETCH) {
4872 return reg_named_buff_fetch(rx, key, flags);
4873 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
4874 Perl_croak(aTHX_ PL_no_modify);
4876 } else if (flags & RXapif_EXISTS) {
4877 return reg_named_buff_exists(rx, key, flags)
4880 } else if (flags & RXapif_REGNAMES) {
4881 return reg_named_buff_all(rx, flags);
4882 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
4883 return reg_named_buff_scalar(rx, flags);
4885 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
4891 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
4894 PERL_UNUSED_ARG(lastkey);
4896 if (flags & RXapif_FIRSTKEY)
4897 return reg_named_buff_firstkey(rx, flags);
4898 else if (flags & RXapif_NEXTKEY)
4899 return reg_named_buff_nextkey(rx, flags);
4901 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
4907 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const rx, SV * const namesv, const U32 flags)
4909 AV *retarray = NULL;
4911 if (flags & RXapif_ALL)
4914 if (rx && rx->paren_names) {
4915 HE *he_str = hv_fetch_ent( rx->paren_names, namesv, 0, 0 );
4918 SV* sv_dat=HeVAL(he_str);
4919 I32 *nums=(I32*)SvPVX(sv_dat);
4920 for ( i=0; i<SvIVX(sv_dat); i++ ) {
4921 if ((I32)(rx->nparens) >= nums[i]
4922 && rx->offs[nums[i]].start != -1
4923 && rx->offs[nums[i]].end != -1)
4926 CALLREG_NUMBUF_FETCH(rx,nums[i],ret);
4930 ret = newSVsv(&PL_sv_undef);
4933 SvREFCNT_inc_simple_void(ret);
4934 av_push(retarray, ret);
4938 return newRV((SV*)retarray);
4945 Perl_reg_named_buff_exists(pTHX_ REGEXP * const rx, SV * const key,
4948 if (rx && rx->paren_names) {
4949 if (flags & RXapif_ALL) {
4950 return hv_exists_ent(rx->paren_names, key, 0);
4952 SV *sv = CALLREG_NAMED_BUFF_FETCH(rx, key, flags);
4966 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const rx, const U32 flags)
4968 if ( rx && rx->paren_names ) {
4969 (void)hv_iterinit(rx->paren_names);
4971 return CALLREG_NAMED_BUFF_NEXTKEY(rx, NULL, flags & ~RXapif_FIRSTKEY);
4978 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const rx, const U32 flags)
4980 if (rx && rx->paren_names) {
4981 HV *hv = rx->paren_names;
4983 while ( (temphe = hv_iternext_flags(hv,0)) ) {
4986 SV* sv_dat = HeVAL(temphe);
4987 I32 *nums = (I32*)SvPVX(sv_dat);
4988 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
4989 if ((I32)(rx->lastcloseparen) >= nums[i] &&
4990 rx->offs[nums[i]].start != -1 &&
4991 rx->offs[nums[i]].end != -1)
4997 if (parno || flags & RXapif_ALL) {
4999 char *pv = HePV(temphe, len);
5000 return newSVpvn(pv,len);
5008 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const rx, const U32 flags)
5014 if (rx && rx->paren_names) {
5015 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5016 return newSViv(HvTOTALKEYS(rx->paren_names));
5017 } else if (flags & RXapif_ONE) {
5018 ret = CALLREG_NAMED_BUFF_ALL(rx, (flags | RXapif_REGNAMES));
5019 av = (AV*)SvRV(ret);
5020 length = av_len(av);
5021 return newSViv(length + 1);
5023 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5027 return &PL_sv_undef;
5031 Perl_reg_named_buff_all(pTHX_ REGEXP * const rx, const U32 flags)
5035 if (rx && rx->paren_names) {
5036 HV *hv= rx->paren_names;
5038 (void)hv_iterinit(hv);
5039 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5042 SV* sv_dat = HeVAL(temphe);
5043 I32 *nums = (I32*)SvPVX(sv_dat);
5044 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5045 if ((I32)(rx->lastcloseparen) >= nums[i] &&
5046 rx->offs[nums[i]].start != -1 &&
5047 rx->offs[nums[i]].end != -1)
5053 if (parno || flags & RXapif_ALL) {
5055 char *pv = HePV(temphe, len);
5056 av_push(av, newSVpvn(pv,len));
5061 return newRV((SV*)av);
5065 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const rx, const I32 paren, SV * const sv)
5072 sv_setsv(sv,&PL_sv_undef);
5076 if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
5078 i = rx->offs[0].start;
5082 if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
5084 s = rx->subbeg + rx->offs[0].end;
5085 i = rx->sublen - rx->offs[0].end;
5088 if ( 0 <= paren && paren <= (I32)rx->nparens &&
5089 (s1 = rx->offs[paren].start) != -1 &&
5090 (t1 = rx->offs[paren].end) != -1)
5094 s = rx->subbeg + s1;
5096 sv_setsv(sv,&PL_sv_undef);
5099 assert(rx->sublen >= (s - rx->subbeg) + i );
5101 const int oldtainted = PL_tainted;
5103 sv_setpvn(sv, s, i);
5104 PL_tainted = oldtainted;
5105 if ( (rx->extflags & RXf_CANY_SEEN)
5106 ? (RXp_MATCH_UTF8(rx)
5107 && (!i || is_utf8_string((U8*)s, i)))
5108 : (RXp_MATCH_UTF8(rx)) )
5115 if (RXp_MATCH_TAINTED(rx)) {
5116 if (SvTYPE(sv) >= SVt_PVMG) {
5117 MAGIC* const mg = SvMAGIC(sv);
5120 SvMAGIC_set(sv, mg->mg_moremagic);
5122 if ((mgt = SvMAGIC(sv))) {
5123 mg->mg_moremagic = mgt;
5124 SvMAGIC_set(sv, mg);
5134 sv_setsv(sv,&PL_sv_undef);
5140 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5141 SV const * const value)
5143 PERL_UNUSED_ARG(rx);
5144 PERL_UNUSED_ARG(paren);
5145 PERL_UNUSED_ARG(value);
5148 Perl_croak(aTHX_ PL_no_modify);
5152 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const rx, const SV * const sv,
5158 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5160 /* $` / ${^PREMATCH} */
5161 case RX_BUFF_IDX_PREMATCH:
5162 if (rx->offs[0].start != -1) {
5163 i = rx->offs[0].start;
5171 /* $' / ${^POSTMATCH} */
5172 case RX_BUFF_IDX_POSTMATCH:
5173 if (rx->offs[0].end != -1) {
5174 i = rx->sublen - rx->offs[0].end;
5176 s1 = rx->offs[0].end;
5182 /* $& / ${^MATCH}, $1, $2, ... */
5184 if (paren <= (I32)rx->nparens &&
5185 (s1 = rx->offs[paren].start) != -1 &&
5186 (t1 = rx->offs[paren].end) != -1)
5191 if (ckWARN(WARN_UNINITIALIZED))
5192 report_uninit((SV*)sv);
5197 if (i > 0 && RXp_MATCH_UTF8(rx)) {
5198 const char * const s = rx->subbeg + s1;
5203 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5210 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
5212 PERL_UNUSED_ARG(rx);
5216 /* Scans the name of a named buffer from the pattern.
5217 * If flags is REG_RSN_RETURN_NULL returns null.
5218 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
5219 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
5220 * to the parsed name as looked up in the RExC_paren_names hash.
5221 * If there is an error throws a vFAIL().. type exception.
5224 #define REG_RSN_RETURN_NULL 0
5225 #define REG_RSN_RETURN_NAME 1
5226 #define REG_RSN_RETURN_DATA 2
5229 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) {
5230 char *name_start = RExC_parse;
5232 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
5233 /* skip IDFIRST by using do...while */
5236 RExC_parse += UTF8SKIP(RExC_parse);
5237 } while (isALNUM_utf8((U8*)RExC_parse));
5241 } while (isALNUM(*RExC_parse));
5245 SV* sv_name = sv_2mortal(Perl_newSVpvn(aTHX_ name_start,
5246 (int)(RExC_parse - name_start)));
5249 if ( flags == REG_RSN_RETURN_NAME)
5251 else if (flags==REG_RSN_RETURN_DATA) {
5254 if ( ! sv_name ) /* should not happen*/
5255 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
5256 if (RExC_paren_names)
5257 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
5259 sv_dat = HeVAL(he_str);
5261 vFAIL("Reference to nonexistent named group");
5265 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
5272 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
5273 int rem=(int)(RExC_end - RExC_parse); \
5282 if (RExC_lastparse!=RExC_parse) \
5283 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
5286 iscut ? "..." : "<" \
5289 PerlIO_printf(Perl_debug_log,"%16s",""); \
5292 num = RExC_size + 1; \
5294 num=REG_NODE_NUM(RExC_emit); \
5295 if (RExC_lastnum!=num) \
5296 PerlIO_printf(Perl_debug_log,"|%4d",num); \
5298 PerlIO_printf(Perl_debug_log,"|%4s",""); \
5299 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
5300 (int)((depth*2)), "", \
5304 RExC_lastparse=RExC_parse; \
5309 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
5310 DEBUG_PARSE_MSG((funcname)); \
5311 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
5313 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
5314 DEBUG_PARSE_MSG((funcname)); \
5315 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
5318 - reg - regular expression, i.e. main body or parenthesized thing
5320 * Caller must absorb opening parenthesis.
5322 * Combining parenthesis handling with the base level of regular expression
5323 * is a trifle forced, but the need to tie the tails of the branches to what
5324 * follows makes it hard to avoid.
5326 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
5328 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
5330 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
5334 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
5335 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
5338 register regnode *ret; /* Will be the head of the group. */
5339 register regnode *br;
5340 register regnode *lastbr;
5341 register regnode *ender = NULL;
5342 register I32 parno = 0;
5344 U32 oregflags = RExC_flags;
5345 bool have_branch = 0;
5347 I32 freeze_paren = 0;
5348 I32 after_freeze = 0;
5350 /* for (?g), (?gc), and (?o) warnings; warning
5351 about (?c) will warn about (?g) -- japhy */
5353 #define WASTED_O 0x01
5354 #define WASTED_G 0x02
5355 #define WASTED_C 0x04
5356 #define WASTED_GC (0x02|0x04)
5357 I32 wastedflags = 0x00;
5359 char * parse_start = RExC_parse; /* MJD */
5360 char * const oregcomp_parse = RExC_parse;
5362 GET_RE_DEBUG_FLAGS_DECL;
5363 DEBUG_PARSE("reg ");
5365 *flagp = 0; /* Tentatively. */
5368 /* Make an OPEN node, if parenthesized. */
5370 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
5371 char *start_verb = RExC_parse;
5372 STRLEN verb_len = 0;
5373 char *start_arg = NULL;
5374 unsigned char op = 0;
5376 int internal_argval = 0; /* internal_argval is only useful if !argok */
5377 while ( *RExC_parse && *RExC_parse != ')' ) {
5378 if ( *RExC_parse == ':' ) {
5379 start_arg = RExC_parse + 1;
5385 verb_len = RExC_parse - start_verb;
5388 while ( *RExC_parse && *RExC_parse != ')' )
5390 if ( *RExC_parse != ')' )
5391 vFAIL("Unterminated verb pattern argument");
5392 if ( RExC_parse == start_arg )
5395 if ( *RExC_parse != ')' )
5396 vFAIL("Unterminated verb pattern");
5399 switch ( *start_verb ) {
5400 case 'A': /* (*ACCEPT) */
5401 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
5403 internal_argval = RExC_nestroot;
5406 case 'C': /* (*COMMIT) */
5407 if ( memEQs(start_verb,verb_len,"COMMIT") )
5410 case 'F': /* (*FAIL) */
5411 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
5416 case ':': /* (*:NAME) */
5417 case 'M': /* (*MARK:NAME) */
5418 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
5423 case 'P': /* (*PRUNE) */
5424 if ( memEQs(start_verb,verb_len,"PRUNE") )
5427 case 'S': /* (*SKIP) */
5428 if ( memEQs(start_verb,verb_len,"SKIP") )
5431 case 'T': /* (*THEN) */
5432 /* [19:06] <TimToady> :: is then */
5433 if ( memEQs(start_verb,verb_len,"THEN") ) {
5435 RExC_seen |= REG_SEEN_CUTGROUP;
5441 vFAIL3("Unknown verb pattern '%.*s'",
5442 verb_len, start_verb);
5445 if ( start_arg && internal_argval ) {
5446 vFAIL3("Verb pattern '%.*s' may not have an argument",
5447 verb_len, start_verb);
5448 } else if ( argok < 0 && !start_arg ) {
5449 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
5450 verb_len, start_verb);
5452 ret = reganode(pRExC_state, op, internal_argval);
5453 if ( ! internal_argval && ! SIZE_ONLY ) {
5455 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
5456 ARG(ret) = add_data( pRExC_state, 1, "S" );
5457 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
5464 if (!internal_argval)
5465 RExC_seen |= REG_SEEN_VERBARG;
5466 } else if ( start_arg ) {
5467 vFAIL3("Verb pattern '%.*s' may not have an argument",
5468 verb_len, start_verb);
5470 ret = reg_node(pRExC_state, op);
5472 nextchar(pRExC_state);
5475 if (*RExC_parse == '?') { /* (?...) */
5476 bool is_logical = 0;
5477 const char * const seqstart = RExC_parse;
5480 paren = *RExC_parse++;
5481 ret = NULL; /* For look-ahead/behind. */
5484 case 'P': /* (?P...) variants for those used to PCRE/Python */
5485 paren = *RExC_parse++;
5486 if ( paren == '<') /* (?P<...>) named capture */
5488 else if (paren == '>') { /* (?P>name) named recursion */
5489 goto named_recursion;
5491 else if (paren == '=') { /* (?P=...) named backref */
5492 /* this pretty much dupes the code for \k<NAME> in regatom(), if
5493 you change this make sure you change that */
5494 char* name_start = RExC_parse;
5496 SV *sv_dat = reg_scan_name(pRExC_state,
5497 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5498 if (RExC_parse == name_start || *RExC_parse != ')')
5499 vFAIL2("Sequence %.3s... not terminated",parse_start);
5502 num = add_data( pRExC_state, 1, "S" );
5503 RExC_rxi->data->data[num]=(void*)sv_dat;
5504 SvREFCNT_inc_simple_void(sv_dat);
5507 ret = reganode(pRExC_state,
5508 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
5512 Set_Node_Offset(ret, parse_start+1);
5513 Set_Node_Cur_Length(ret); /* MJD */
5515 nextchar(pRExC_state);
5519 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5521 case '<': /* (?<...) */
5522 if (*RExC_parse == '!')
5524 else if (*RExC_parse != '=')
5530 case '\'': /* (?'...') */
5531 name_start= RExC_parse;
5532 svname = reg_scan_name(pRExC_state,
5533 SIZE_ONLY ? /* reverse test from the others */
5534 REG_RSN_RETURN_NAME :
5535 REG_RSN_RETURN_NULL);
5536 if (RExC_parse == name_start) {
5538 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5541 if (*RExC_parse != paren)
5542 vFAIL2("Sequence (?%c... not terminated",
5543 paren=='>' ? '<' : paren);
5547 if (!svname) /* shouldnt happen */
5549 "panic: reg_scan_name returned NULL");
5550 if (!RExC_paren_names) {
5551 RExC_paren_names= newHV();
5552 sv_2mortal((SV*)RExC_paren_names);
5554 RExC_paren_name_list= newAV();
5555 sv_2mortal((SV*)RExC_paren_name_list);
5558 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
5560 sv_dat = HeVAL(he_str);
5562 /* croak baby croak */
5564 "panic: paren_name hash element allocation failed");
5565 } else if ( SvPOK(sv_dat) ) {
5566 /* (?|...) can mean we have dupes so scan to check
5567 its already been stored. Maybe a flag indicating
5568 we are inside such a construct would be useful,
5569 but the arrays are likely to be quite small, so
5570 for now we punt -- dmq */
5571 IV count = SvIV(sv_dat);
5572 I32 *pv = (I32*)SvPVX(sv_dat);
5574 for ( i = 0 ; i < count ; i++ ) {
5575 if ( pv[i] == RExC_npar ) {
5581 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
5582 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
5583 pv[count] = RExC_npar;
5587 (void)SvUPGRADE(sv_dat,SVt_PVNV);
5588 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
5593 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
5594 SvREFCNT_dec(svname);
5597 /*sv_dump(sv_dat);*/
5599 nextchar(pRExC_state);
5601 goto capturing_parens;
5603 RExC_seen |= REG_SEEN_LOOKBEHIND;
5605 case '=': /* (?=...) */
5606 case '!': /* (?!...) */
5607 RExC_seen_zerolen++;
5608 if (*RExC_parse == ')') {
5609 ret=reg_node(pRExC_state, OPFAIL);
5610 nextchar(pRExC_state);
5614 case '|': /* (?|...) */
5615 /* branch reset, behave like a (?:...) except that
5616 buffers in alternations share the same numbers */
5618 after_freeze = freeze_paren = RExC_npar;
5620 case ':': /* (?:...) */
5621 case '>': /* (?>...) */
5623 case '$': /* (?$...) */
5624 case '@': /* (?@...) */
5625 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
5627 case '#': /* (?#...) */
5628 while (*RExC_parse && *RExC_parse != ')')
5630 if (*RExC_parse != ')')
5631 FAIL("Sequence (?#... not terminated");
5632 nextchar(pRExC_state);
5635 case '0' : /* (?0) */
5636 case 'R' : /* (?R) */
5637 if (*RExC_parse != ')')
5638 FAIL("Sequence (?R) not terminated");
5639 ret = reg_node(pRExC_state, GOSTART);
5640 *flagp |= POSTPONED;
5641 nextchar(pRExC_state);
5644 { /* named and numeric backreferences */
5646 case '&': /* (?&NAME) */
5647 parse_start = RExC_parse - 1;
5650 SV *sv_dat = reg_scan_name(pRExC_state,
5651 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5652 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5654 goto gen_recurse_regop;
5657 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5659 vFAIL("Illegal pattern");
5661 goto parse_recursion;
5663 case '-': /* (?-1) */
5664 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5665 RExC_parse--; /* rewind to let it be handled later */
5669 case '1': case '2': case '3': case '4': /* (?1) */
5670 case '5': case '6': case '7': case '8': case '9':
5673 num = atoi(RExC_parse);
5674 parse_start = RExC_parse - 1; /* MJD */
5675 if (*RExC_parse == '-')
5677 while (isDIGIT(*RExC_parse))
5679 if (*RExC_parse!=')')
5680 vFAIL("Expecting close bracket");
5683 if ( paren == '-' ) {
5685 Diagram of capture buffer numbering.
5686 Top line is the normal capture buffer numbers
5687 Botton line is the negative indexing as from
5691 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
5695 num = RExC_npar + num;
5698 vFAIL("Reference to nonexistent group");
5700 } else if ( paren == '+' ) {
5701 num = RExC_npar + num - 1;
5704 ret = reganode(pRExC_state, GOSUB, num);
5706 if (num > (I32)RExC_rx->nparens) {
5708 vFAIL("Reference to nonexistent group");
5710 ARG2L_SET( ret, RExC_recurse_count++);
5712 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5713 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
5717 RExC_seen |= REG_SEEN_RECURSE;
5718 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
5719 Set_Node_Offset(ret, parse_start); /* MJD */
5721 *flagp |= POSTPONED;
5722 nextchar(pRExC_state);
5724 } /* named and numeric backreferences */
5727 case '?': /* (??...) */
5729 if (*RExC_parse != '{') {
5731 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5734 *flagp |= POSTPONED;
5735 paren = *RExC_parse++;
5737 case '{': /* (?{...}) */
5742 char *s = RExC_parse;
5744 RExC_seen_zerolen++;
5745 RExC_seen |= REG_SEEN_EVAL;
5746 while (count && (c = *RExC_parse)) {
5757 if (*RExC_parse != ')') {
5759 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
5763 OP_4tree *sop, *rop;
5764 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
5767 Perl_save_re_context(aTHX);
5768 rop = sv_compile_2op(sv, &sop, "re", &pad);
5769 sop->op_private |= OPpREFCOUNTED;
5770 /* re_dup will OpREFCNT_inc */
5771 OpREFCNT_set(sop, 1);
5774 n = add_data(pRExC_state, 3, "nop");
5775 RExC_rxi->data->data[n] = (void*)rop;
5776 RExC_rxi->data->data[n+1] = (void*)sop;
5777 RExC_rxi->data->data[n+2] = (void*)pad;
5780 else { /* First pass */
5781 if (PL_reginterp_cnt < ++RExC_seen_evals
5783 /* No compiled RE interpolated, has runtime
5784 components ===> unsafe. */
5785 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5786 if (PL_tainting && PL_tainted)
5787 FAIL("Eval-group in insecure regular expression");
5788 #if PERL_VERSION > 8
5789 if (IN_PERL_COMPILETIME)
5794 nextchar(pRExC_state);
5796 ret = reg_node(pRExC_state, LOGICAL);
5799 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
5800 /* deal with the length of this later - MJD */
5803 ret = reganode(pRExC_state, EVAL, n);
5804 Set_Node_Length(ret, RExC_parse - parse_start + 1);
5805 Set_Node_Offset(ret, parse_start);
5808 case '(': /* (?(?{...})...) and (?(?=...)...) */
5811 if (RExC_parse[0] == '?') { /* (?(?...)) */
5812 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
5813 || RExC_parse[1] == '<'
5814 || RExC_parse[1] == '{') { /* Lookahead or eval. */
5817 ret = reg_node(pRExC_state, LOGICAL);
5820 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
5824 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
5825 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
5827 char ch = RExC_parse[0] == '<' ? '>' : '\'';
5828 char *name_start= RExC_parse++;
5830 SV *sv_dat=reg_scan_name(pRExC_state,
5831 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5832 if (RExC_parse == name_start || *RExC_parse != ch)
5833 vFAIL2("Sequence (?(%c... not terminated",
5834 (ch == '>' ? '<' : ch));
5837 num = add_data( pRExC_state, 1, "S" );
5838 RExC_rxi->data->data[num]=(void*)sv_dat;
5839 SvREFCNT_inc_simple_void(sv_dat);
5841 ret = reganode(pRExC_state,NGROUPP,num);
5842 goto insert_if_check_paren;
5844 else if (RExC_parse[0] == 'D' &&
5845 RExC_parse[1] == 'E' &&
5846 RExC_parse[2] == 'F' &&
5847 RExC_parse[3] == 'I' &&
5848 RExC_parse[4] == 'N' &&
5849 RExC_parse[5] == 'E')
5851 ret = reganode(pRExC_state,DEFINEP,0);
5854 goto insert_if_check_paren;
5856 else if (RExC_parse[0] == 'R') {
5859 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5860 parno = atoi(RExC_parse++);
5861 while (isDIGIT(*RExC_parse))
5863 } else if (RExC_parse[0] == '&') {
5866 sv_dat = reg_scan_name(pRExC_state,
5867 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5868 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5870 ret = reganode(pRExC_state,INSUBP,parno);
5871 goto insert_if_check_paren;
5873 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5876 parno = atoi(RExC_parse++);
5878 while (isDIGIT(*RExC_parse))
5880 ret = reganode(pRExC_state, GROUPP, parno);
5882 insert_if_check_paren:
5883 if ((c = *nextchar(pRExC_state)) != ')')
5884 vFAIL("Switch condition not recognized");
5886 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
5887 br = regbranch(pRExC_state, &flags, 1,depth+1);
5889 br = reganode(pRExC_state, LONGJMP, 0);
5891 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
5892 c = *nextchar(pRExC_state);
5897 vFAIL("(?(DEFINE)....) does not allow branches");
5898 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
5899 regbranch(pRExC_state, &flags, 1,depth+1);
5900 REGTAIL(pRExC_state, ret, lastbr);
5903 c = *nextchar(pRExC_state);
5908 vFAIL("Switch (?(condition)... contains too many branches");
5909 ender = reg_node(pRExC_state, TAIL);
5910 REGTAIL(pRExC_state, br, ender);
5912 REGTAIL(pRExC_state, lastbr, ender);
5913 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
5916 REGTAIL(pRExC_state, ret, ender);
5917 RExC_size++; /* XXX WHY do we need this?!!
5918 For large programs it seems to be required
5919 but I can't figure out why. -- dmq*/
5923 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
5927 RExC_parse--; /* for vFAIL to print correctly */
5928 vFAIL("Sequence (? incomplete");
5932 parse_flags: /* (?i) */
5934 U32 posflags = 0, negflags = 0;
5935 U32 *flagsp = &posflags;
5937 while (*RExC_parse) {
5938 /* && strchr("iogcmsx", *RExC_parse) */
5939 /* (?g), (?gc) and (?o) are useless here
5940 and must be globally applied -- japhy */
5941 switch (*RExC_parse) {
5942 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
5943 case ONCE_PAT_MOD: /* 'o' */
5944 case GLOBAL_PAT_MOD: /* 'g' */
5945 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5946 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
5947 if (! (wastedflags & wflagbit) ) {
5948 wastedflags |= wflagbit;
5951 "Useless (%s%c) - %suse /%c modifier",
5952 flagsp == &negflags ? "?-" : "?",
5954 flagsp == &negflags ? "don't " : "",
5961 case CONTINUE_PAT_MOD: /* 'c' */
5962 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5963 if (! (wastedflags & WASTED_C) ) {
5964 wastedflags |= WASTED_GC;
5967 "Useless (%sc) - %suse /gc modifier",
5968 flagsp == &negflags ? "?-" : "?",
5969 flagsp == &negflags ? "don't " : ""
5974 case KEEPCOPY_PAT_MOD: /* 'p' */
5975 if (flagsp == &negflags) {
5976 if (SIZE_ONLY && ckWARN(WARN_REGEXP))
5977 vWARN(RExC_parse + 1,"Useless use of (?-p)");
5979 *flagsp |= RXf_PMf_KEEPCOPY;
5983 if (flagsp == &negflags) {
5985 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5989 wastedflags = 0; /* reset so (?g-c) warns twice */
5995 RExC_flags |= posflags;
5996 RExC_flags &= ~negflags;
5998 oregflags |= posflags;
5999 oregflags &= ~negflags;
6001 nextchar(pRExC_state);
6012 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6017 }} /* one for the default block, one for the switch */
6024 ret = reganode(pRExC_state, OPEN, parno);
6027 RExC_nestroot = parno;
6028 if (RExC_seen & REG_SEEN_RECURSE
6029 && !RExC_open_parens[parno-1])
6031 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6032 "Setting open paren #%"IVdf" to %d\n",
6033 (IV)parno, REG_NODE_NUM(ret)));
6034 RExC_open_parens[parno-1]= ret;
6037 Set_Node_Length(ret, 1); /* MJD */
6038 Set_Node_Offset(ret, RExC_parse); /* MJD */
6046 /* Pick up the branches, linking them together. */
6047 parse_start = RExC_parse; /* MJD */
6048 br = regbranch(pRExC_state, &flags, 1,depth+1);
6049 /* branch_len = (paren != 0); */
6053 if (*RExC_parse == '|') {
6054 if (!SIZE_ONLY && RExC_extralen) {
6055 reginsert(pRExC_state, BRANCHJ, br, depth+1);
6058 reginsert(pRExC_state, BRANCH, br, depth+1);
6059 Set_Node_Length(br, paren != 0);
6060 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
6064 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
6066 else if (paren == ':') {
6067 *flagp |= flags&SIMPLE;
6069 if (is_open) { /* Starts with OPEN. */
6070 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
6072 else if (paren != '?') /* Not Conditional */
6074 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6076 while (*RExC_parse == '|') {
6077 if (!SIZE_ONLY && RExC_extralen) {
6078 ender = reganode(pRExC_state, LONGJMP,0);
6079 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
6082 RExC_extralen += 2; /* Account for LONGJMP. */
6083 nextchar(pRExC_state);
6085 if (RExC_npar > after_freeze)
6086 after_freeze = RExC_npar;
6087 RExC_npar = freeze_paren;
6089 br = regbranch(pRExC_state, &flags, 0, depth+1);
6093 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
6095 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6098 if (have_branch || paren != ':') {
6099 /* Make a closing node, and hook it on the end. */
6102 ender = reg_node(pRExC_state, TAIL);
6105 ender = reganode(pRExC_state, CLOSE, parno);
6106 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
6107 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6108 "Setting close paren #%"IVdf" to %d\n",
6109 (IV)parno, REG_NODE_NUM(ender)));
6110 RExC_close_parens[parno-1]= ender;
6111 if (RExC_nestroot == parno)
6114 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
6115 Set_Node_Length(ender,1); /* MJD */
6121 *flagp &= ~HASWIDTH;
6124 ender = reg_node(pRExC_state, SUCCEED);
6127 ender = reg_node(pRExC_state, END);
6129 assert(!RExC_opend); /* there can only be one! */
6134 REGTAIL(pRExC_state, lastbr, ender);
6136 if (have_branch && !SIZE_ONLY) {
6138 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6140 /* Hook the tails of the branches to the closing node. */
6141 for (br = ret; br; br = regnext(br)) {
6142 const U8 op = PL_regkind[OP(br)];
6144 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
6146 else if (op == BRANCHJ) {
6147 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
6155 static const char parens[] = "=!<,>";
6157 if (paren && (p = strchr(parens, paren))) {
6158 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
6159 int flag = (p - parens) > 1;
6162 node = SUSPEND, flag = 0;
6163 reginsert(pRExC_state, node,ret, depth+1);
6164 Set_Node_Cur_Length(ret);
6165 Set_Node_Offset(ret, parse_start + 1);
6167 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
6171 /* Check for proper termination. */
6173 RExC_flags = oregflags;
6174 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
6175 RExC_parse = oregcomp_parse;
6176 vFAIL("Unmatched (");
6179 else if (!paren && RExC_parse < RExC_end) {
6180 if (*RExC_parse == ')') {
6182 vFAIL("Unmatched )");
6185 FAIL("Junk on end of regexp"); /* "Can't happen". */
6189 RExC_npar = after_freeze;
6194 - regbranch - one alternative of an | operator
6196 * Implements the concatenation operator.
6199 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
6202 register regnode *ret;
6203 register regnode *chain = NULL;
6204 register regnode *latest;
6205 I32 flags = 0, c = 0;
6206 GET_RE_DEBUG_FLAGS_DECL;
6207 DEBUG_PARSE("brnc");
6212 if (!SIZE_ONLY && RExC_extralen)
6213 ret = reganode(pRExC_state, BRANCHJ,0);
6215 ret = reg_node(pRExC_state, BRANCH);
6216 Set_Node_Length(ret, 1);
6220 if (!first && SIZE_ONLY)
6221 RExC_extralen += 1; /* BRANCHJ */
6223 *flagp = WORST; /* Tentatively. */
6226 nextchar(pRExC_state);
6227 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
6229 latest = regpiece(pRExC_state, &flags,depth+1);
6230 if (latest == NULL) {
6231 if (flags & TRYAGAIN)
6235 else if (ret == NULL)
6237 *flagp |= flags&(HASWIDTH|POSTPONED);
6238 if (chain == NULL) /* First piece. */
6239 *flagp |= flags&SPSTART;
6242 REGTAIL(pRExC_state, chain, latest);
6247 if (chain == NULL) { /* Loop ran zero times. */
6248 chain = reg_node(pRExC_state, NOTHING);
6253 *flagp |= flags&SIMPLE;
6260 - regpiece - something followed by possible [*+?]
6262 * Note that the branching code sequences used for ? and the general cases
6263 * of * and + are somewhat optimized: they use the same NOTHING node as
6264 * both the endmarker for their branch list and the body of the last branch.
6265 * It might seem that this node could be dispensed with entirely, but the
6266 * endmarker role is not redundant.
6269 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6272 register regnode *ret;
6274 register char *next;
6276 const char * const origparse = RExC_parse;
6278 I32 max = REG_INFTY;
6280 const char *maxpos = NULL;
6281 GET_RE_DEBUG_FLAGS_DECL;
6282 DEBUG_PARSE("piec");
6284 ret = regatom(pRExC_state, &flags,depth+1);
6286 if (flags & TRYAGAIN)
6293 if (op == '{' && regcurly(RExC_parse)) {
6295 parse_start = RExC_parse; /* MJD */
6296 next = RExC_parse + 1;
6297 while (isDIGIT(*next) || *next == ',') {
6306 if (*next == '}') { /* got one */
6310 min = atoi(RExC_parse);
6314 maxpos = RExC_parse;
6316 if (!max && *maxpos != '0')
6317 max = REG_INFTY; /* meaning "infinity" */
6318 else if (max >= REG_INFTY)
6319 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
6321 nextchar(pRExC_state);
6324 if ((flags&SIMPLE)) {
6325 RExC_naughty += 2 + RExC_naughty / 2;
6326 reginsert(pRExC_state, CURLY, ret, depth+1);
6327 Set_Node_Offset(ret, parse_start+1); /* MJD */
6328 Set_Node_Cur_Length(ret);
6331 regnode * const w = reg_node(pRExC_state, WHILEM);
6334 REGTAIL(pRExC_state, ret, w);
6335 if (!SIZE_ONLY && RExC_extralen) {
6336 reginsert(pRExC_state, LONGJMP,ret, depth+1);
6337 reginsert(pRExC_state, NOTHING,ret, depth+1);
6338 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
6340 reginsert(pRExC_state, CURLYX,ret, depth+1);
6342 Set_Node_Offset(ret, parse_start+1);
6343 Set_Node_Length(ret,
6344 op == '{' ? (RExC_parse - parse_start) : 1);
6346 if (!SIZE_ONLY && RExC_extralen)
6347 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
6348 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
6350 RExC_whilem_seen++, RExC_extralen += 3;
6351 RExC_naughty += 4 + RExC_naughty; /* compound interest */
6359 if (max && max < min)
6360 vFAIL("Can't do {n,m} with n > m");
6362 ARG1_SET(ret, (U16)min);
6363 ARG2_SET(ret, (U16)max);
6375 #if 0 /* Now runtime fix should be reliable. */
6377 /* if this is reinstated, don't forget to put this back into perldiag:
6379 =item Regexp *+ operand could be empty at {#} in regex m/%s/
6381 (F) The part of the regexp subject to either the * or + quantifier
6382 could match an empty string. The {#} shows in the regular
6383 expression about where the problem was discovered.
6387 if (!(flags&HASWIDTH) && op != '?')
6388 vFAIL("Regexp *+ operand could be empty");
6391 parse_start = RExC_parse;
6392 nextchar(pRExC_state);
6394 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
6396 if (op == '*' && (flags&SIMPLE)) {
6397 reginsert(pRExC_state, STAR, ret, depth+1);
6401 else if (op == '*') {
6405 else if (op == '+' && (flags&SIMPLE)) {
6406 reginsert(pRExC_state, PLUS, ret, depth+1);
6410 else if (op == '+') {
6414 else if (op == '?') {
6419 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
6421 "%.*s matches null string many times",
6422 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
6426 if (RExC_parse < RExC_end && *RExC_parse == '?') {
6427 nextchar(pRExC_state);
6428 reginsert(pRExC_state, MINMOD, ret, depth+1);
6429 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
6431 #ifndef REG_ALLOW_MINMOD_SUSPEND
6434 if (RExC_parse < RExC_end && *RExC_parse == '+') {
6436 nextchar(pRExC_state);
6437 ender = reg_node(pRExC_state, SUCCEED);
6438 REGTAIL(pRExC_state, ret, ender);
6439 reginsert(pRExC_state, SUSPEND, ret, depth+1);
6441 ender = reg_node(pRExC_state, TAIL);
6442 REGTAIL(pRExC_state, ret, ender);
6446 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
6448 vFAIL("Nested quantifiers");
6455 /* reg_namedseq(pRExC_state,UVp)
6457 This is expected to be called by a parser routine that has
6458 recognized'\N' and needs to handle the rest. RExC_parse is
6459 expected to point at the first char following the N at the time
6462 If valuep is non-null then it is assumed that we are parsing inside
6463 of a charclass definition and the first codepoint in the resolved
6464 string is returned via *valuep and the routine will return NULL.
6465 In this mode if a multichar string is returned from the charnames
6466 handler a warning will be issued, and only the first char in the
6467 sequence will be examined. If the string returned is zero length
6468 then the value of *valuep is undefined and NON-NULL will
6469 be returned to indicate failure. (This will NOT be a valid pointer
6472 If value is null then it is assumed that we are parsing normal text
6473 and inserts a new EXACT node into the program containing the resolved
6474 string and returns a pointer to the new node. If the string is
6475 zerolength a NOTHING node is emitted.
6477 On success RExC_parse is set to the char following the endbrace.
6478 Parsing failures will generate a fatal errorvia vFAIL(...)
6480 NOTE: We cache all results from the charnames handler locally in
6481 the RExC_charnames hash (created on first use) to prevent a charnames
6482 handler from playing silly-buggers and returning a short string and
6483 then a long string for a given pattern. Since the regexp program
6484 size is calculated during an initial parse this would result
6485 in a buffer overrun so we cache to prevent the charname result from
6486 changing during the course of the parse.
6490 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep)
6492 char * name; /* start of the content of the name */
6493 char * endbrace; /* endbrace following the name */
6496 STRLEN len; /* this has various purposes throughout the code */
6497 bool cached = 0; /* if this is true then we shouldn't refcount dev sv_str */
6498 regnode *ret = NULL;
6500 if (*RExC_parse != '{') {
6501 vFAIL("Missing braces on \\N{}");
6503 name = RExC_parse+1;
6504 endbrace = strchr(RExC_parse, '}');
6507 vFAIL("Missing right brace on \\N{}");
6509 RExC_parse = endbrace + 1;
6512 /* RExC_parse points at the beginning brace,
6513 endbrace points at the last */
6514 if ( name[0]=='U' && name[1]=='+' ) {
6515 /* its a "Unicode hex" notation {U+89AB} */
6516 I32 fl = PERL_SCAN_ALLOW_UNDERSCORES
6517 | PERL_SCAN_DISALLOW_PREFIX
6518 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
6521 len = (STRLEN)(endbrace - name - 2);
6522 cp = grok_hex(name + 2, &len, &fl, NULL);
6523 if ( len != (STRLEN)(endbrace - name - 2) ) {
6533 sv_str= newSVpvn(&string, 1);
6535 /* fetch the charnames handler for this scope */
6536 HV * const table = GvHV(PL_hintgv);
6538 hv_fetchs(table, "charnames", FALSE) :
6540 SV *cv= cvp ? *cvp : NULL;
6543 /* create an SV with the name as argument */
6544 sv_name = newSVpvn(name, endbrace - name);
6546 if (!table || !(PL_hints & HINT_LOCALIZE_HH)) {
6547 vFAIL2("Constant(\\N{%s}) unknown: "
6548 "(possibly a missing \"use charnames ...\")",
6551 if (!cvp || !SvOK(*cvp)) { /* when $^H{charnames} = undef; */
6552 vFAIL2("Constant(\\N{%s}): "
6553 "$^H{charnames} is not defined",SvPVX(sv_name));
6558 if (!RExC_charnames) {
6559 /* make sure our cache is allocated */
6560 RExC_charnames = newHV();
6561 sv_2mortal((SV*)RExC_charnames);
6563 /* see if we have looked this one up before */
6564 he_str = hv_fetch_ent( RExC_charnames, sv_name, 0, 0 );
6566 sv_str = HeVAL(he_str);
6579 count= call_sv(cv, G_SCALAR);
6581 if (count == 1) { /* XXXX is this right? dmq */
6583 SvREFCNT_inc_simple_void(sv_str);
6591 if ( !sv_str || !SvOK(sv_str) ) {
6592 vFAIL2("Constant(\\N{%s}): Call to &{$^H{charnames}} "
6593 "did not return a defined value",SvPVX(sv_name));
6595 if (hv_store_ent( RExC_charnames, sv_name, sv_str, 0))
6600 char *p = SvPV(sv_str, len);
6603 if ( SvUTF8(sv_str) ) {
6604 *valuep = utf8_to_uvchr((U8*)p, &numlen);
6608 We have to turn on utf8 for high bit chars otherwise
6609 we get failures with
6611 "ss" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
6612 "SS" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
6614 This is different from what \x{} would do with the same
6615 codepoint, where the condition is > 0xFF.
6622 /* warn if we havent used the whole string? */
6624 if (numlen<len && SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6626 "Ignoring excess chars from \\N{%s} in character class",
6630 } else if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6632 "Ignoring zero length \\N{%s} in character class",
6637 SvREFCNT_dec(sv_name);
6639 SvREFCNT_dec(sv_str);
6640 return len ? NULL : (regnode *)&len;
6641 } else if(SvCUR(sv_str)) {
6647 char * parse_start = name-3; /* needed for the offsets */
6649 GET_RE_DEBUG_FLAGS_DECL; /* needed for the offsets */
6651 ret = reg_node(pRExC_state,
6652 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
6655 if ( RExC_utf8 && !SvUTF8(sv_str) ) {
6656 sv_utf8_upgrade(sv_str);
6657 } else if ( !RExC_utf8 && SvUTF8(sv_str) ) {
6661 p = SvPV(sv_str, len);
6663 /* len is the length written, charlen is the size the char read */
6664 for ( len = 0; p < pend; p += charlen ) {
6666 UV uvc = utf8_to_uvchr((U8*)p, &charlen);
6668 STRLEN foldlen,numlen;
6669 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
6670 uvc = toFOLD_uni(uvc, tmpbuf, &foldlen);
6671 /* Emit all the Unicode characters. */
6673 for (foldbuf = tmpbuf;
6677 uvc = utf8_to_uvchr(foldbuf, &numlen);
6679 const STRLEN unilen = reguni(pRExC_state, uvc, s);
6682 /* In EBCDIC the numlen
6683 * and unilen can differ. */
6685 if (numlen >= foldlen)
6689 break; /* "Can't happen." */
6692 const STRLEN unilen = reguni(pRExC_state, uvc, s);
6704 RExC_size += STR_SZ(len);
6707 RExC_emit += STR_SZ(len);
6709 Set_Node_Cur_Length(ret); /* MJD */
6711 nextchar(pRExC_state);
6713 ret = reg_node(pRExC_state,NOTHING);
6716 SvREFCNT_dec(sv_str);
6719 SvREFCNT_dec(sv_name);
6729 * It returns the code point in utf8 for the value in *encp.
6730 * value: a code value in the source encoding
6731 * encp: a pointer to an Encode object
6733 * If the result from Encode is not a single character,
6734 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
6737 S_reg_recode(pTHX_ const char value, SV **encp)
6740 SV * const sv = sv_2mortal(newSVpvn(&value, numlen));
6741 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
6742 const STRLEN newlen = SvCUR(sv);
6743 UV uv = UNICODE_REPLACEMENT;
6747 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
6750 if (!newlen || numlen != newlen) {
6751 uv = UNICODE_REPLACEMENT;
6759 - regatom - the lowest level
6761 Try to identify anything special at the start of the pattern. If there
6762 is, then handle it as required. This may involve generating a single regop,
6763 such as for an assertion; or it may involve recursing, such as to
6764 handle a () structure.
6766 If the string doesn't start with something special then we gobble up
6767 as much literal text as we can.
6769 Once we have been able to handle whatever type of thing started the
6770 sequence, we return.
6772 Note: we have to be careful with escapes, as they can be both literal
6773 and special, and in the case of \10 and friends can either, depending
6774 on context. Specifically there are two seperate switches for handling
6775 escape sequences, with the one for handling literal escapes requiring
6776 a dummy entry for all of the special escapes that are actually handled
6781 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6784 register regnode *ret = NULL;
6786 char *parse_start = RExC_parse;
6787 GET_RE_DEBUG_FLAGS_DECL;
6788 DEBUG_PARSE("atom");
6789 *flagp = WORST; /* Tentatively. */
6793 switch ((U8)*RExC_parse) {
6795 RExC_seen_zerolen++;
6796 nextchar(pRExC_state);
6797 if (RExC_flags & RXf_PMf_MULTILINE)
6798 ret = reg_node(pRExC_state, MBOL);
6799 else if (RExC_flags & RXf_PMf_SINGLELINE)
6800 ret = reg_node(pRExC_state, SBOL);
6802 ret = reg_node(pRExC_state, BOL);
6803 Set_Node_Length(ret, 1); /* MJD */
6806 nextchar(pRExC_state);
6808 RExC_seen_zerolen++;
6809 if (RExC_flags & RXf_PMf_MULTILINE)
6810 ret = reg_node(pRExC_state, MEOL);
6811 else if (RExC_flags & RXf_PMf_SINGLELINE)
6812 ret = reg_node(pRExC_state, SEOL);
6814 ret = reg_node(pRExC_state, EOL);
6815 Set_Node_Length(ret, 1); /* MJD */
6818 nextchar(pRExC_state);
6819 if (RExC_flags & RXf_PMf_SINGLELINE)
6820 ret = reg_node(pRExC_state, SANY);
6822 ret = reg_node(pRExC_state, REG_ANY);
6823 *flagp |= HASWIDTH|SIMPLE;
6825 Set_Node_Length(ret, 1); /* MJD */
6829 char * const oregcomp_parse = ++RExC_parse;
6830 ret = regclass(pRExC_state,depth+1);
6831 if (*RExC_parse != ']') {
6832 RExC_parse = oregcomp_parse;
6833 vFAIL("Unmatched [");
6835 nextchar(pRExC_state);
6836 *flagp |= HASWIDTH|SIMPLE;
6837 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
6841 nextchar(pRExC_state);
6842 ret = reg(pRExC_state, 1, &flags,depth+1);
6844 if (flags & TRYAGAIN) {
6845 if (RExC_parse == RExC_end) {
6846 /* Make parent create an empty node if needed. */
6854 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
6858 if (flags & TRYAGAIN) {
6862 vFAIL("Internal urp");
6863 /* Supposed to be caught earlier. */
6866 if (!regcurly(RExC_parse)) {
6875 vFAIL("Quantifier follows nothing");
6883 len=0; /* silence a spurious compiler warning */
6884 if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) {
6885 *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */
6886 RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */
6887 ret = reganode(pRExC_state, FOLDCHAR, cp);
6888 Set_Node_Length(ret, 1); /* MJD */
6889 nextchar(pRExC_state); /* kill whitespace under /x */
6897 This switch handles escape sequences that resolve to some kind
6898 of special regop and not to literal text. Escape sequnces that
6899 resolve to literal text are handled below in the switch marked
6902 Every entry in this switch *must* have a corresponding entry
6903 in the literal escape switch. However, the opposite is not
6904 required, as the default for this switch is to jump to the
6905 literal text handling code.
6907 switch ((U8)*++RExC_parse) {
6912 /* Special Escapes */
6914 RExC_seen_zerolen++;
6915 ret = reg_node(pRExC_state, SBOL);
6917 goto finish_meta_pat;
6919 ret = reg_node(pRExC_state, GPOS);
6920 RExC_seen |= REG_SEEN_GPOS;
6922 goto finish_meta_pat;
6924 RExC_seen_zerolen++;
6925 ret = reg_node(pRExC_state, KEEPS);
6927 /* XXX:dmq : disabling in-place substitution seems to
6928 * be necessary here to avoid cases of memory corruption, as
6929 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
6931 RExC_seen |= REG_SEEN_LOOKBEHIND;
6932 goto finish_meta_pat;
6934 ret = reg_node(pRExC_state, SEOL);
6936 RExC_seen_zerolen++; /* Do not optimize RE away */
6937 goto finish_meta_pat;
6939 ret = reg_node(pRExC_state, EOS);
6941 RExC_seen_zerolen++; /* Do not optimize RE away */
6942 goto finish_meta_pat;
6944 ret = reg_node(pRExC_state, CANY);
6945 RExC_seen |= REG_SEEN_CANY;
6946 *flagp |= HASWIDTH|SIMPLE;
6947 goto finish_meta_pat;
6949 ret = reg_node(pRExC_state, CLUMP);
6951 goto finish_meta_pat;
6953 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
6954 *flagp |= HASWIDTH|SIMPLE;
6955 goto finish_meta_pat;
6957 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
6958 *flagp |= HASWIDTH|SIMPLE;
6959 goto finish_meta_pat;
6961 RExC_seen_zerolen++;
6962 RExC_seen |= REG_SEEN_LOOKBEHIND;
6963 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
6965 goto finish_meta_pat;
6967 RExC_seen_zerolen++;
6968 RExC_seen |= REG_SEEN_LOOKBEHIND;
6969 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
6971 goto finish_meta_pat;
6973 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
6974 *flagp |= HASWIDTH|SIMPLE;
6975 goto finish_meta_pat;
6977 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
6978 *flagp |= HASWIDTH|SIMPLE;
6979 goto finish_meta_pat;
6981 ret = reg_node(pRExC_state, DIGIT);
6982 *flagp |= HASWIDTH|SIMPLE;
6983 goto finish_meta_pat;
6985 ret = reg_node(pRExC_state, NDIGIT);
6986 *flagp |= HASWIDTH|SIMPLE;
6987 goto finish_meta_pat;
6989 ret = reg_node(pRExC_state, LNBREAK);
6990 *flagp |= HASWIDTH|SIMPLE;
6991 goto finish_meta_pat;
6993 ret = reg_node(pRExC_state, HORIZWS);
6994 *flagp |= HASWIDTH|SIMPLE;
6995 goto finish_meta_pat;
6997 ret = reg_node(pRExC_state, NHORIZWS);
6998 *flagp |= HASWIDTH|SIMPLE;
6999 goto finish_meta_pat;
7001 ret = reg_node(pRExC_state, VERTWS);
7002 *flagp |= HASWIDTH|SIMPLE;
7003 goto finish_meta_pat;
7005 ret = reg_node(pRExC_state, NVERTWS);
7006 *flagp |= HASWIDTH|SIMPLE;
7008 nextchar(pRExC_state);
7009 Set_Node_Length(ret, 2); /* MJD */
7014 char* const oldregxend = RExC_end;
7016 char* parse_start = RExC_parse - 2;
7019 if (RExC_parse[1] == '{') {
7020 /* a lovely hack--pretend we saw [\pX] instead */
7021 RExC_end = strchr(RExC_parse, '}');
7023 const U8 c = (U8)*RExC_parse;
7025 RExC_end = oldregxend;
7026 vFAIL2("Missing right brace on \\%c{}", c);
7031 RExC_end = RExC_parse + 2;
7032 if (RExC_end > oldregxend)
7033 RExC_end = oldregxend;
7037 ret = regclass(pRExC_state,depth+1);
7039 RExC_end = oldregxend;
7042 Set_Node_Offset(ret, parse_start + 2);
7043 Set_Node_Cur_Length(ret);
7044 nextchar(pRExC_state);
7045 *flagp |= HASWIDTH|SIMPLE;
7049 /* Handle \N{NAME} here and not below because it can be
7050 multicharacter. join_exact() will join them up later on.
7051 Also this makes sure that things like /\N{BLAH}+/ and
7052 \N{BLAH} being multi char Just Happen. dmq*/
7054 ret= reg_namedseq(pRExC_state, NULL);
7056 case 'k': /* Handle \k<NAME> and \k'NAME' */
7059 char ch= RExC_parse[1];
7060 if (ch != '<' && ch != '\'' && ch != '{') {
7062 vFAIL2("Sequence %.2s... not terminated",parse_start);
7064 /* this pretty much dupes the code for (?P=...) in reg(), if
7065 you change this make sure you change that */
7066 char* name_start = (RExC_parse += 2);
7068 SV *sv_dat = reg_scan_name(pRExC_state,
7069 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7070 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
7071 if (RExC_parse == name_start || *RExC_parse != ch)
7072 vFAIL2("Sequence %.3s... not terminated",parse_start);
7075 num = add_data( pRExC_state, 1, "S" );
7076 RExC_rxi->data->data[num]=(void*)sv_dat;
7077 SvREFCNT_inc_simple_void(sv_dat);
7081 ret = reganode(pRExC_state,
7082 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
7086 /* override incorrect value set in reganode MJD */
7087 Set_Node_Offset(ret, parse_start+1);
7088 Set_Node_Cur_Length(ret); /* MJD */
7089 nextchar(pRExC_state);
7095 case '1': case '2': case '3': case '4':
7096 case '5': case '6': case '7': case '8': case '9':
7099 bool isg = *RExC_parse == 'g';
7104 if (*RExC_parse == '{') {
7108 if (*RExC_parse == '-') {
7112 if (hasbrace && !isDIGIT(*RExC_parse)) {
7113 if (isrel) RExC_parse--;
7115 goto parse_named_seq;
7117 num = atoi(RExC_parse);
7118 if (isg && num == 0)
7119 vFAIL("Reference to invalid group 0");
7121 num = RExC_npar - num;
7123 vFAIL("Reference to nonexistent or unclosed group");
7125 if (!isg && num > 9 && num >= RExC_npar)
7128 char * const parse_start = RExC_parse - 1; /* MJD */
7129 while (isDIGIT(*RExC_parse))
7131 if (parse_start == RExC_parse - 1)
7132 vFAIL("Unterminated \\g... pattern");
7134 if (*RExC_parse != '}')
7135 vFAIL("Unterminated \\g{...} pattern");
7139 if (num > (I32)RExC_rx->nparens)
7140 vFAIL("Reference to nonexistent group");
7143 ret = reganode(pRExC_state,
7144 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
7148 /* override incorrect value set in reganode MJD */
7149 Set_Node_Offset(ret, parse_start+1);
7150 Set_Node_Cur_Length(ret); /* MJD */
7152 nextchar(pRExC_state);
7157 if (RExC_parse >= RExC_end)
7158 FAIL("Trailing \\");
7161 /* Do not generate "unrecognized" warnings here, we fall
7162 back into the quick-grab loop below */
7169 if (RExC_flags & RXf_PMf_EXTENDED) {
7170 if ( reg_skipcomment( pRExC_state ) )
7177 register STRLEN len;
7182 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7184 parse_start = RExC_parse - 1;
7190 ret = reg_node(pRExC_state,
7191 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
7193 for (len = 0, p = RExC_parse - 1;
7194 len < 127 && p < RExC_end;
7197 char * const oldp = p;
7199 if (RExC_flags & RXf_PMf_EXTENDED)
7200 p = regwhite( pRExC_state, p );
7205 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7206 goto normal_default;
7216 /* Literal Escapes Switch
7218 This switch is meant to handle escape sequences that
7219 resolve to a literal character.
7221 Every escape sequence that represents something
7222 else, like an assertion or a char class, is handled
7223 in the switch marked 'Special Escapes' above in this
7224 routine, but also has an entry here as anything that
7225 isn't explicitly mentioned here will be treated as
7226 an unescaped equivalent literal.
7230 /* These are all the special escapes. */
7234 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7235 goto normal_default;
7236 case 'A': /* Start assertion */
7237 case 'b': case 'B': /* Word-boundary assertion*/
7238 case 'C': /* Single char !DANGEROUS! */
7239 case 'd': case 'D': /* digit class */
7240 case 'g': case 'G': /* generic-backref, pos assertion */
7241 case 'h': case 'H': /* HORIZWS */
7242 case 'k': case 'K': /* named backref, keep marker */
7243 case 'N': /* named char sequence */
7244 case 'p': case 'P': /* Unicode property */
7245 case 'R': /* LNBREAK */
7246 case 's': case 'S': /* space class */
7247 case 'v': case 'V': /* VERTWS */
7248 case 'w': case 'W': /* word class */
7249 case 'X': /* eXtended Unicode "combining character sequence" */
7250 case 'z': case 'Z': /* End of line/string assertion */
7254 /* Anything after here is an escape that resolves to a
7255 literal. (Except digits, which may or may not)
7274 ender = ASCII_TO_NATIVE('\033');
7278 ender = ASCII_TO_NATIVE('\007');
7283 char* const e = strchr(p, '}');
7287 vFAIL("Missing right brace on \\x{}");
7290 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7291 | PERL_SCAN_DISALLOW_PREFIX;
7292 STRLEN numlen = e - p - 1;
7293 ender = grok_hex(p + 1, &numlen, &flags, NULL);
7300 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
7302 ender = grok_hex(p, &numlen, &flags, NULL);
7305 if (PL_encoding && ender < 0x100)
7306 goto recode_encoding;
7310 ender = UCHARAT(p++);
7311 ender = toCTRL(ender);
7313 case '0': case '1': case '2': case '3':case '4':
7314 case '5': case '6': case '7': case '8':case '9':
7316 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
7319 ender = grok_oct(p, &numlen, &flags, NULL);
7326 if (PL_encoding && ender < 0x100)
7327 goto recode_encoding;
7331 SV* enc = PL_encoding;
7332 ender = reg_recode((const char)(U8)ender, &enc);
7333 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
7334 vWARN(p, "Invalid escape in the specified encoding");
7340 FAIL("Trailing \\");
7343 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
7344 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
7345 goto normal_default;
7350 if (UTF8_IS_START(*p) && UTF) {
7352 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
7353 &numlen, UTF8_ALLOW_DEFAULT);
7360 if ( RExC_flags & RXf_PMf_EXTENDED)
7361 p = regwhite( pRExC_state, p );
7363 /* Prime the casefolded buffer. */
7364 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
7366 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
7371 /* Emit all the Unicode characters. */
7373 for (foldbuf = tmpbuf;
7375 foldlen -= numlen) {
7376 ender = utf8_to_uvchr(foldbuf, &numlen);
7378 const STRLEN unilen = reguni(pRExC_state, ender, s);
7381 /* In EBCDIC the numlen
7382 * and unilen can differ. */
7384 if (numlen >= foldlen)
7388 break; /* "Can't happen." */
7392 const STRLEN unilen = reguni(pRExC_state, ender, s);
7401 REGC((char)ender, s++);
7407 /* Emit all the Unicode characters. */
7409 for (foldbuf = tmpbuf;
7411 foldlen -= numlen) {
7412 ender = utf8_to_uvchr(foldbuf, &numlen);
7414 const STRLEN unilen = reguni(pRExC_state, ender, s);
7417 /* In EBCDIC the numlen
7418 * and unilen can differ. */
7420 if (numlen >= foldlen)
7428 const STRLEN unilen = reguni(pRExC_state, ender, s);
7437 REGC((char)ender, s++);
7441 Set_Node_Cur_Length(ret); /* MJD */
7442 nextchar(pRExC_state);
7444 /* len is STRLEN which is unsigned, need to copy to signed */
7447 vFAIL("Internal disaster");
7451 if (len == 1 && UNI_IS_INVARIANT(ender))
7455 RExC_size += STR_SZ(len);
7458 RExC_emit += STR_SZ(len);
7468 S_regwhite( RExC_state_t *pRExC_state, char *p )
7470 const char *e = RExC_end;
7474 else if (*p == '#') {
7483 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
7491 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
7492 Character classes ([:foo:]) can also be negated ([:^foo:]).
7493 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
7494 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
7495 but trigger failures because they are currently unimplemented. */
7497 #define POSIXCC_DONE(c) ((c) == ':')
7498 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
7499 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
7502 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
7505 I32 namedclass = OOB_NAMEDCLASS;
7507 if (value == '[' && RExC_parse + 1 < RExC_end &&
7508 /* I smell either [: or [= or [. -- POSIX has been here, right? */
7509 POSIXCC(UCHARAT(RExC_parse))) {
7510 const char c = UCHARAT(RExC_parse);
7511 char* const s = RExC_parse++;
7513 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
7515 if (RExC_parse == RExC_end)
7516 /* Grandfather lone [:, [=, [. */
7519 const char* const t = RExC_parse++; /* skip over the c */
7522 if (UCHARAT(RExC_parse) == ']') {
7523 const char *posixcc = s + 1;
7524 RExC_parse++; /* skip over the ending ] */
7527 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
7528 const I32 skip = t - posixcc;
7530 /* Initially switch on the length of the name. */
7533 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
7534 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
7537 /* Names all of length 5. */
7538 /* alnum alpha ascii blank cntrl digit graph lower
7539 print punct space upper */
7540 /* Offset 4 gives the best switch position. */
7541 switch (posixcc[4]) {
7543 if (memEQ(posixcc, "alph", 4)) /* alpha */
7544 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
7547 if (memEQ(posixcc, "spac", 4)) /* space */
7548 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
7551 if (memEQ(posixcc, "grap", 4)) /* graph */
7552 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
7555 if (memEQ(posixcc, "asci", 4)) /* ascii */
7556 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
7559 if (memEQ(posixcc, "blan", 4)) /* blank */
7560 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
7563 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
7564 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
7567 if (memEQ(posixcc, "alnu", 4)) /* alnum */
7568 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
7571 if (memEQ(posixcc, "lowe", 4)) /* lower */
7572 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
7573 else if (memEQ(posixcc, "uppe", 4)) /* upper */
7574 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
7577 if (memEQ(posixcc, "digi", 4)) /* digit */
7578 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
7579 else if (memEQ(posixcc, "prin", 4)) /* print */
7580 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
7581 else if (memEQ(posixcc, "punc", 4)) /* punct */
7582 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
7587 if (memEQ(posixcc, "xdigit", 6))
7588 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
7592 if (namedclass == OOB_NAMEDCLASS)
7593 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
7595 assert (posixcc[skip] == ':');
7596 assert (posixcc[skip+1] == ']');
7597 } else if (!SIZE_ONLY) {
7598 /* [[=foo=]] and [[.foo.]] are still future. */
7600 /* adjust RExC_parse so the warning shows after
7602 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
7604 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7607 /* Maternal grandfather:
7608 * "[:" ending in ":" but not in ":]" */
7618 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
7621 if (POSIXCC(UCHARAT(RExC_parse))) {
7622 const char *s = RExC_parse;
7623 const char c = *s++;
7627 if (*s && c == *s && s[1] == ']') {
7628 if (ckWARN(WARN_REGEXP))
7630 "POSIX syntax [%c %c] belongs inside character classes",
7633 /* [[=foo=]] and [[.foo.]] are still future. */
7634 if (POSIXCC_NOTYET(c)) {
7635 /* adjust RExC_parse so the error shows after
7637 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
7639 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7646 #define _C_C_T_(NAME,TEST,WORD) \
7649 ANYOF_CLASS_SET(ret, ANYOF_##NAME); \
7651 for (value = 0; value < 256; value++) \
7653 ANYOF_BITMAP_SET(ret, value); \
7658 case ANYOF_N##NAME: \
7660 ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \
7662 for (value = 0; value < 256; value++) \
7664 ANYOF_BITMAP_SET(ret, value); \
7670 #define _C_C_T_NOLOC_(NAME,TEST,WORD) \
7672 for (value = 0; value < 256; value++) \
7674 ANYOF_BITMAP_SET(ret, value); \
7678 case ANYOF_N##NAME: \
7679 for (value = 0; value < 256; value++) \
7681 ANYOF_BITMAP_SET(ret, value); \
7687 parse a class specification and produce either an ANYOF node that
7688 matches the pattern or if the pattern matches a single char only and
7689 that char is < 256 and we are case insensitive then we produce an
7694 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
7697 register UV nextvalue;
7698 register IV prevvalue = OOB_UNICODE;
7699 register IV range = 0;
7700 UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
7701 register regnode *ret;
7704 char *rangebegin = NULL;
7705 bool need_class = 0;
7708 bool optimize_invert = TRUE;
7709 AV* unicode_alternate = NULL;
7711 UV literal_endpoint = 0;
7713 UV stored = 0; /* number of chars stored in the class */
7715 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7716 case we need to change the emitted regop to an EXACT. */
7717 const char * orig_parse = RExC_parse;
7718 GET_RE_DEBUG_FLAGS_DECL;
7720 PERL_UNUSED_ARG(depth);
7723 DEBUG_PARSE("clas");
7725 /* Assume we are going to generate an ANYOF node. */
7726 ret = reganode(pRExC_state, ANYOF, 0);
7729 ANYOF_FLAGS(ret) = 0;
7731 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
7735 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
7739 RExC_size += ANYOF_SKIP;
7740 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
7743 RExC_emit += ANYOF_SKIP;
7745 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
7747 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
7748 ANYOF_BITMAP_ZERO(ret);
7749 listsv = newSVpvs("# comment\n");
7752 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
7754 if (!SIZE_ONLY && POSIXCC(nextvalue))
7755 checkposixcc(pRExC_state);
7757 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
7758 if (UCHARAT(RExC_parse) == ']')
7762 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
7766 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
7769 rangebegin = RExC_parse;
7771 value = utf8n_to_uvchr((U8*)RExC_parse,
7772 RExC_end - RExC_parse,
7773 &numlen, UTF8_ALLOW_DEFAULT);
7774 RExC_parse += numlen;
7777 value = UCHARAT(RExC_parse++);
7779 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
7780 if (value == '[' && POSIXCC(nextvalue))
7781 namedclass = regpposixcc(pRExC_state, value);
7782 else if (value == '\\') {
7784 value = utf8n_to_uvchr((U8*)RExC_parse,
7785 RExC_end - RExC_parse,
7786 &numlen, UTF8_ALLOW_DEFAULT);
7787 RExC_parse += numlen;
7790 value = UCHARAT(RExC_parse++);
7791 /* Some compilers cannot handle switching on 64-bit integer
7792 * values, therefore value cannot be an UV. Yes, this will
7793 * be a problem later if we want switch on Unicode.
7794 * A similar issue a little bit later when switching on
7795 * namedclass. --jhi */
7796 switch ((I32)value) {
7797 case 'w': namedclass = ANYOF_ALNUM; break;
7798 case 'W': namedclass = ANYOF_NALNUM; break;
7799 case 's': namedclass = ANYOF_SPACE; break;
7800 case 'S': namedclass = ANYOF_NSPACE; break;
7801 case 'd': namedclass = ANYOF_DIGIT; break;
7802 case 'D': namedclass = ANYOF_NDIGIT; break;
7803 case 'v': namedclass = ANYOF_VERTWS; break;
7804 case 'V': namedclass = ANYOF_NVERTWS; break;
7805 case 'h': namedclass = ANYOF_HORIZWS; break;
7806 case 'H': namedclass = ANYOF_NHORIZWS; break;
7807 case 'N': /* Handle \N{NAME} in class */
7809 /* We only pay attention to the first char of
7810 multichar strings being returned. I kinda wonder
7811 if this makes sense as it does change the behaviour
7812 from earlier versions, OTOH that behaviour was broken
7814 UV v; /* value is register so we cant & it /grrr */
7815 if (reg_namedseq(pRExC_state, &v)) {
7825 if (RExC_parse >= RExC_end)
7826 vFAIL2("Empty \\%c{}", (U8)value);
7827 if (*RExC_parse == '{') {
7828 const U8 c = (U8)value;
7829 e = strchr(RExC_parse++, '}');
7831 vFAIL2("Missing right brace on \\%c{}", c);
7832 while (isSPACE(UCHARAT(RExC_parse)))
7834 if (e == RExC_parse)
7835 vFAIL2("Empty \\%c{}", c);
7837 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
7845 if (UCHARAT(RExC_parse) == '^') {
7848 value = value == 'p' ? 'P' : 'p'; /* toggle */
7849 while (isSPACE(UCHARAT(RExC_parse))) {
7854 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
7855 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
7858 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
7859 namedclass = ANYOF_MAX; /* no official name, but it's named */
7862 case 'n': value = '\n'; break;
7863 case 'r': value = '\r'; break;
7864 case 't': value = '\t'; break;
7865 case 'f': value = '\f'; break;
7866 case 'b': value = '\b'; break;
7867 case 'e': value = ASCII_TO_NATIVE('\033');break;
7868 case 'a': value = ASCII_TO_NATIVE('\007');break;
7870 if (*RExC_parse == '{') {
7871 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7872 | PERL_SCAN_DISALLOW_PREFIX;
7873 char * const e = strchr(RExC_parse++, '}');
7875 vFAIL("Missing right brace on \\x{}");
7877 numlen = e - RExC_parse;
7878 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
7882 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
7884 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
7885 RExC_parse += numlen;
7887 if (PL_encoding && value < 0x100)
7888 goto recode_encoding;
7891 value = UCHARAT(RExC_parse++);
7892 value = toCTRL(value);
7894 case '0': case '1': case '2': case '3': case '4':
7895 case '5': case '6': case '7': case '8': case '9':
7899 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
7900 RExC_parse += numlen;
7901 if (PL_encoding && value < 0x100)
7902 goto recode_encoding;
7907 SV* enc = PL_encoding;
7908 value = reg_recode((const char)(U8)value, &enc);
7909 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
7911 "Invalid escape in the specified encoding");
7915 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
7917 "Unrecognized escape \\%c in character class passed through",
7921 } /* end of \blah */
7927 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
7929 if (!SIZE_ONLY && !need_class)
7930 ANYOF_CLASS_ZERO(ret);
7934 /* a bad range like a-\d, a-[:digit:] ? */
7937 if (ckWARN(WARN_REGEXP)) {
7939 RExC_parse >= rangebegin ?
7940 RExC_parse - rangebegin : 0;
7942 "False [] range \"%*.*s\"",
7945 if (prevvalue < 256) {
7946 ANYOF_BITMAP_SET(ret, prevvalue);
7947 ANYOF_BITMAP_SET(ret, '-');
7950 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
7951 Perl_sv_catpvf(aTHX_ listsv,
7952 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
7956 range = 0; /* this was not a true range */
7962 const char *what = NULL;
7965 if (namedclass > OOB_NAMEDCLASS)
7966 optimize_invert = FALSE;
7967 /* Possible truncation here but in some 64-bit environments
7968 * the compiler gets heartburn about switch on 64-bit values.
7969 * A similar issue a little earlier when switching on value.
7971 switch ((I32)namedclass) {
7972 case _C_C_T_(ALNUM, isALNUM(value), "Word");
7973 case _C_C_T_(ALNUMC, isALNUMC(value), "Alnum");
7974 case _C_C_T_(ALPHA, isALPHA(value), "Alpha");
7975 case _C_C_T_(BLANK, isBLANK(value), "Blank");
7976 case _C_C_T_(CNTRL, isCNTRL(value), "Cntrl");
7977 case _C_C_T_(GRAPH, isGRAPH(value), "Graph");
7978 case _C_C_T_(LOWER, isLOWER(value), "Lower");
7979 case _C_C_T_(PRINT, isPRINT(value), "Print");
7980 case _C_C_T_(PSXSPC, isPSXSPC(value), "Space");
7981 case _C_C_T_(PUNCT, isPUNCT(value), "Punct");
7982 case _C_C_T_(SPACE, isSPACE(value), "SpacePerl");
7983 case _C_C_T_(UPPER, isUPPER(value), "Upper");
7984 case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit");
7985 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
7986 case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
7989 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
7992 for (value = 0; value < 128; value++)
7993 ANYOF_BITMAP_SET(ret, value);
7995 for (value = 0; value < 256; value++) {
7997 ANYOF_BITMAP_SET(ret, value);
8006 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
8009 for (value = 128; value < 256; value++)
8010 ANYOF_BITMAP_SET(ret, value);
8012 for (value = 0; value < 256; value++) {
8013 if (!isASCII(value))
8014 ANYOF_BITMAP_SET(ret, value);
8023 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
8025 /* consecutive digits assumed */
8026 for (value = '0'; value <= '9'; value++)
8027 ANYOF_BITMAP_SET(ret, value);
8034 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
8036 /* consecutive digits assumed */
8037 for (value = 0; value < '0'; value++)
8038 ANYOF_BITMAP_SET(ret, value);
8039 for (value = '9' + 1; value < 256; value++)
8040 ANYOF_BITMAP_SET(ret, value);
8046 /* this is to handle \p and \P */
8049 vFAIL("Invalid [::] class");
8053 /* Strings such as "+utf8::isWord\n" */
8054 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
8057 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
8060 } /* end of namedclass \blah */
8063 if (prevvalue > (IV)value) /* b-a */ {
8064 const int w = RExC_parse - rangebegin;
8065 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
8066 range = 0; /* not a valid range */
8070 prevvalue = value; /* save the beginning of the range */
8071 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
8072 RExC_parse[1] != ']') {
8075 /* a bad range like \w-, [:word:]- ? */
8076 if (namedclass > OOB_NAMEDCLASS) {
8077 if (ckWARN(WARN_REGEXP)) {
8079 RExC_parse >= rangebegin ?
8080 RExC_parse - rangebegin : 0;
8082 "False [] range \"%*.*s\"",
8086 ANYOF_BITMAP_SET(ret, '-');
8088 range = 1; /* yeah, it's a range! */
8089 continue; /* but do it the next time */
8093 /* now is the next time */
8094 /*stored += (value - prevvalue + 1);*/
8096 if (prevvalue < 256) {
8097 const IV ceilvalue = value < 256 ? value : 255;
8100 /* In EBCDIC [\x89-\x91] should include
8101 * the \x8e but [i-j] should not. */
8102 if (literal_endpoint == 2 &&
8103 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
8104 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
8106 if (isLOWER(prevvalue)) {
8107 for (i = prevvalue; i <= ceilvalue; i++)
8108 if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8110 ANYOF_BITMAP_SET(ret, i);
8113 for (i = prevvalue; i <= ceilvalue; i++)
8114 if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8116 ANYOF_BITMAP_SET(ret, i);
8122 for (i = prevvalue; i <= ceilvalue; i++) {
8123 if (!ANYOF_BITMAP_TEST(ret,i)) {
8125 ANYOF_BITMAP_SET(ret, i);
8129 if (value > 255 || UTF) {
8130 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
8131 const UV natvalue = NATIVE_TO_UNI(value);
8132 stored+=2; /* can't optimize this class */
8133 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8134 if (prevnatvalue < natvalue) { /* what about > ? */
8135 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
8136 prevnatvalue, natvalue);
8138 else if (prevnatvalue == natvalue) {
8139 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
8141 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
8143 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
8145 #ifdef EBCDIC /* RD t/uni/fold ff and 6b */
8146 if (RExC_precomp[0] == ':' &&
8147 RExC_precomp[1] == '[' &&
8148 (f == 0xDF || f == 0x92)) {
8149 f = NATIVE_TO_UNI(f);
8152 /* If folding and foldable and a single
8153 * character, insert also the folded version
8154 * to the charclass. */
8156 #ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
8157 if ((RExC_precomp[0] == ':' &&
8158 RExC_precomp[1] == '[' &&
8160 (value == 0xFB05 || value == 0xFB06))) ?
8161 foldlen == ((STRLEN)UNISKIP(f) - 1) :
8162 foldlen == (STRLEN)UNISKIP(f) )
8164 if (foldlen == (STRLEN)UNISKIP(f))
8166 Perl_sv_catpvf(aTHX_ listsv,
8169 /* Any multicharacter foldings
8170 * require the following transform:
8171 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
8172 * where E folds into "pq" and F folds
8173 * into "rst", all other characters
8174 * fold to single characters. We save
8175 * away these multicharacter foldings,
8176 * to be later saved as part of the
8177 * additional "s" data. */
8180 if (!unicode_alternate)
8181 unicode_alternate = newAV();
8182 sv = newSVpvn((char*)foldbuf, foldlen);
8184 av_push(unicode_alternate, sv);
8188 /* If folding and the value is one of the Greek
8189 * sigmas insert a few more sigmas to make the
8190 * folding rules of the sigmas to work right.
8191 * Note that not all the possible combinations
8192 * are handled here: some of them are handled
8193 * by the standard folding rules, and some of
8194 * them (literal or EXACTF cases) are handled
8195 * during runtime in regexec.c:S_find_byclass(). */
8196 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
8197 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8198 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
8199 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8200 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8202 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
8203 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8204 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8209 literal_endpoint = 0;
8213 range = 0; /* this range (if it was one) is done now */
8217 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
8219 RExC_size += ANYOF_CLASS_ADD_SKIP;
8221 RExC_emit += ANYOF_CLASS_ADD_SKIP;
8227 /****** !SIZE_ONLY AFTER HERE *********/
8229 if( stored == 1 && (value < 128 || (value < 256 && !UTF))
8230 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
8232 /* optimize single char class to an EXACT node
8233 but *only* when its not a UTF/high char */
8234 const char * cur_parse= RExC_parse;
8235 RExC_emit = (regnode *)orig_emit;
8236 RExC_parse = (char *)orig_parse;
8237 ret = reg_node(pRExC_state,
8238 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
8239 RExC_parse = (char *)cur_parse;
8240 *STRING(ret)= (char)value;
8242 RExC_emit += STR_SZ(1);
8245 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
8246 if ( /* If the only flag is folding (plus possibly inversion). */
8247 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
8249 for (value = 0; value < 256; ++value) {
8250 if (ANYOF_BITMAP_TEST(ret, value)) {
8251 UV fold = PL_fold[value];
8254 ANYOF_BITMAP_SET(ret, fold);
8257 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
8260 /* optimize inverted simple patterns (e.g. [^a-z]) */
8261 if (optimize_invert &&
8262 /* If the only flag is inversion. */
8263 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
8264 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
8265 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
8266 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
8269 AV * const av = newAV();
8271 /* The 0th element stores the character class description
8272 * in its textual form: used later (regexec.c:Perl_regclass_swash())
8273 * to initialize the appropriate swash (which gets stored in
8274 * the 1st element), and also useful for dumping the regnode.
8275 * The 2nd element stores the multicharacter foldings,
8276 * used later (regexec.c:S_reginclass()). */
8277 av_store(av, 0, listsv);
8278 av_store(av, 1, NULL);
8279 av_store(av, 2, (SV*)unicode_alternate);
8280 rv = newRV_noinc((SV*)av);
8281 n = add_data(pRExC_state, 1, "s");
8282 RExC_rxi->data->data[n] = (void*)rv;
8290 /* reg_skipcomment()
8292 Absorbs an /x style # comments from the input stream.
8293 Returns true if there is more text remaining in the stream.
8294 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
8295 terminates the pattern without including a newline.
8297 Note its the callers responsibility to ensure that we are
8303 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
8306 while (RExC_parse < RExC_end)
8307 if (*RExC_parse++ == '\n') {
8312 /* we ran off the end of the pattern without ending
8313 the comment, so we have to add an \n when wrapping */
8314 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
8322 Advance that parse position, and optionall absorbs
8323 "whitespace" from the inputstream.
8325 Without /x "whitespace" means (?#...) style comments only,
8326 with /x this means (?#...) and # comments and whitespace proper.
8328 Returns the RExC_parse point from BEFORE the scan occurs.
8330 This is the /x friendly way of saying RExC_parse++.
8334 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
8336 char* const retval = RExC_parse++;
8339 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
8340 RExC_parse[2] == '#') {
8341 while (*RExC_parse != ')') {
8342 if (RExC_parse == RExC_end)
8343 FAIL("Sequence (?#... not terminated");
8349 if (RExC_flags & RXf_PMf_EXTENDED) {
8350 if (isSPACE(*RExC_parse)) {
8354 else if (*RExC_parse == '#') {
8355 if ( reg_skipcomment( pRExC_state ) )
8364 - reg_node - emit a node
8366 STATIC regnode * /* Location. */
8367 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
8370 register regnode *ptr;
8371 regnode * const ret = RExC_emit;
8372 GET_RE_DEBUG_FLAGS_DECL;
8375 SIZE_ALIGN(RExC_size);
8379 if (RExC_emit >= RExC_emit_bound)
8380 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8382 NODE_ALIGN_FILL(ret);
8384 FILL_ADVANCE_NODE(ptr, op);
8385 #ifdef RE_TRACK_PATTERN_OFFSETS
8386 if (RExC_offsets) { /* MJD */
8387 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
8388 "reg_node", __LINE__,
8390 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
8391 ? "Overwriting end of array!\n" : "OK",
8392 (UV)(RExC_emit - RExC_emit_start),
8393 (UV)(RExC_parse - RExC_start),
8394 (UV)RExC_offsets[0]));
8395 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
8403 - reganode - emit a node with an argument
8405 STATIC regnode * /* Location. */
8406 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
8409 register regnode *ptr;
8410 regnode * const ret = RExC_emit;
8411 GET_RE_DEBUG_FLAGS_DECL;
8414 SIZE_ALIGN(RExC_size);
8419 assert(2==regarglen[op]+1);
8421 Anything larger than this has to allocate the extra amount.
8422 If we changed this to be:
8424 RExC_size += (1 + regarglen[op]);
8426 then it wouldn't matter. Its not clear what side effect
8427 might come from that so its not done so far.
8432 if (RExC_emit >= RExC_emit_bound)
8433 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8435 NODE_ALIGN_FILL(ret);
8437 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
8438 #ifdef RE_TRACK_PATTERN_OFFSETS
8439 if (RExC_offsets) { /* MJD */
8440 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
8444 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
8445 "Overwriting end of array!\n" : "OK",
8446 (UV)(RExC_emit - RExC_emit_start),
8447 (UV)(RExC_parse - RExC_start),
8448 (UV)RExC_offsets[0]));
8449 Set_Cur_Node_Offset;
8457 - reguni - emit (if appropriate) a Unicode character
8460 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
8463 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
8467 - reginsert - insert an operator in front of already-emitted operand
8469 * Means relocating the operand.
8472 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
8475 register regnode *src;
8476 register regnode *dst;
8477 register regnode *place;
8478 const int offset = regarglen[(U8)op];
8479 const int size = NODE_STEP_REGNODE + offset;
8480 GET_RE_DEBUG_FLAGS_DECL;
8481 PERL_UNUSED_ARG(depth);
8482 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
8483 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
8492 if (RExC_open_parens) {
8494 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
8495 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
8496 if ( RExC_open_parens[paren] >= opnd ) {
8497 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
8498 RExC_open_parens[paren] += size;
8500 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
8502 if ( RExC_close_parens[paren] >= opnd ) {
8503 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
8504 RExC_close_parens[paren] += size;
8506 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
8511 while (src > opnd) {
8512 StructCopy(--src, --dst, regnode);
8513 #ifdef RE_TRACK_PATTERN_OFFSETS
8514 if (RExC_offsets) { /* MJD 20010112 */
8515 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
8519 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
8520 ? "Overwriting end of array!\n" : "OK",
8521 (UV)(src - RExC_emit_start),
8522 (UV)(dst - RExC_emit_start),
8523 (UV)RExC_offsets[0]));
8524 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
8525 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
8531 place = opnd; /* Op node, where operand used to be. */
8532 #ifdef RE_TRACK_PATTERN_OFFSETS
8533 if (RExC_offsets) { /* MJD */
8534 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
8538 (UV)(place - RExC_emit_start) > RExC_offsets[0]
8539 ? "Overwriting end of array!\n" : "OK",
8540 (UV)(place - RExC_emit_start),
8541 (UV)(RExC_parse - RExC_start),
8542 (UV)RExC_offsets[0]));
8543 Set_Node_Offset(place, RExC_parse);
8544 Set_Node_Length(place, 1);
8547 src = NEXTOPER(place);
8548 FILL_ADVANCE_NODE(place, op);
8549 Zero(src, offset, regnode);
8553 - regtail - set the next-pointer at the end of a node chain of p to val.
8554 - SEE ALSO: regtail_study
8556 /* TODO: All three parms should be const */
8558 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
8561 register regnode *scan;
8562 GET_RE_DEBUG_FLAGS_DECL;
8564 PERL_UNUSED_ARG(depth);
8570 /* Find last node. */
8573 regnode * const temp = regnext(scan);
8575 SV * const mysv=sv_newmortal();
8576 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
8577 regprop(RExC_rx, mysv, scan);
8578 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
8579 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
8580 (temp == NULL ? "->" : ""),
8581 (temp == NULL ? PL_reg_name[OP(val)] : "")
8589 if (reg_off_by_arg[OP(scan)]) {
8590 ARG_SET(scan, val - scan);
8593 NEXT_OFF(scan) = val - scan;
8599 - regtail_study - set the next-pointer at the end of a node chain of p to val.
8600 - Look for optimizable sequences at the same time.
8601 - currently only looks for EXACT chains.
8603 This is expermental code. The idea is to use this routine to perform
8604 in place optimizations on branches and groups as they are constructed,
8605 with the long term intention of removing optimization from study_chunk so
8606 that it is purely analytical.
8608 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
8609 to control which is which.
8612 /* TODO: All four parms should be const */
8615 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
8618 register regnode *scan;
8620 #ifdef EXPERIMENTAL_INPLACESCAN
8624 GET_RE_DEBUG_FLAGS_DECL;
8630 /* Find last node. */
8634 regnode * const temp = regnext(scan);
8635 #ifdef EXPERIMENTAL_INPLACESCAN
8636 if (PL_regkind[OP(scan)] == EXACT)
8637 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
8645 if( exact == PSEUDO )
8647 else if ( exact != OP(scan) )
8656 SV * const mysv=sv_newmortal();
8657 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
8658 regprop(RExC_rx, mysv, scan);
8659 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
8660 SvPV_nolen_const(mysv),
8662 PL_reg_name[exact]);
8669 SV * const mysv_val=sv_newmortal();
8670 DEBUG_PARSE_MSG("");
8671 regprop(RExC_rx, mysv_val, val);
8672 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
8673 SvPV_nolen_const(mysv_val),
8674 (IV)REG_NODE_NUM(val),
8678 if (reg_off_by_arg[OP(scan)]) {
8679 ARG_SET(scan, val - scan);
8682 NEXT_OFF(scan) = val - scan;
8690 - regcurly - a little FSA that accepts {\d+,?\d*}
8693 S_regcurly(register const char *s)
8712 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
8716 S_regdump_extflags(pTHX_ const char *lead, const U32 flags) {
8719 for (bit=0; bit<32; bit++) {
8720 if (flags & (1<<bit)) {
8722 PerlIO_printf(Perl_debug_log, "%s",lead);
8723 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
8728 PerlIO_printf(Perl_debug_log, "\n");
8730 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
8736 Perl_regdump(pTHX_ const regexp *r)
8740 SV * const sv = sv_newmortal();
8741 SV *dsv= sv_newmortal();
8743 GET_RE_DEBUG_FLAGS_DECL;
8745 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
8747 /* Header fields of interest. */
8748 if (r->anchored_substr) {
8749 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
8750 RE_SV_DUMPLEN(r->anchored_substr), 30);
8751 PerlIO_printf(Perl_debug_log,
8752 "anchored %s%s at %"IVdf" ",
8753 s, RE_SV_TAIL(r->anchored_substr),
8754 (IV)r->anchored_offset);
8755 } else if (r->anchored_utf8) {
8756 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
8757 RE_SV_DUMPLEN(r->anchored_utf8), 30);
8758 PerlIO_printf(Perl_debug_log,
8759 "anchored utf8 %s%s at %"IVdf" ",
8760 s, RE_SV_TAIL(r->anchored_utf8),
8761 (IV)r->anchored_offset);
8763 if (r->float_substr) {
8764 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
8765 RE_SV_DUMPLEN(r->float_substr), 30);
8766 PerlIO_printf(Perl_debug_log,
8767 "floating %s%s at %"IVdf"..%"UVuf" ",
8768 s, RE_SV_TAIL(r->float_substr),
8769 (IV)r->float_min_offset, (UV)r->float_max_offset);
8770 } else if (r->float_utf8) {
8771 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
8772 RE_SV_DUMPLEN(r->float_utf8), 30);
8773 PerlIO_printf(Perl_debug_log,
8774 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
8775 s, RE_SV_TAIL(r->float_utf8),
8776 (IV)r->float_min_offset, (UV)r->float_max_offset);
8778 if (r->check_substr || r->check_utf8)
8779 PerlIO_printf(Perl_debug_log,
8781 (r->check_substr == r->float_substr
8782 && r->check_utf8 == r->float_utf8
8783 ? "(checking floating" : "(checking anchored"));
8784 if (r->extflags & RXf_NOSCAN)
8785 PerlIO_printf(Perl_debug_log, " noscan");
8786 if (r->extflags & RXf_CHECK_ALL)
8787 PerlIO_printf(Perl_debug_log, " isall");
8788 if (r->check_substr || r->check_utf8)
8789 PerlIO_printf(Perl_debug_log, ") ");
8791 if (ri->regstclass) {
8792 regprop(r, sv, ri->regstclass);
8793 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
8795 if (r->extflags & RXf_ANCH) {
8796 PerlIO_printf(Perl_debug_log, "anchored");
8797 if (r->extflags & RXf_ANCH_BOL)
8798 PerlIO_printf(Perl_debug_log, "(BOL)");
8799 if (r->extflags & RXf_ANCH_MBOL)
8800 PerlIO_printf(Perl_debug_log, "(MBOL)");
8801 if (r->extflags & RXf_ANCH_SBOL)
8802 PerlIO_printf(Perl_debug_log, "(SBOL)");
8803 if (r->extflags & RXf_ANCH_GPOS)
8804 PerlIO_printf(Perl_debug_log, "(GPOS)");
8805 PerlIO_putc(Perl_debug_log, ' ');
8807 if (r->extflags & RXf_GPOS_SEEN)
8808 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
8809 if (r->intflags & PREGf_SKIP)
8810 PerlIO_printf(Perl_debug_log, "plus ");
8811 if (r->intflags & PREGf_IMPLICIT)
8812 PerlIO_printf(Perl_debug_log, "implicit ");
8813 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
8814 if (r->extflags & RXf_EVAL_SEEN)
8815 PerlIO_printf(Perl_debug_log, "with eval ");
8816 PerlIO_printf(Perl_debug_log, "\n");
8817 DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
8819 PERL_UNUSED_CONTEXT;
8821 #endif /* DEBUGGING */
8825 - regprop - printable representation of opcode
8828 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
8833 RXi_GET_DECL(prog,progi);
8834 GET_RE_DEBUG_FLAGS_DECL;
8837 sv_setpvn(sv, "", 0);
8839 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
8840 /* It would be nice to FAIL() here, but this may be called from
8841 regexec.c, and it would be hard to supply pRExC_state. */
8842 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
8843 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
8845 k = PL_regkind[OP(o)];
8849 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
8850 * is a crude hack but it may be the best for now since
8851 * we have no flag "this EXACTish node was UTF-8"
8853 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
8854 PERL_PV_ESCAPE_UNI_DETECT |
8855 PERL_PV_PRETTY_ELLIPSES |
8856 PERL_PV_PRETTY_LTGT |
8857 PERL_PV_PRETTY_NOCLEAR
8859 } else if (k == TRIE) {
8860 /* print the details of the trie in dumpuntil instead, as
8861 * progi->data isn't available here */
8862 const char op = OP(o);
8863 const U32 n = ARG(o);
8864 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
8865 (reg_ac_data *)progi->data->data[n] :
8867 const reg_trie_data * const trie
8868 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
8870 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
8871 DEBUG_TRIE_COMPILE_r(
8872 Perl_sv_catpvf(aTHX_ sv,
8873 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
8874 (UV)trie->startstate,
8875 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
8876 (UV)trie->wordcount,
8879 (UV)TRIE_CHARCOUNT(trie),
8880 (UV)trie->uniquecharcount
8883 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
8885 int rangestart = -1;
8886 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
8888 for (i = 0; i <= 256; i++) {
8889 if (i < 256 && BITMAP_TEST(bitmap,i)) {
8890 if (rangestart == -1)
8892 } else if (rangestart != -1) {
8893 if (i <= rangestart + 3)
8894 for (; rangestart < i; rangestart++)
8895 put_byte(sv, rangestart);
8897 put_byte(sv, rangestart);
8899 put_byte(sv, i - 1);
8907 } else if (k == CURLY) {
8908 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
8909 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
8910 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
8912 else if (k == WHILEM && o->flags) /* Ordinal/of */
8913 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
8914 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
8915 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
8916 if ( prog->paren_names ) {
8917 if ( k != REF || OP(o) < NREF) {
8918 AV *list= (AV *)progi->data->data[progi->name_list_idx];
8919 SV **name= av_fetch(list, ARG(o), 0 );
8921 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
8924 AV *list= (AV *)progi->data->data[ progi->name_list_idx ];
8925 SV *sv_dat=(SV*)progi->data->data[ ARG( o ) ];
8926 I32 *nums=(I32*)SvPVX(sv_dat);
8927 SV **name= av_fetch(list, nums[0], 0 );
8930 for ( n=0; n<SvIVX(sv_dat); n++ ) {
8931 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
8932 (n ? "," : ""), (IV)nums[n]);
8934 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
8938 } else if (k == GOSUB)
8939 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
8940 else if (k == VERB) {
8942 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
8943 SVfARG((SV*)progi->data->data[ ARG( o ) ]));
8944 } else if (k == LOGICAL)
8945 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
8946 else if (k == FOLDCHAR)
8947 Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
8948 else if (k == ANYOF) {
8949 int i, rangestart = -1;
8950 const U8 flags = ANYOF_FLAGS(o);
8952 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
8953 static const char * const anyofs[] = {
8986 if (flags & ANYOF_LOCALE)
8987 sv_catpvs(sv, "{loc}");
8988 if (flags & ANYOF_FOLD)
8989 sv_catpvs(sv, "{i}");
8990 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
8991 if (flags & ANYOF_INVERT)
8993 for (i = 0; i <= 256; i++) {
8994 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
8995 if (rangestart == -1)
8997 } else if (rangestart != -1) {
8998 if (i <= rangestart + 3)
8999 for (; rangestart < i; rangestart++)
9000 put_byte(sv, rangestart);
9002 put_byte(sv, rangestart);
9004 put_byte(sv, i - 1);
9010 if (o->flags & ANYOF_CLASS)
9011 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
9012 if (ANYOF_CLASS_TEST(o,i))
9013 sv_catpv(sv, anyofs[i]);
9015 if (flags & ANYOF_UNICODE)
9016 sv_catpvs(sv, "{unicode}");
9017 else if (flags & ANYOF_UNICODE_ALL)
9018 sv_catpvs(sv, "{unicode_all}");
9022 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
9026 U8 s[UTF8_MAXBYTES_CASE+1];
9028 for (i = 0; i <= 256; i++) { /* just the first 256 */
9029 uvchr_to_utf8(s, i);
9031 if (i < 256 && swash_fetch(sw, s, TRUE)) {
9032 if (rangestart == -1)
9034 } else if (rangestart != -1) {
9035 if (i <= rangestart + 3)
9036 for (; rangestart < i; rangestart++) {
9037 const U8 * const e = uvchr_to_utf8(s,rangestart);
9039 for(p = s; p < e; p++)
9043 const U8 *e = uvchr_to_utf8(s,rangestart);
9045 for (p = s; p < e; p++)
9048 e = uvchr_to_utf8(s, i-1);
9049 for (p = s; p < e; p++)
9056 sv_catpvs(sv, "..."); /* et cetera */
9060 char *s = savesvpv(lv);
9061 char * const origs = s;
9063 while (*s && *s != '\n')
9067 const char * const t = ++s;
9085 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
9087 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
9088 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
9090 PERL_UNUSED_CONTEXT;
9091 PERL_UNUSED_ARG(sv);
9093 PERL_UNUSED_ARG(prog);
9094 #endif /* DEBUGGING */
9098 Perl_re_intuit_string(pTHX_ REGEXP * const prog)
9099 { /* Assume that RE_INTUIT is set */
9101 GET_RE_DEBUG_FLAGS_DECL;
9102 PERL_UNUSED_CONTEXT;
9106 const char * const s = SvPV_nolen_const(prog->check_substr
9107 ? prog->check_substr : prog->check_utf8);
9109 if (!PL_colorset) reginitcolors();
9110 PerlIO_printf(Perl_debug_log,
9111 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
9113 prog->check_substr ? "" : "utf8 ",
9114 PL_colors[5],PL_colors[0],
9117 (strlen(s) > 60 ? "..." : ""));
9120 return prog->check_substr ? prog->check_substr : prog->check_utf8;
9126 handles refcounting and freeing the perl core regexp structure. When
9127 it is necessary to actually free the structure the first thing it
9128 does is call the 'free' method of the regexp_engine associated to to
9129 the regexp, allowing the handling of the void *pprivate; member
9130 first. (This routine is not overridable by extensions, which is why
9131 the extensions free is called first.)
9133 See regdupe and regdupe_internal if you change anything here.
9135 #ifndef PERL_IN_XSUB_RE
9137 Perl_pregfree(pTHX_ struct regexp *r)
9140 GET_RE_DEBUG_FLAGS_DECL;
9142 if (!r || (--r->refcnt > 0))
9145 ReREFCNT_dec(r->mother_re);
9147 CALLREGFREE_PVT(r); /* free the private data */
9149 SvREFCNT_dec(r->paren_names);
9150 Safefree(RXp_WRAPPED(r));
9153 if (r->anchored_substr)
9154 SvREFCNT_dec(r->anchored_substr);
9155 if (r->anchored_utf8)
9156 SvREFCNT_dec(r->anchored_utf8);
9157 if (r->float_substr)
9158 SvREFCNT_dec(r->float_substr);
9160 SvREFCNT_dec(r->float_utf8);
9161 Safefree(r->substrs);
9163 RX_MATCH_COPY_FREE(r);
9164 #ifdef PERL_OLD_COPY_ON_WRITE
9166 SvREFCNT_dec(r->saved_copy);
9175 This is a hacky workaround to the structural issue of match results
9176 being stored in the regexp structure which is in turn stored in
9177 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
9178 could be PL_curpm in multiple contexts, and could require multiple
9179 result sets being associated with the pattern simultaneously, such
9180 as when doing a recursive match with (??{$qr})
9182 The solution is to make a lightweight copy of the regexp structure
9183 when a qr// is returned from the code executed by (??{$qr}) this
9184 lightweight copy doesnt actually own any of its data except for
9185 the starp/end and the actual regexp structure itself.
9191 Perl_reg_temp_copy (pTHX_ struct regexp *r) {
9193 register const I32 npar = r->nparens+1;
9194 (void)ReREFCNT_inc(r);
9195 Newx(ret, 1, regexp);
9196 StructCopy(r, ret, regexp);
9197 Newx(ret->offs, npar, regexp_paren_pair);
9198 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
9201 Newx(ret->substrs, 1, struct reg_substr_data);
9202 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9204 SvREFCNT_inc_void(ret->anchored_substr);
9205 SvREFCNT_inc_void(ret->anchored_utf8);
9206 SvREFCNT_inc_void(ret->float_substr);
9207 SvREFCNT_inc_void(ret->float_utf8);
9209 /* check_substr and check_utf8, if non-NULL, point to either their
9210 anchored or float namesakes, and don't hold a second reference. */
9212 RX_MATCH_COPIED_off(ret);
9213 #ifdef PERL_OLD_COPY_ON_WRITE
9214 ret->saved_copy = NULL;
9223 /* regfree_internal()
9225 Free the private data in a regexp. This is overloadable by
9226 extensions. Perl takes care of the regexp structure in pregfree(),
9227 this covers the *pprivate pointer which technically perldoesnt
9228 know about, however of course we have to handle the
9229 regexp_internal structure when no extension is in use.
9231 Note this is called before freeing anything in the regexp
9236 Perl_regfree_internal(pTHX_ REGEXP * const r)
9240 GET_RE_DEBUG_FLAGS_DECL;
9246 SV *dsv= sv_newmortal();
9247 RE_PV_QUOTED_DECL(s, (r->extflags & RXf_UTF8),
9248 dsv, RXp_PRECOMP(r), RXp_PRELEN(r), 60);
9249 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
9250 PL_colors[4],PL_colors[5],s);
9253 #ifdef RE_TRACK_PATTERN_OFFSETS
9255 Safefree(ri->u.offsets); /* 20010421 MJD */
9258 int n = ri->data->count;
9259 PAD* new_comppad = NULL;
9264 /* If you add a ->what type here, update the comment in regcomp.h */
9265 switch (ri->data->what[n]) {
9269 SvREFCNT_dec((SV*)ri->data->data[n]);
9272 Safefree(ri->data->data[n]);
9275 new_comppad = (AV*)ri->data->data[n];
9278 if (new_comppad == NULL)
9279 Perl_croak(aTHX_ "panic: pregfree comppad");
9280 PAD_SAVE_LOCAL(old_comppad,
9281 /* Watch out for global destruction's random ordering. */
9282 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
9285 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
9288 op_free((OP_4tree*)ri->data->data[n]);
9290 PAD_RESTORE_LOCAL(old_comppad);
9291 SvREFCNT_dec((SV*)new_comppad);
9297 { /* Aho Corasick add-on structure for a trie node.
9298 Used in stclass optimization only */
9300 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
9302 refcount = --aho->refcount;
9305 PerlMemShared_free(aho->states);
9306 PerlMemShared_free(aho->fail);
9307 /* do this last!!!! */
9308 PerlMemShared_free(ri->data->data[n]);
9309 PerlMemShared_free(ri->regstclass);
9315 /* trie structure. */
9317 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
9319 refcount = --trie->refcount;
9322 PerlMemShared_free(trie->charmap);
9323 PerlMemShared_free(trie->states);
9324 PerlMemShared_free(trie->trans);
9326 PerlMemShared_free(trie->bitmap);
9328 PerlMemShared_free(trie->wordlen);
9330 PerlMemShared_free(trie->jump);
9332 PerlMemShared_free(trie->nextword);
9333 /* do this last!!!! */
9334 PerlMemShared_free(ri->data->data[n]);
9339 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
9342 Safefree(ri->data->what);
9349 #define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t))
9350 #define av_dup_inc(s,t) (AV*)SvREFCNT_inc(sv_dup((SV*)s,t))
9351 #define hv_dup_inc(s,t) (HV*)SvREFCNT_inc(sv_dup((SV*)s,t))
9352 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
9355 re_dup - duplicate a regexp.
9357 This routine is expected to clone a given regexp structure. It is not
9358 compiler under USE_ITHREADS.
9360 After all of the core data stored in struct regexp is duplicated
9361 the regexp_engine.dupe method is used to copy any private data
9362 stored in the *pprivate pointer. This allows extensions to handle
9363 any duplication it needs to do.
9365 See pregfree() and regfree_internal() if you change anything here.
9367 #if defined(USE_ITHREADS)
9368 #ifndef PERL_IN_XSUB_RE
9370 Perl_re_dup(pTHX_ const regexp *r, CLONE_PARAMS *param)
9377 return (REGEXP *)NULL;
9379 if ((ret = (REGEXP *)ptr_table_fetch(PL_ptr_table, r)))
9383 npar = r->nparens+1;
9384 Newx(ret, 1, regexp);
9385 StructCopy(r, ret, regexp);
9386 Newx(ret->offs, npar, regexp_paren_pair);
9387 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
9389 /* no need to copy these */
9390 Newx(ret->swap, npar, regexp_paren_pair);
9394 /* Do it this way to avoid reading from *r after the StructCopy().
9395 That way, if any of the sv_dup_inc()s dislodge *r from the L1
9396 cache, it doesn't matter. */
9397 const bool anchored = r->check_substr == r->anchored_substr;
9398 Newx(ret->substrs, 1, struct reg_substr_data);
9399 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9401 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
9402 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
9403 ret->float_substr = sv_dup_inc(ret->float_substr, param);
9404 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
9406 /* check_substr and check_utf8, if non-NULL, point to either their
9407 anchored or float namesakes, and don't hold a second reference. */
9409 if (ret->check_substr) {
9411 assert(r->check_utf8 == r->anchored_utf8);
9412 ret->check_substr = ret->anchored_substr;
9413 ret->check_utf8 = ret->anchored_utf8;
9415 assert(r->check_substr == r->float_substr);
9416 assert(r->check_utf8 == r->float_utf8);
9417 ret->check_substr = ret->float_substr;
9418 ret->check_utf8 = ret->float_utf8;
9423 RXp_WRAPPED(ret) = SAVEPVN(RXp_WRAPPED(ret), RXp_WRAPLEN(ret)+1);
9424 ret->paren_names = hv_dup_inc(ret->paren_names, param);
9427 RXi_SET(ret,CALLREGDUPE_PVT(ret,param));
9429 if (RX_MATCH_COPIED(ret))
9430 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
9433 #ifdef PERL_OLD_COPY_ON_WRITE
9434 ret->saved_copy = NULL;
9437 ret->mother_re = NULL;
9439 ret->seen_evals = 0;
9441 ptr_table_store(PL_ptr_table, r, ret);
9444 #endif /* PERL_IN_XSUB_RE */
9449 This is the internal complement to regdupe() which is used to copy
9450 the structure pointed to by the *pprivate pointer in the regexp.
9451 This is the core version of the extension overridable cloning hook.
9452 The regexp structure being duplicated will be copied by perl prior
9453 to this and will be provided as the regexp *r argument, however
9454 with the /old/ structures pprivate pointer value. Thus this routine
9455 may override any copying normally done by perl.
9457 It returns a pointer to the new regexp_internal structure.
9461 Perl_regdupe_internal(pTHX_ REGEXP * const r, CLONE_PARAMS *param)
9464 regexp_internal *reti;
9468 npar = r->nparens+1;
9471 Newxc(reti, sizeof(regexp_internal) + (len+1)*sizeof(regnode), char, regexp_internal);
9472 Copy(ri->program, reti->program, len+1, regnode);
9475 reti->regstclass = NULL;
9479 const int count = ri->data->count;
9482 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
9483 char, struct reg_data);
9484 Newx(d->what, count, U8);
9487 for (i = 0; i < count; i++) {
9488 d->what[i] = ri->data->what[i];
9489 switch (d->what[i]) {
9490 /* legal options are one of: sSfpontTu
9491 see also regcomp.h and pregfree() */
9494 case 'p': /* actually an AV, but the dup function is identical. */
9495 case 'u': /* actually an HV, but the dup function is identical. */
9496 d->data[i] = sv_dup_inc((SV *)ri->data->data[i], param);
9499 /* This is cheating. */
9500 Newx(d->data[i], 1, struct regnode_charclass_class);
9501 StructCopy(ri->data->data[i], d->data[i],
9502 struct regnode_charclass_class);
9503 reti->regstclass = (regnode*)d->data[i];
9506 /* Compiled op trees are readonly and in shared memory,
9507 and can thus be shared without duplication. */
9509 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
9513 /* Trie stclasses are readonly and can thus be shared
9514 * without duplication. We free the stclass in pregfree
9515 * when the corresponding reg_ac_data struct is freed.
9517 reti->regstclass= ri->regstclass;
9521 ((reg_trie_data*)ri->data->data[i])->refcount++;
9525 d->data[i] = ri->data->data[i];
9528 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
9537 reti->name_list_idx = ri->name_list_idx;
9539 #ifdef RE_TRACK_PATTERN_OFFSETS
9540 if (ri->u.offsets) {
9541 Newx(reti->u.offsets, 2*len+1, U32);
9542 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
9545 SetProgLen(reti,len);
9551 #endif /* USE_ITHREADS */
9556 converts a regexp embedded in a MAGIC struct to its stringified form,
9557 caching the converted form in the struct and returns the cached
9560 If lp is nonnull then it is used to return the length of the
9563 If flags is nonnull and the returned string contains UTF8 then
9564 (*flags & 1) will be true.
9566 If haseval is nonnull then it is used to return whether the pattern
9569 Normally called via macro:
9571 CALLREG_STRINGIFY(mg,&len,&utf8);
9575 CALLREG_AS_STR(mg,&lp,&flags,&haseval)
9577 See sv_2pv_flags() in sv.c for an example of internal usage.
9580 #ifndef PERL_IN_XSUB_RE
9583 Perl_reg_stringify(pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval ) {
9585 const regexp * const re = (regexp *)mg->mg_obj;
9587 *haseval = RX_SEEN_EVALS(re);
9589 *flags = ((RX_EXTFLAGS(re) & RXf_UTF8) ? 1 : 0);
9591 *lp = RX_WRAPLEN(re);
9592 return RX_WRAPPED(re);
9596 - regnext - dig the "next" pointer out of a node
9599 Perl_regnext(pTHX_ register regnode *p)
9602 register I32 offset;
9607 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
9616 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
9619 STRLEN l1 = strlen(pat1);
9620 STRLEN l2 = strlen(pat2);
9623 const char *message;
9629 Copy(pat1, buf, l1 , char);
9630 Copy(pat2, buf + l1, l2 , char);
9631 buf[l1 + l2] = '\n';
9632 buf[l1 + l2 + 1] = '\0';
9634 /* ANSI variant takes additional second argument */
9635 va_start(args, pat2);
9639 msv = vmess(buf, &args);
9641 message = SvPV_const(msv,l1);
9644 Copy(message, buf, l1 , char);
9645 buf[l1-1] = '\0'; /* Overwrite \n */
9646 Perl_croak(aTHX_ "%s", buf);
9649 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
9651 #ifndef PERL_IN_XSUB_RE
9653 Perl_save_re_context(pTHX)
9657 struct re_save_state *state;
9659 SAVEVPTR(PL_curcop);
9660 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
9662 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
9663 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
9664 SSPUSHINT(SAVEt_RE_STATE);
9666 Copy(&PL_reg_state, state, 1, struct re_save_state);
9668 PL_reg_start_tmp = 0;
9669 PL_reg_start_tmpl = 0;
9670 PL_reg_oldsaved = NULL;
9671 PL_reg_oldsavedlen = 0;
9673 PL_reg_leftiter = 0;
9674 PL_reg_poscache = NULL;
9675 PL_reg_poscache_size = 0;
9676 #ifdef PERL_OLD_COPY_ON_WRITE
9680 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
9682 const REGEXP * const rx = PM_GETRE(PL_curpm);
9685 for (i = 1; i <= RX_NPARENS(rx); i++) {
9686 char digits[TYPE_CHARS(long)];
9687 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
9688 GV *const *const gvp
9689 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
9692 GV * const gv = *gvp;
9693 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
9703 clear_re(pTHX_ void *r)
9706 ReREFCNT_dec((regexp *)r);
9712 S_put_byte(pTHX_ SV *sv, int c)
9714 /* Our definition of isPRINT() ignores locales, so only bytes that are
9715 not part of UTF-8 are considered printable. I assume that the same
9716 holds for UTF-EBCDIC.
9717 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
9718 which Wikipedia says:
9720 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
9721 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
9722 identical, to the ASCII delete (DEL) or rubout control character.
9723 ) So the old condition can be simplified to !isPRINT(c) */
9725 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
9727 const char string = c;
9728 if (c == '-' || c == ']' || c == '\\' || c == '^')
9729 sv_catpvs(sv, "\\");
9730 sv_catpvn(sv, &string, 1);
9735 #define CLEAR_OPTSTART \
9736 if (optstart) STMT_START { \
9737 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
9741 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
9743 STATIC const regnode *
9744 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
9745 const regnode *last, const regnode *plast,
9746 SV* sv, I32 indent, U32 depth)
9749 register U8 op = PSEUDO; /* Arbitrary non-END op. */
9750 register const regnode *next;
9751 const regnode *optstart= NULL;
9754 GET_RE_DEBUG_FLAGS_DECL;
9756 #ifdef DEBUG_DUMPUNTIL
9757 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
9758 last ? last-start : 0,plast ? plast-start : 0);
9761 if (plast && plast < last)
9764 while (PL_regkind[op] != END && (!last || node < last)) {
9765 /* While that wasn't END last time... */
9768 if (op == CLOSE || op == WHILEM)
9770 next = regnext((regnode *)node);
9773 if (OP(node) == OPTIMIZED) {
9774 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
9781 regprop(r, sv, node);
9782 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
9783 (int)(2*indent + 1), "", SvPVX_const(sv));
9785 if (OP(node) != OPTIMIZED) {
9786 if (next == NULL) /* Next ptr. */
9787 PerlIO_printf(Perl_debug_log, " (0)");
9788 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
9789 PerlIO_printf(Perl_debug_log, " (FAIL)");
9791 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
9792 (void)PerlIO_putc(Perl_debug_log, '\n');
9796 if (PL_regkind[(U8)op] == BRANCHJ) {
9799 register const regnode *nnode = (OP(next) == LONGJMP
9800 ? regnext((regnode *)next)
9802 if (last && nnode > last)
9804 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
9807 else if (PL_regkind[(U8)op] == BRANCH) {
9809 DUMPUNTIL(NEXTOPER(node), next);
9811 else if ( PL_regkind[(U8)op] == TRIE ) {
9812 const regnode *this_trie = node;
9813 const char op = OP(node);
9814 const U32 n = ARG(node);
9815 const reg_ac_data * const ac = op>=AHOCORASICK ?
9816 (reg_ac_data *)ri->data->data[n] :
9818 const reg_trie_data * const trie =
9819 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
9821 AV *const trie_words = (AV *) ri->data->data[n + TRIE_WORDS_OFFSET];
9823 const regnode *nextbranch= NULL;
9825 sv_setpvn(sv, "", 0);
9826 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
9827 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
9829 PerlIO_printf(Perl_debug_log, "%*s%s ",
9830 (int)(2*(indent+3)), "",
9831 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
9832 PL_colors[0], PL_colors[1],
9833 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
9834 PERL_PV_PRETTY_ELLIPSES |
9840 U16 dist= trie->jump[word_idx+1];
9841 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
9842 (UV)((dist ? this_trie + dist : next) - start));
9845 nextbranch= this_trie + trie->jump[0];
9846 DUMPUNTIL(this_trie + dist, nextbranch);
9848 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
9849 nextbranch= regnext((regnode *)nextbranch);
9851 PerlIO_printf(Perl_debug_log, "\n");
9854 if (last && next > last)
9859 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
9860 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
9861 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
9863 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
9865 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
9867 else if ( op == PLUS || op == STAR) {
9868 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
9870 else if (op == ANYOF) {
9871 /* arglen 1 + class block */
9872 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
9873 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
9874 node = NEXTOPER(node);
9876 else if (PL_regkind[(U8)op] == EXACT) {
9877 /* Literal string, where present. */
9878 node += NODE_SZ_STR(node) - 1;
9879 node = NEXTOPER(node);
9882 node = NEXTOPER(node);
9883 node += regarglen[(U8)op];
9885 if (op == CURLYX || op == OPEN)
9889 #ifdef DEBUG_DUMPUNTIL
9890 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
9895 #endif /* DEBUGGING */
9899 * c-indentation-style: bsd
9901 * indent-tabs-mode: t
9904 * ex: set ts=8 sts=4 sw=4 noet: