5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
8 /* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
37 * pregcomp and pregexec -- regsub and regerror are not used in perl
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
57 **** Alterations to Henry's code are...
59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007 by Larry Wall and others
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
71 #define PERL_IN_REGCOMP_C
74 #ifndef PERL_IN_XSUB_RE
79 #ifdef PERL_IN_XSUB_RE
90 # if defined(BUGGY_MSC6)
91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
92 # pragma optimize("a",off)
93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
94 # pragma optimize("w",on )
95 # endif /* BUGGY_MSC6 */
102 typedef struct RExC_state_t {
103 U32 flags; /* are we folding, multilining? */
104 char *precomp; /* uncompiled string. */
105 regexp *rx; /* perl core regexp structure */
106 regexp_internal *rxi; /* internal data for regexp object pprivate field */
107 char *start; /* Start of input for compile */
108 char *end; /* End of input for compile */
109 char *parse; /* Input-scan pointer. */
110 I32 whilem_seen; /* number of WHILEM in this expr */
111 regnode *emit_start; /* Start of emitted-code area */
112 regnode *emit_bound; /* First regnode outside of the allocated space */
113 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
114 I32 naughty; /* How bad is this pattern? */
115 I32 sawback; /* Did we see \1, ...? */
117 I32 size; /* Code size. */
118 I32 npar; /* Capture buffer count, (OPEN). */
119 I32 cpar; /* Capture buffer count, (CLOSE). */
120 I32 nestroot; /* root parens we are in - used by accept */
124 regnode **open_parens; /* pointers to open parens */
125 regnode **close_parens; /* pointers to close parens */
126 regnode *opend; /* END node in program */
127 I32 utf8; /* whether the pattern is utf8 or not */
128 I32 orig_utf8; /* whether the pattern was originally in utf8 */
129 /* XXX use this for future optimisation of case
130 * where pattern must be upgraded to utf8. */
131 HV *charnames; /* cache of named sequences */
132 HV *paren_names; /* Paren names */
134 regnode **recurse; /* Recurse regops */
135 I32 recurse_count; /* Number of recurse regops */
137 char *starttry; /* -Dr: where regtry was called. */
138 #define RExC_starttry (pRExC_state->starttry)
141 const char *lastparse;
143 AV *paren_name_list; /* idx -> name */
144 #define RExC_lastparse (pRExC_state->lastparse)
145 #define RExC_lastnum (pRExC_state->lastnum)
146 #define RExC_paren_name_list (pRExC_state->paren_name_list)
150 #define RExC_flags (pRExC_state->flags)
151 #define RExC_precomp (pRExC_state->precomp)
152 #define RExC_rx (pRExC_state->rx)
153 #define RExC_rxi (pRExC_state->rxi)
154 #define RExC_start (pRExC_state->start)
155 #define RExC_end (pRExC_state->end)
156 #define RExC_parse (pRExC_state->parse)
157 #define RExC_whilem_seen (pRExC_state->whilem_seen)
158 #ifdef RE_TRACK_PATTERN_OFFSETS
159 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
161 #define RExC_emit (pRExC_state->emit)
162 #define RExC_emit_start (pRExC_state->emit_start)
163 #define RExC_emit_bound (pRExC_state->emit_bound)
164 #define RExC_naughty (pRExC_state->naughty)
165 #define RExC_sawback (pRExC_state->sawback)
166 #define RExC_seen (pRExC_state->seen)
167 #define RExC_size (pRExC_state->size)
168 #define RExC_npar (pRExC_state->npar)
169 #define RExC_nestroot (pRExC_state->nestroot)
170 #define RExC_extralen (pRExC_state->extralen)
171 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
172 #define RExC_seen_evals (pRExC_state->seen_evals)
173 #define RExC_utf8 (pRExC_state->utf8)
174 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
175 #define RExC_charnames (pRExC_state->charnames)
176 #define RExC_open_parens (pRExC_state->open_parens)
177 #define RExC_close_parens (pRExC_state->close_parens)
178 #define RExC_opend (pRExC_state->opend)
179 #define RExC_paren_names (pRExC_state->paren_names)
180 #define RExC_recurse (pRExC_state->recurse)
181 #define RExC_recurse_count (pRExC_state->recurse_count)
184 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
185 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
186 ((*s) == '{' && regcurly(s)))
189 #undef SPSTART /* dratted cpp namespace... */
192 * Flags to be passed up and down.
194 #define WORST 0 /* Worst case. */
195 #define HASWIDTH 0x01 /* Known to match non-null strings. */
196 #define SIMPLE 0x02 /* Simple enough to be STAR/PLUS operand. */
197 #define SPSTART 0x04 /* Starts with * or +. */
198 #define TRYAGAIN 0x08 /* Weeded out a declaration. */
199 #define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
201 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
203 /* whether trie related optimizations are enabled */
204 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
205 #define TRIE_STUDY_OPT
206 #define FULL_TRIE_STUDY
212 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
213 #define PBITVAL(paren) (1 << ((paren) & 7))
214 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
215 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
216 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
219 /* About scan_data_t.
221 During optimisation we recurse through the regexp program performing
222 various inplace (keyhole style) optimisations. In addition study_chunk
223 and scan_commit populate this data structure with information about
224 what strings MUST appear in the pattern. We look for the longest
225 string that must appear for at a fixed location, and we look for the
226 longest string that may appear at a floating location. So for instance
231 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
232 strings (because they follow a .* construct). study_chunk will identify
233 both FOO and BAR as being the longest fixed and floating strings respectively.
235 The strings can be composites, for instance
239 will result in a composite fixed substring 'foo'.
241 For each string some basic information is maintained:
243 - offset or min_offset
244 This is the position the string must appear at, or not before.
245 It also implicitly (when combined with minlenp) tells us how many
246 character must match before the string we are searching.
247 Likewise when combined with minlenp and the length of the string
248 tells us how many characters must appear after the string we have
252 Only used for floating strings. This is the rightmost point that
253 the string can appear at. Ifset to I32 max it indicates that the
254 string can occur infinitely far to the right.
257 A pointer to the minimum length of the pattern that the string
258 was found inside. This is important as in the case of positive
259 lookahead or positive lookbehind we can have multiple patterns
264 The minimum length of the pattern overall is 3, the minimum length
265 of the lookahead part is 3, but the minimum length of the part that
266 will actually match is 1. So 'FOO's minimum length is 3, but the
267 minimum length for the F is 1. This is important as the minimum length
268 is used to determine offsets in front of and behind the string being
269 looked for. Since strings can be composites this is the length of the
270 pattern at the time it was commited with a scan_commit. Note that
271 the length is calculated by study_chunk, so that the minimum lengths
272 are not known until the full pattern has been compiled, thus the
273 pointer to the value.
277 In the case of lookbehind the string being searched for can be
278 offset past the start point of the final matching string.
279 If this value was just blithely removed from the min_offset it would
280 invalidate some of the calculations for how many chars must match
281 before or after (as they are derived from min_offset and minlen and
282 the length of the string being searched for).
283 When the final pattern is compiled and the data is moved from the
284 scan_data_t structure into the regexp structure the information
285 about lookbehind is factored in, with the information that would
286 have been lost precalculated in the end_shift field for the
289 The fields pos_min and pos_delta are used to store the minimum offset
290 and the delta to the maximum offset at the current point in the pattern.
294 typedef struct scan_data_t {
295 /*I32 len_min; unused */
296 /*I32 len_delta; unused */
300 I32 last_end; /* min value, <0 unless valid. */
303 SV **longest; /* Either &l_fixed, or &l_float. */
304 SV *longest_fixed; /* longest fixed string found in pattern */
305 I32 offset_fixed; /* offset where it starts */
306 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
307 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
308 SV *longest_float; /* longest floating string found in pattern */
309 I32 offset_float_min; /* earliest point in string it can appear */
310 I32 offset_float_max; /* latest point in string it can appear */
311 I32 *minlen_float; /* pointer to the minlen relevent to the string */
312 I32 lookbehind_float; /* is the position of the string modified by LB */
316 struct regnode_charclass_class *start_class;
320 * Forward declarations for pregcomp()'s friends.
323 static const scan_data_t zero_scan_data =
324 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
326 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
327 #define SF_BEFORE_SEOL 0x0001
328 #define SF_BEFORE_MEOL 0x0002
329 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
330 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
333 # define SF_FIX_SHIFT_EOL (0+2)
334 # define SF_FL_SHIFT_EOL (0+4)
336 # define SF_FIX_SHIFT_EOL (+2)
337 # define SF_FL_SHIFT_EOL (+4)
340 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
341 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
343 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
344 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
345 #define SF_IS_INF 0x0040
346 #define SF_HAS_PAR 0x0080
347 #define SF_IN_PAR 0x0100
348 #define SF_HAS_EVAL 0x0200
349 #define SCF_DO_SUBSTR 0x0400
350 #define SCF_DO_STCLASS_AND 0x0800
351 #define SCF_DO_STCLASS_OR 0x1000
352 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
353 #define SCF_WHILEM_VISITED_POS 0x2000
355 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
356 #define SCF_SEEN_ACCEPT 0x8000
358 #define UTF (RExC_utf8 != 0)
359 #define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0)
360 #define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0)
362 #define OOB_UNICODE 12345678
363 #define OOB_NAMEDCLASS -1
365 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
366 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
369 /* length of regex to show in messages that don't mark a position within */
370 #define RegexLengthToShowInErrorMessages 127
373 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
374 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
375 * op/pragma/warn/regcomp.
377 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
378 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
380 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
383 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
384 * arg. Show regex, up to a maximum length. If it's too long, chop and add
387 #define _FAIL(code) STMT_START { \
388 const char *ellipses = ""; \
389 IV len = RExC_end - RExC_precomp; \
392 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
393 if (len > RegexLengthToShowInErrorMessages) { \
394 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
395 len = RegexLengthToShowInErrorMessages - 10; \
401 #define FAIL(msg) _FAIL( \
402 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
403 msg, (int)len, RExC_precomp, ellipses))
405 #define FAIL2(msg,arg) _FAIL( \
406 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
407 arg, (int)len, RExC_precomp, ellipses))
410 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
412 #define Simple_vFAIL(m) STMT_START { \
413 const IV offset = RExC_parse - RExC_precomp; \
414 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
415 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
419 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
421 #define vFAIL(m) STMT_START { \
423 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
428 * Like Simple_vFAIL(), but accepts two arguments.
430 #define Simple_vFAIL2(m,a1) STMT_START { \
431 const IV offset = RExC_parse - RExC_precomp; \
432 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
433 (int)offset, RExC_precomp, RExC_precomp + offset); \
437 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
439 #define vFAIL2(m,a1) STMT_START { \
441 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
442 Simple_vFAIL2(m, a1); \
447 * Like Simple_vFAIL(), but accepts three arguments.
449 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
450 const IV offset = RExC_parse - RExC_precomp; \
451 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
452 (int)offset, RExC_precomp, RExC_precomp + offset); \
456 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
458 #define vFAIL3(m,a1,a2) STMT_START { \
460 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
461 Simple_vFAIL3(m, a1, a2); \
465 * Like Simple_vFAIL(), but accepts four arguments.
467 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
468 const IV offset = RExC_parse - RExC_precomp; \
469 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
470 (int)offset, RExC_precomp, RExC_precomp + offset); \
473 #define vWARN(loc,m) STMT_START { \
474 const IV offset = loc - RExC_precomp; \
475 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
476 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
479 #define vWARNdep(loc,m) STMT_START { \
480 const IV offset = loc - RExC_precomp; \
481 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
482 "%s" REPORT_LOCATION, \
483 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
487 #define vWARN2(loc, m, a1) STMT_START { \
488 const IV offset = loc - RExC_precomp; \
489 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
490 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
493 #define vWARN3(loc, m, a1, a2) STMT_START { \
494 const IV offset = loc - RExC_precomp; \
495 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
496 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
499 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
500 const IV offset = loc - RExC_precomp; \
501 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
502 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
505 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
506 const IV offset = loc - RExC_precomp; \
507 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
508 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
512 /* Allow for side effects in s */
513 #define REGC(c,s) STMT_START { \
514 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
517 /* Macros for recording node offsets. 20001227 mjd@plover.com
518 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
519 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
520 * Element 0 holds the number n.
521 * Position is 1 indexed.
523 #ifndef RE_TRACK_PATTERN_OFFSETS
524 #define Set_Node_Offset_To_R(node,byte)
525 #define Set_Node_Offset(node,byte)
526 #define Set_Cur_Node_Offset
527 #define Set_Node_Length_To_R(node,len)
528 #define Set_Node_Length(node,len)
529 #define Set_Node_Cur_Length(node)
530 #define Node_Offset(n)
531 #define Node_Length(n)
532 #define Set_Node_Offset_Length(node,offset,len)
533 #define ProgLen(ri) ri->u.proglen
534 #define SetProgLen(ri,x) ri->u.proglen = x
536 #define ProgLen(ri) ri->u.offsets[0]
537 #define SetProgLen(ri,x) ri->u.offsets[0] = x
538 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
540 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
541 __LINE__, (int)(node), (int)(byte))); \
543 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
545 RExC_offsets[2*(node)-1] = (byte); \
550 #define Set_Node_Offset(node,byte) \
551 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
552 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
554 #define Set_Node_Length_To_R(node,len) STMT_START { \
556 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
557 __LINE__, (int)(node), (int)(len))); \
559 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
561 RExC_offsets[2*(node)] = (len); \
566 #define Set_Node_Length(node,len) \
567 Set_Node_Length_To_R((node)-RExC_emit_start, len)
568 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
569 #define Set_Node_Cur_Length(node) \
570 Set_Node_Length(node, RExC_parse - parse_start)
572 /* Get offsets and lengths */
573 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
574 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
576 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
577 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
578 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
582 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
583 #define EXPERIMENTAL_INPLACESCAN
584 #endif /*RE_TRACK_PATTERN_OFFSETS*/
586 #define DEBUG_STUDYDATA(str,data,depth) \
587 DEBUG_OPTIMISE_MORE_r(if(data){ \
588 PerlIO_printf(Perl_debug_log, \
589 "%*s" str "Pos:%"IVdf"/%"IVdf \
590 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
591 (int)(depth)*2, "", \
592 (IV)((data)->pos_min), \
593 (IV)((data)->pos_delta), \
594 (UV)((data)->flags), \
595 (IV)((data)->whilem_c), \
596 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
597 is_inf ? "INF " : "" \
599 if ((data)->last_found) \
600 PerlIO_printf(Perl_debug_log, \
601 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
602 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
603 SvPVX_const((data)->last_found), \
604 (IV)((data)->last_end), \
605 (IV)((data)->last_start_min), \
606 (IV)((data)->last_start_max), \
607 ((data)->longest && \
608 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
609 SvPVX_const((data)->longest_fixed), \
610 (IV)((data)->offset_fixed), \
611 ((data)->longest && \
612 (data)->longest==&((data)->longest_float)) ? "*" : "", \
613 SvPVX_const((data)->longest_float), \
614 (IV)((data)->offset_float_min), \
615 (IV)((data)->offset_float_max) \
617 PerlIO_printf(Perl_debug_log,"\n"); \
620 static void clear_re(pTHX_ void *r);
622 /* Mark that we cannot extend a found fixed substring at this point.
623 Update the longest found anchored substring and the longest found
624 floating substrings if needed. */
627 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
629 const STRLEN l = CHR_SVLEN(data->last_found);
630 const STRLEN old_l = CHR_SVLEN(*data->longest);
631 GET_RE_DEBUG_FLAGS_DECL;
633 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
634 SvSetMagicSV(*data->longest, data->last_found);
635 if (*data->longest == data->longest_fixed) {
636 data->offset_fixed = l ? data->last_start_min : data->pos_min;
637 if (data->flags & SF_BEFORE_EOL)
639 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
641 data->flags &= ~SF_FIX_BEFORE_EOL;
642 data->minlen_fixed=minlenp;
643 data->lookbehind_fixed=0;
645 else { /* *data->longest == data->longest_float */
646 data->offset_float_min = l ? data->last_start_min : data->pos_min;
647 data->offset_float_max = (l
648 ? data->last_start_max
649 : data->pos_min + data->pos_delta);
650 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
651 data->offset_float_max = I32_MAX;
652 if (data->flags & SF_BEFORE_EOL)
654 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
656 data->flags &= ~SF_FL_BEFORE_EOL;
657 data->minlen_float=minlenp;
658 data->lookbehind_float=0;
661 SvCUR_set(data->last_found, 0);
663 SV * const sv = data->last_found;
664 if (SvUTF8(sv) && SvMAGICAL(sv)) {
665 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
671 data->flags &= ~SF_BEFORE_EOL;
672 DEBUG_STUDYDATA("commit: ",data,0);
675 /* Can match anything (initialization) */
677 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
679 ANYOF_CLASS_ZERO(cl);
680 ANYOF_BITMAP_SETALL(cl);
681 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
683 cl->flags |= ANYOF_LOCALE;
686 /* Can match anything (initialization) */
688 S_cl_is_anything(const struct regnode_charclass_class *cl)
692 for (value = 0; value <= ANYOF_MAX; value += 2)
693 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
695 if (!(cl->flags & ANYOF_UNICODE_ALL))
697 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
702 /* Can match anything (initialization) */
704 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
706 Zero(cl, 1, struct regnode_charclass_class);
708 cl_anything(pRExC_state, cl);
712 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
714 Zero(cl, 1, struct regnode_charclass_class);
716 cl_anything(pRExC_state, cl);
718 cl->flags |= ANYOF_LOCALE;
721 /* 'And' a given class with another one. Can create false positives */
722 /* We assume that cl is not inverted */
724 S_cl_and(struct regnode_charclass_class *cl,
725 const struct regnode_charclass_class *and_with)
728 assert(and_with->type == ANYOF);
729 if (!(and_with->flags & ANYOF_CLASS)
730 && !(cl->flags & ANYOF_CLASS)
731 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
732 && !(and_with->flags & ANYOF_FOLD)
733 && !(cl->flags & ANYOF_FOLD)) {
736 if (and_with->flags & ANYOF_INVERT)
737 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
738 cl->bitmap[i] &= ~and_with->bitmap[i];
740 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
741 cl->bitmap[i] &= and_with->bitmap[i];
742 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
743 if (!(and_with->flags & ANYOF_EOS))
744 cl->flags &= ~ANYOF_EOS;
746 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
747 !(and_with->flags & ANYOF_INVERT)) {
748 cl->flags &= ~ANYOF_UNICODE_ALL;
749 cl->flags |= ANYOF_UNICODE;
750 ARG_SET(cl, ARG(and_with));
752 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
753 !(and_with->flags & ANYOF_INVERT))
754 cl->flags &= ~ANYOF_UNICODE_ALL;
755 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
756 !(and_with->flags & ANYOF_INVERT))
757 cl->flags &= ~ANYOF_UNICODE;
760 /* 'OR' a given class with another one. Can create false positives */
761 /* We assume that cl is not inverted */
763 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
765 if (or_with->flags & ANYOF_INVERT) {
767 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
768 * <= (B1 | !B2) | (CL1 | !CL2)
769 * which is wasteful if CL2 is small, but we ignore CL2:
770 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
771 * XXXX Can we handle case-fold? Unclear:
772 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
773 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
775 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
776 && !(or_with->flags & ANYOF_FOLD)
777 && !(cl->flags & ANYOF_FOLD) ) {
780 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
781 cl->bitmap[i] |= ~or_with->bitmap[i];
782 } /* XXXX: logic is complicated otherwise */
784 cl_anything(pRExC_state, cl);
787 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
788 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
789 && (!(or_with->flags & ANYOF_FOLD)
790 || (cl->flags & ANYOF_FOLD)) ) {
793 /* OR char bitmap and class bitmap separately */
794 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
795 cl->bitmap[i] |= or_with->bitmap[i];
796 if (or_with->flags & ANYOF_CLASS) {
797 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
798 cl->classflags[i] |= or_with->classflags[i];
799 cl->flags |= ANYOF_CLASS;
802 else { /* XXXX: logic is complicated, leave it along for a moment. */
803 cl_anything(pRExC_state, cl);
806 if (or_with->flags & ANYOF_EOS)
807 cl->flags |= ANYOF_EOS;
809 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
810 ARG(cl) != ARG(or_with)) {
811 cl->flags |= ANYOF_UNICODE_ALL;
812 cl->flags &= ~ANYOF_UNICODE;
814 if (or_with->flags & ANYOF_UNICODE_ALL) {
815 cl->flags |= ANYOF_UNICODE_ALL;
816 cl->flags &= ~ANYOF_UNICODE;
820 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
821 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
822 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
823 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
828 dump_trie(trie,widecharmap,revcharmap)
829 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
830 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
832 These routines dump out a trie in a somewhat readable format.
833 The _interim_ variants are used for debugging the interim
834 tables that are used to generate the final compressed
835 representation which is what dump_trie expects.
837 Part of the reason for their existance is to provide a form
838 of documentation as to how the different representations function.
843 Dumps the final compressed table form of the trie to Perl_debug_log.
844 Used for debugging make_trie().
848 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
849 AV *revcharmap, U32 depth)
852 SV *sv=sv_newmortal();
853 int colwidth= widecharmap ? 6 : 4;
854 GET_RE_DEBUG_FLAGS_DECL;
857 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
858 (int)depth * 2 + 2,"",
859 "Match","Base","Ofs" );
861 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
862 SV ** const tmp = av_fetch( revcharmap, state, 0);
864 PerlIO_printf( Perl_debug_log, "%*s",
866 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
867 PL_colors[0], PL_colors[1],
868 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
869 PERL_PV_ESCAPE_FIRSTCHAR
874 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
875 (int)depth * 2 + 2,"");
877 for( state = 0 ; state < trie->uniquecharcount ; state++ )
878 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
879 PerlIO_printf( Perl_debug_log, "\n");
881 for( state = 1 ; state < trie->statecount ; state++ ) {
882 const U32 base = trie->states[ state ].trans.base;
884 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
886 if ( trie->states[ state ].wordnum ) {
887 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
889 PerlIO_printf( Perl_debug_log, "%6s", "" );
892 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
897 while( ( base + ofs < trie->uniquecharcount ) ||
898 ( base + ofs - trie->uniquecharcount < trie->lasttrans
899 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
902 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
904 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
905 if ( ( base + ofs >= trie->uniquecharcount ) &&
906 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
907 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
909 PerlIO_printf( Perl_debug_log, "%*"UVXf,
911 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
913 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
917 PerlIO_printf( Perl_debug_log, "]");
920 PerlIO_printf( Perl_debug_log, "\n" );
924 Dumps a fully constructed but uncompressed trie in list form.
925 List tries normally only are used for construction when the number of
926 possible chars (trie->uniquecharcount) is very high.
927 Used for debugging make_trie().
930 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
931 HV *widecharmap, AV *revcharmap, U32 next_alloc,
935 SV *sv=sv_newmortal();
936 int colwidth= widecharmap ? 6 : 4;
937 GET_RE_DEBUG_FLAGS_DECL;
938 /* print out the table precompression. */
939 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
940 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
941 "------:-----+-----------------\n" );
943 for( state=1 ; state < next_alloc ; state ++ ) {
946 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
947 (int)depth * 2 + 2,"", (UV)state );
948 if ( ! trie->states[ state ].wordnum ) {
949 PerlIO_printf( Perl_debug_log, "%5s| ","");
951 PerlIO_printf( Perl_debug_log, "W%4x| ",
952 trie->states[ state ].wordnum
955 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
956 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
958 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
960 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
961 PL_colors[0], PL_colors[1],
962 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
963 PERL_PV_ESCAPE_FIRSTCHAR
965 TRIE_LIST_ITEM(state,charid).forid,
966 (UV)TRIE_LIST_ITEM(state,charid).newstate
969 PerlIO_printf(Perl_debug_log, "\n%*s| ",
970 (int)((depth * 2) + 14), "");
973 PerlIO_printf( Perl_debug_log, "\n");
978 Dumps a fully constructed but uncompressed trie in table form.
979 This is the normal DFA style state transition table, with a few
980 twists to facilitate compression later.
981 Used for debugging make_trie().
984 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
985 HV *widecharmap, AV *revcharmap, U32 next_alloc,
990 SV *sv=sv_newmortal();
991 int colwidth= widecharmap ? 6 : 4;
992 GET_RE_DEBUG_FLAGS_DECL;
995 print out the table precompression so that we can do a visual check
996 that they are identical.
999 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1001 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1002 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1004 PerlIO_printf( Perl_debug_log, "%*s",
1006 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1007 PL_colors[0], PL_colors[1],
1008 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1009 PERL_PV_ESCAPE_FIRSTCHAR
1015 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1017 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1018 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1021 PerlIO_printf( Perl_debug_log, "\n" );
1023 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1025 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1026 (int)depth * 2 + 2,"",
1027 (UV)TRIE_NODENUM( state ) );
1029 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1030 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1032 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1034 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1036 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1037 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1039 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1040 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1047 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1048 startbranch: the first branch in the whole branch sequence
1049 first : start branch of sequence of branch-exact nodes.
1050 May be the same as startbranch
1051 last : Thing following the last branch.
1052 May be the same as tail.
1053 tail : item following the branch sequence
1054 count : words in the sequence
1055 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1056 depth : indent depth
1058 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1060 A trie is an N'ary tree where the branches are determined by digital
1061 decomposition of the key. IE, at the root node you look up the 1st character and
1062 follow that branch repeat until you find the end of the branches. Nodes can be
1063 marked as "accepting" meaning they represent a complete word. Eg:
1067 would convert into the following structure. Numbers represent states, letters
1068 following numbers represent valid transitions on the letter from that state, if
1069 the number is in square brackets it represents an accepting state, otherwise it
1070 will be in parenthesis.
1072 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1076 (1) +-i->(6)-+-s->[7]
1078 +-s->(3)-+-h->(4)-+-e->[5]
1080 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1082 This shows that when matching against the string 'hers' we will begin at state 1
1083 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1084 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1085 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1086 single traverse. We store a mapping from accepting to state to which word was
1087 matched, and then when we have multiple possibilities we try to complete the
1088 rest of the regex in the order in which they occured in the alternation.
1090 The only prior NFA like behaviour that would be changed by the TRIE support is
1091 the silent ignoring of duplicate alternations which are of the form:
1093 / (DUPE|DUPE) X? (?{ ... }) Y /x
1095 Thus EVAL blocks follwing a trie may be called a different number of times with
1096 and without the optimisation. With the optimisations dupes will be silently
1097 ignored. This inconsistant behaviour of EVAL type nodes is well established as
1098 the following demonstrates:
1100 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1102 which prints out 'word' three times, but
1104 'words'=~/(word|word|word)(?{ print $1 })S/
1106 which doesnt print it out at all. This is due to other optimisations kicking in.
1108 Example of what happens on a structural level:
1110 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1112 1: CURLYM[1] {1,32767}(18)
1123 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1124 and should turn into:
1126 1: CURLYM[1] {1,32767}(18)
1128 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1136 Cases where tail != last would be like /(?foo|bar)baz/:
1146 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1147 and would end up looking like:
1150 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1157 d = uvuni_to_utf8_flags(d, uv, 0);
1159 is the recommended Unicode-aware way of saying
1164 #define TRIE_STORE_REVCHAR \
1167 SV *zlopp = newSV(2); \
1168 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1169 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
1170 SvCUR_set(zlopp, kapow - flrbbbbb); \
1173 av_push(revcharmap, zlopp); \
1175 char ooooff = (char)uvc; \
1176 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1180 #define TRIE_READ_CHAR STMT_START { \
1184 if ( foldlen > 0 ) { \
1185 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1190 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1191 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1192 foldlen -= UNISKIP( uvc ); \
1193 scan = foldbuf + UNISKIP( uvc ); \
1196 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1206 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1207 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1208 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1209 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1211 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1212 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1213 TRIE_LIST_CUR( state )++; \
1216 #define TRIE_LIST_NEW(state) STMT_START { \
1217 Newxz( trie->states[ state ].trans.list, \
1218 4, reg_trie_trans_le ); \
1219 TRIE_LIST_CUR( state ) = 1; \
1220 TRIE_LIST_LEN( state ) = 4; \
1223 #define TRIE_HANDLE_WORD(state) STMT_START { \
1224 U16 dupe= trie->states[ state ].wordnum; \
1225 regnode * const noper_next = regnext( noper ); \
1227 if (trie->wordlen) \
1228 trie->wordlen[ curword ] = wordlen; \
1230 /* store the word for dumping */ \
1232 if (OP(noper) != NOTHING) \
1233 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
1235 tmp = newSVpvn( "", 0 ); \
1236 if ( UTF ) SvUTF8_on( tmp ); \
1237 av_push( trie_words, tmp ); \
1242 if ( noper_next < tail ) { \
1244 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1245 trie->jump[curword] = (U16)(noper_next - convert); \
1247 jumper = noper_next; \
1249 nextbranch= regnext(cur); \
1253 /* So it's a dupe. This means we need to maintain a */\
1254 /* linked-list from the first to the next. */\
1255 /* we only allocate the nextword buffer when there */\
1256 /* a dupe, so first time we have to do the allocation */\
1257 if (!trie->nextword) \
1258 trie->nextword = (U16 *) \
1259 PerlMemShared_calloc( word_count + 1, sizeof(U16)); \
1260 while ( trie->nextword[dupe] ) \
1261 dupe= trie->nextword[dupe]; \
1262 trie->nextword[dupe]= curword; \
1264 /* we haven't inserted this word yet. */ \
1265 trie->states[ state ].wordnum = curword; \
1270 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1271 ( ( base + charid >= ucharcount \
1272 && base + charid < ubound \
1273 && state == trie->trans[ base - ucharcount + charid ].check \
1274 && trie->trans[ base - ucharcount + charid ].next ) \
1275 ? trie->trans[ base - ucharcount + charid ].next \
1276 : ( state==1 ? special : 0 ) \
1280 #define MADE_JUMP_TRIE 2
1281 #define MADE_EXACT_TRIE 4
1284 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1287 /* first pass, loop through and scan words */
1288 reg_trie_data *trie;
1289 HV *widecharmap = NULL;
1290 AV *revcharmap = newAV();
1292 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1297 regnode *jumper = NULL;
1298 regnode *nextbranch = NULL;
1299 regnode *convert = NULL;
1300 /* we just use folder as a flag in utf8 */
1301 const U8 * const folder = ( flags == EXACTF
1303 : ( flags == EXACTFL
1310 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1311 AV *trie_words = NULL;
1312 /* along with revcharmap, this only used during construction but both are
1313 * useful during debugging so we store them in the struct when debugging.
1316 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1317 STRLEN trie_charcount=0;
1319 SV *re_trie_maxbuff;
1320 GET_RE_DEBUG_FLAGS_DECL;
1322 PERL_UNUSED_ARG(depth);
1325 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1327 trie->startstate = 1;
1328 trie->wordcount = word_count;
1329 RExC_rxi->data->data[ data_slot ] = (void*)trie;
1330 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1331 if (!(UTF && folder))
1332 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1334 trie_words = newAV();
1337 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1338 if (!SvIOK(re_trie_maxbuff)) {
1339 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1342 PerlIO_printf( Perl_debug_log,
1343 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1344 (int)depth * 2 + 2, "",
1345 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1346 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1350 /* Find the node we are going to overwrite */
1351 if ( first == startbranch && OP( last ) != BRANCH ) {
1352 /* whole branch chain */
1355 /* branch sub-chain */
1356 convert = NEXTOPER( first );
1359 /* -- First loop and Setup --
1361 We first traverse the branches and scan each word to determine if it
1362 contains widechars, and how many unique chars there are, this is
1363 important as we have to build a table with at least as many columns as we
1366 We use an array of integers to represent the character codes 0..255
1367 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1368 native representation of the character value as the key and IV's for the
1371 *TODO* If we keep track of how many times each character is used we can
1372 remap the columns so that the table compression later on is more
1373 efficient in terms of memory by ensuring most common value is in the
1374 middle and the least common are on the outside. IMO this would be better
1375 than a most to least common mapping as theres a decent chance the most
1376 common letter will share a node with the least common, meaning the node
1377 will not be compressable. With a middle is most common approach the worst
1378 case is when we have the least common nodes twice.
1382 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1383 regnode * const noper = NEXTOPER( cur );
1384 const U8 *uc = (U8*)STRING( noper );
1385 const U8 * const e = uc + STR_LEN( noper );
1387 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1388 const U8 *scan = (U8*)NULL;
1389 U32 wordlen = 0; /* required init */
1391 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1393 if (OP(noper) == NOTHING) {
1397 if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1398 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1399 regardless of encoding */
1401 for ( ; uc < e ; uc += len ) {
1402 TRIE_CHARCOUNT(trie)++;
1406 if ( !trie->charmap[ uvc ] ) {
1407 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1409 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1413 /* store the codepoint in the bitmap, and if its ascii
1414 also store its folded equivelent. */
1415 TRIE_BITMAP_SET(trie,uvc);
1417 /* store the folded codepoint */
1418 if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1421 /* store first byte of utf8 representation of
1422 codepoints in the 127 < uvc < 256 range */
1423 if (127 < uvc && uvc < 192) {
1424 TRIE_BITMAP_SET(trie,194);
1425 } else if (191 < uvc ) {
1426 TRIE_BITMAP_SET(trie,195);
1427 /* && uvc < 256 -- we know uvc is < 256 already */
1430 set_bit = 0; /* We've done our bit :-) */
1435 widecharmap = newHV();
1437 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1440 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1442 if ( !SvTRUE( *svpp ) ) {
1443 sv_setiv( *svpp, ++trie->uniquecharcount );
1448 if( cur == first ) {
1451 } else if (chars < trie->minlen) {
1453 } else if (chars > trie->maxlen) {
1457 } /* end first pass */
1458 DEBUG_TRIE_COMPILE_r(
1459 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1460 (int)depth * 2 + 2,"",
1461 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1462 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1463 (int)trie->minlen, (int)trie->maxlen )
1465 trie->wordlen = (U32 *) PerlMemShared_calloc( word_count, sizeof(U32) );
1468 We now know what we are dealing with in terms of unique chars and
1469 string sizes so we can calculate how much memory a naive
1470 representation using a flat table will take. If it's over a reasonable
1471 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1472 conservative but potentially much slower representation using an array
1475 At the end we convert both representations into the same compressed
1476 form that will be used in regexec.c for matching with. The latter
1477 is a form that cannot be used to construct with but has memory
1478 properties similar to the list form and access properties similar
1479 to the table form making it both suitable for fast searches and
1480 small enough that its feasable to store for the duration of a program.
1482 See the comment in the code where the compressed table is produced
1483 inplace from the flat tabe representation for an explanation of how
1484 the compression works.
1489 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1491 Second Pass -- Array Of Lists Representation
1493 Each state will be represented by a list of charid:state records
1494 (reg_trie_trans_le) the first such element holds the CUR and LEN
1495 points of the allocated array. (See defines above).
1497 We build the initial structure using the lists, and then convert
1498 it into the compressed table form which allows faster lookups
1499 (but cant be modified once converted).
1502 STRLEN transcount = 1;
1504 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1505 "%*sCompiling trie using list compiler\n",
1506 (int)depth * 2 + 2, ""));
1508 trie->states = (reg_trie_state *)
1509 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1510 sizeof(reg_trie_state) );
1514 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1516 regnode * const noper = NEXTOPER( cur );
1517 U8 *uc = (U8*)STRING( noper );
1518 const U8 * const e = uc + STR_LEN( noper );
1519 U32 state = 1; /* required init */
1520 U16 charid = 0; /* sanity init */
1521 U8 *scan = (U8*)NULL; /* sanity init */
1522 STRLEN foldlen = 0; /* required init */
1523 U32 wordlen = 0; /* required init */
1524 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1526 if (OP(noper) != NOTHING) {
1527 for ( ; uc < e ; uc += len ) {
1532 charid = trie->charmap[ uvc ];
1534 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1538 charid=(U16)SvIV( *svpp );
1541 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1548 if ( !trie->states[ state ].trans.list ) {
1549 TRIE_LIST_NEW( state );
1551 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1552 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1553 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1558 newstate = next_alloc++;
1559 TRIE_LIST_PUSH( state, charid, newstate );
1564 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1568 TRIE_HANDLE_WORD(state);
1570 } /* end second pass */
1572 /* next alloc is the NEXT state to be allocated */
1573 trie->statecount = next_alloc;
1574 trie->states = (reg_trie_state *)
1575 PerlMemShared_realloc( trie->states,
1577 * sizeof(reg_trie_state) );
1579 /* and now dump it out before we compress it */
1580 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1581 revcharmap, next_alloc,
1585 trie->trans = (reg_trie_trans *)
1586 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1593 for( state=1 ; state < next_alloc ; state ++ ) {
1597 DEBUG_TRIE_COMPILE_MORE_r(
1598 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1602 if (trie->states[state].trans.list) {
1603 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1607 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1608 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1609 if ( forid < minid ) {
1611 } else if ( forid > maxid ) {
1615 if ( transcount < tp + maxid - minid + 1) {
1617 trie->trans = (reg_trie_trans *)
1618 PerlMemShared_realloc( trie->trans,
1620 * sizeof(reg_trie_trans) );
1621 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1623 base = trie->uniquecharcount + tp - minid;
1624 if ( maxid == minid ) {
1626 for ( ; zp < tp ; zp++ ) {
1627 if ( ! trie->trans[ zp ].next ) {
1628 base = trie->uniquecharcount + zp - minid;
1629 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1630 trie->trans[ zp ].check = state;
1636 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1637 trie->trans[ tp ].check = state;
1642 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1643 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1644 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1645 trie->trans[ tid ].check = state;
1647 tp += ( maxid - minid + 1 );
1649 Safefree(trie->states[ state ].trans.list);
1652 DEBUG_TRIE_COMPILE_MORE_r(
1653 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1656 trie->states[ state ].trans.base=base;
1658 trie->lasttrans = tp + 1;
1662 Second Pass -- Flat Table Representation.
1664 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1665 We know that we will need Charcount+1 trans at most to store the data
1666 (one row per char at worst case) So we preallocate both structures
1667 assuming worst case.
1669 We then construct the trie using only the .next slots of the entry
1672 We use the .check field of the first entry of the node temporarily to
1673 make compression both faster and easier by keeping track of how many non
1674 zero fields are in the node.
1676 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1679 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1680 number representing the first entry of the node, and state as a
1681 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1682 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1683 are 2 entrys per node. eg:
1691 The table is internally in the right hand, idx form. However as we also
1692 have to deal with the states array which is indexed by nodenum we have to
1693 use TRIE_NODENUM() to convert.
1696 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1697 "%*sCompiling trie using table compiler\n",
1698 (int)depth * 2 + 2, ""));
1700 trie->trans = (reg_trie_trans *)
1701 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1702 * trie->uniquecharcount + 1,
1703 sizeof(reg_trie_trans) );
1704 trie->states = (reg_trie_state *)
1705 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1706 sizeof(reg_trie_state) );
1707 next_alloc = trie->uniquecharcount + 1;
1710 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1712 regnode * const noper = NEXTOPER( cur );
1713 const U8 *uc = (U8*)STRING( noper );
1714 const U8 * const e = uc + STR_LEN( noper );
1716 U32 state = 1; /* required init */
1718 U16 charid = 0; /* sanity init */
1719 U32 accept_state = 0; /* sanity init */
1720 U8 *scan = (U8*)NULL; /* sanity init */
1722 STRLEN foldlen = 0; /* required init */
1723 U32 wordlen = 0; /* required init */
1724 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1726 if ( OP(noper) != NOTHING ) {
1727 for ( ; uc < e ; uc += len ) {
1732 charid = trie->charmap[ uvc ];
1734 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1735 charid = svpp ? (U16)SvIV(*svpp) : 0;
1739 if ( !trie->trans[ state + charid ].next ) {
1740 trie->trans[ state + charid ].next = next_alloc;
1741 trie->trans[ state ].check++;
1742 next_alloc += trie->uniquecharcount;
1744 state = trie->trans[ state + charid ].next;
1746 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1748 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1751 accept_state = TRIE_NODENUM( state );
1752 TRIE_HANDLE_WORD(accept_state);
1754 } /* end second pass */
1756 /* and now dump it out before we compress it */
1757 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1759 next_alloc, depth+1));
1763 * Inplace compress the table.*
1765 For sparse data sets the table constructed by the trie algorithm will
1766 be mostly 0/FAIL transitions or to put it another way mostly empty.
1767 (Note that leaf nodes will not contain any transitions.)
1769 This algorithm compresses the tables by eliminating most such
1770 transitions, at the cost of a modest bit of extra work during lookup:
1772 - Each states[] entry contains a .base field which indicates the
1773 index in the state[] array wheres its transition data is stored.
1775 - If .base is 0 there are no valid transitions from that node.
1777 - If .base is nonzero then charid is added to it to find an entry in
1780 -If trans[states[state].base+charid].check!=state then the
1781 transition is taken to be a 0/Fail transition. Thus if there are fail
1782 transitions at the front of the node then the .base offset will point
1783 somewhere inside the previous nodes data (or maybe even into a node
1784 even earlier), but the .check field determines if the transition is
1788 The following process inplace converts the table to the compressed
1789 table: We first do not compress the root node 1,and mark its all its
1790 .check pointers as 1 and set its .base pointer as 1 as well. This
1791 allows to do a DFA construction from the compressed table later, and
1792 ensures that any .base pointers we calculate later are greater than
1795 - We set 'pos' to indicate the first entry of the second node.
1797 - We then iterate over the columns of the node, finding the first and
1798 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1799 and set the .check pointers accordingly, and advance pos
1800 appropriately and repreat for the next node. Note that when we copy
1801 the next pointers we have to convert them from the original
1802 NODEIDX form to NODENUM form as the former is not valid post
1805 - If a node has no transitions used we mark its base as 0 and do not
1806 advance the pos pointer.
1808 - If a node only has one transition we use a second pointer into the
1809 structure to fill in allocated fail transitions from other states.
1810 This pointer is independent of the main pointer and scans forward
1811 looking for null transitions that are allocated to a state. When it
1812 finds one it writes the single transition into the "hole". If the
1813 pointer doesnt find one the single transition is appended as normal.
1815 - Once compressed we can Renew/realloc the structures to release the
1818 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1819 specifically Fig 3.47 and the associated pseudocode.
1823 const U32 laststate = TRIE_NODENUM( next_alloc );
1826 trie->statecount = laststate;
1828 for ( state = 1 ; state < laststate ; state++ ) {
1830 const U32 stateidx = TRIE_NODEIDX( state );
1831 const U32 o_used = trie->trans[ stateidx ].check;
1832 U32 used = trie->trans[ stateidx ].check;
1833 trie->trans[ stateidx ].check = 0;
1835 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1836 if ( flag || trie->trans[ stateidx + charid ].next ) {
1837 if ( trie->trans[ stateidx + charid ].next ) {
1839 for ( ; zp < pos ; zp++ ) {
1840 if ( ! trie->trans[ zp ].next ) {
1844 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1845 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1846 trie->trans[ zp ].check = state;
1847 if ( ++zp > pos ) pos = zp;
1854 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1856 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1857 trie->trans[ pos ].check = state;
1862 trie->lasttrans = pos + 1;
1863 trie->states = (reg_trie_state *)
1864 PerlMemShared_realloc( trie->states, laststate
1865 * sizeof(reg_trie_state) );
1866 DEBUG_TRIE_COMPILE_MORE_r(
1867 PerlIO_printf( Perl_debug_log,
1868 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1869 (int)depth * 2 + 2,"",
1870 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1873 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1876 } /* end table compress */
1878 DEBUG_TRIE_COMPILE_MORE_r(
1879 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1880 (int)depth * 2 + 2, "",
1881 (UV)trie->statecount,
1882 (UV)trie->lasttrans)
1884 /* resize the trans array to remove unused space */
1885 trie->trans = (reg_trie_trans *)
1886 PerlMemShared_realloc( trie->trans, trie->lasttrans
1887 * sizeof(reg_trie_trans) );
1889 /* and now dump out the compressed format */
1890 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
1892 { /* Modify the program and insert the new TRIE node*/
1893 U8 nodetype =(U8)(flags & 0xFF);
1897 regnode *optimize = NULL;
1898 #ifdef RE_TRACK_PATTERN_OFFSETS
1901 U32 mjd_nodelen = 0;
1902 #endif /* RE_TRACK_PATTERN_OFFSETS */
1903 #endif /* DEBUGGING */
1905 This means we convert either the first branch or the first Exact,
1906 depending on whether the thing following (in 'last') is a branch
1907 or not and whther first is the startbranch (ie is it a sub part of
1908 the alternation or is it the whole thing.)
1909 Assuming its a sub part we conver the EXACT otherwise we convert
1910 the whole branch sequence, including the first.
1912 /* Find the node we are going to overwrite */
1913 if ( first != startbranch || OP( last ) == BRANCH ) {
1914 /* branch sub-chain */
1915 NEXT_OFF( first ) = (U16)(last - first);
1916 #ifdef RE_TRACK_PATTERN_OFFSETS
1918 mjd_offset= Node_Offset((convert));
1919 mjd_nodelen= Node_Length((convert));
1922 /* whole branch chain */
1924 #ifdef RE_TRACK_PATTERN_OFFSETS
1927 const regnode *nop = NEXTOPER( convert );
1928 mjd_offset= Node_Offset((nop));
1929 mjd_nodelen= Node_Length((nop));
1933 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1934 (int)depth * 2 + 2, "",
1935 (UV)mjd_offset, (UV)mjd_nodelen)
1938 /* But first we check to see if there is a common prefix we can
1939 split out as an EXACT and put in front of the TRIE node. */
1940 trie->startstate= 1;
1941 if ( trie->bitmap && !widecharmap && !trie->jump ) {
1943 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
1947 const U32 base = trie->states[ state ].trans.base;
1949 if ( trie->states[state].wordnum )
1952 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1953 if ( ( base + ofs >= trie->uniquecharcount ) &&
1954 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1955 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1957 if ( ++count > 1 ) {
1958 SV **tmp = av_fetch( revcharmap, ofs, 0);
1959 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
1960 if ( state == 1 ) break;
1962 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1964 PerlIO_printf(Perl_debug_log,
1965 "%*sNew Start State=%"UVuf" Class: [",
1966 (int)depth * 2 + 2, "",
1969 SV ** const tmp = av_fetch( revcharmap, idx, 0);
1970 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
1972 TRIE_BITMAP_SET(trie,*ch);
1974 TRIE_BITMAP_SET(trie, folder[ *ch ]);
1976 PerlIO_printf(Perl_debug_log, (char*)ch)
1980 TRIE_BITMAP_SET(trie,*ch);
1982 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1983 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1989 SV **tmp = av_fetch( revcharmap, idx, 0);
1991 char *ch = SvPV( *tmp, len );
1993 SV *sv=sv_newmortal();
1994 PerlIO_printf( Perl_debug_log,
1995 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1996 (int)depth * 2 + 2, "",
1998 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
1999 PL_colors[0], PL_colors[1],
2000 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2001 PERL_PV_ESCAPE_FIRSTCHAR
2006 OP( convert ) = nodetype;
2007 str=STRING(convert);
2010 STR_LEN(convert) += len;
2016 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2022 regnode *n = convert+NODE_SZ_STR(convert);
2023 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2024 trie->startstate = state;
2025 trie->minlen -= (state - 1);
2026 trie->maxlen -= (state - 1);
2028 /* At least the UNICOS C compiler choked on this
2029 * being argument to DEBUG_r(), so let's just have
2032 #ifdef PERL_EXT_RE_BUILD
2038 regnode *fix = convert;
2039 U32 word = trie->wordcount;
2041 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2042 while( ++fix < n ) {
2043 Set_Node_Offset_Length(fix, 0, 0);
2046 SV ** const tmp = av_fetch( trie_words, word, 0 );
2048 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2049 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2051 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2059 NEXT_OFF(convert) = (U16)(tail - convert);
2060 DEBUG_r(optimize= n);
2066 if ( trie->maxlen ) {
2067 NEXT_OFF( convert ) = (U16)(tail - convert);
2068 ARG_SET( convert, data_slot );
2069 /* Store the offset to the first unabsorbed branch in
2070 jump[0], which is otherwise unused by the jump logic.
2071 We use this when dumping a trie and during optimisation. */
2073 trie->jump[0] = (U16)(nextbranch - convert);
2076 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
2077 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2079 OP( convert ) = TRIEC;
2080 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2081 PerlMemShared_free(trie->bitmap);
2084 OP( convert ) = TRIE;
2086 /* store the type in the flags */
2087 convert->flags = nodetype;
2091 + regarglen[ OP( convert ) ];
2093 /* XXX We really should free up the resource in trie now,
2094 as we won't use them - (which resources?) dmq */
2096 /* needed for dumping*/
2097 DEBUG_r(if (optimize) {
2098 regnode *opt = convert;
2100 while ( ++opt < optimize) {
2101 Set_Node_Offset_Length(opt,0,0);
2104 Try to clean up some of the debris left after the
2107 while( optimize < jumper ) {
2108 mjd_nodelen += Node_Length((optimize));
2109 OP( optimize ) = OPTIMIZED;
2110 Set_Node_Offset_Length(optimize,0,0);
2113 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2115 } /* end node insert */
2116 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2118 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2119 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2121 SvREFCNT_dec(revcharmap);
2125 : trie->startstate>1
2131 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2133 /* The Trie is constructed and compressed now so we can build a fail array now if its needed
2135 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2136 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2139 We find the fail state for each state in the trie, this state is the longest proper
2140 suffix of the current states 'word' that is also a proper prefix of another word in our
2141 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2142 the DFA not to have to restart after its tried and failed a word at a given point, it
2143 simply continues as though it had been matching the other word in the first place.
2145 'abcdgu'=~/abcdefg|cdgu/
2146 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2147 fail, which would bring use to the state representing 'd' in the second word where we would
2148 try 'g' and succeed, prodceding to match 'cdgu'.
2150 /* add a fail transition */
2151 const U32 trie_offset = ARG(source);
2152 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2154 const U32 ucharcount = trie->uniquecharcount;
2155 const U32 numstates = trie->statecount;
2156 const U32 ubound = trie->lasttrans + ucharcount;
2160 U32 base = trie->states[ 1 ].trans.base;
2163 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2164 GET_RE_DEBUG_FLAGS_DECL;
2166 PERL_UNUSED_ARG(depth);
2170 ARG_SET( stclass, data_slot );
2171 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2172 RExC_rxi->data->data[ data_slot ] = (void*)aho;
2173 aho->trie=trie_offset;
2174 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2175 Copy( trie->states, aho->states, numstates, reg_trie_state );
2176 Newxz( q, numstates, U32);
2177 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2180 /* initialize fail[0..1] to be 1 so that we always have
2181 a valid final fail state */
2182 fail[ 0 ] = fail[ 1 ] = 1;
2184 for ( charid = 0; charid < ucharcount ; charid++ ) {
2185 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2187 q[ q_write ] = newstate;
2188 /* set to point at the root */
2189 fail[ q[ q_write++ ] ]=1;
2192 while ( q_read < q_write) {
2193 const U32 cur = q[ q_read++ % numstates ];
2194 base = trie->states[ cur ].trans.base;
2196 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2197 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2199 U32 fail_state = cur;
2202 fail_state = fail[ fail_state ];
2203 fail_base = aho->states[ fail_state ].trans.base;
2204 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2206 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2207 fail[ ch_state ] = fail_state;
2208 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2210 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2212 q[ q_write++ % numstates] = ch_state;
2216 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2217 when we fail in state 1, this allows us to use the
2218 charclass scan to find a valid start char. This is based on the principle
2219 that theres a good chance the string being searched contains lots of stuff
2220 that cant be a start char.
2222 fail[ 0 ] = fail[ 1 ] = 0;
2223 DEBUG_TRIE_COMPILE_r({
2224 PerlIO_printf(Perl_debug_log,
2225 "%*sStclass Failtable (%"UVuf" states): 0",
2226 (int)(depth * 2), "", (UV)numstates
2228 for( q_read=1; q_read<numstates; q_read++ ) {
2229 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2231 PerlIO_printf(Perl_debug_log, "\n");
2234 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2239 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2240 * These need to be revisited when a newer toolchain becomes available.
2242 #if defined(__sparc64__) && defined(__GNUC__)
2243 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2244 # undef SPARC64_GCC_WORKAROUND
2245 # define SPARC64_GCC_WORKAROUND 1
2249 #define DEBUG_PEEP(str,scan,depth) \
2250 DEBUG_OPTIMISE_r({if (scan){ \
2251 SV * const mysv=sv_newmortal(); \
2252 regnode *Next = regnext(scan); \
2253 regprop(RExC_rx, mysv, scan); \
2254 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2255 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2256 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2263 #define JOIN_EXACT(scan,min,flags) \
2264 if (PL_regkind[OP(scan)] == EXACT) \
2265 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2268 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2269 /* Merge several consecutive EXACTish nodes into one. */
2270 regnode *n = regnext(scan);
2272 regnode *next = scan + NODE_SZ_STR(scan);
2276 regnode *stop = scan;
2277 GET_RE_DEBUG_FLAGS_DECL;
2279 PERL_UNUSED_ARG(depth);
2281 #ifndef EXPERIMENTAL_INPLACESCAN
2282 PERL_UNUSED_ARG(flags);
2283 PERL_UNUSED_ARG(val);
2285 DEBUG_PEEP("join",scan,depth);
2287 /* Skip NOTHING, merge EXACT*. */
2289 ( PL_regkind[OP(n)] == NOTHING ||
2290 (stringok && (OP(n) == OP(scan))))
2292 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2294 if (OP(n) == TAIL || n > next)
2296 if (PL_regkind[OP(n)] == NOTHING) {
2297 DEBUG_PEEP("skip:",n,depth);
2298 NEXT_OFF(scan) += NEXT_OFF(n);
2299 next = n + NODE_STEP_REGNODE;
2306 else if (stringok) {
2307 const unsigned int oldl = STR_LEN(scan);
2308 regnode * const nnext = regnext(n);
2310 DEBUG_PEEP("merg",n,depth);
2313 if (oldl + STR_LEN(n) > U8_MAX)
2315 NEXT_OFF(scan) += NEXT_OFF(n);
2316 STR_LEN(scan) += STR_LEN(n);
2317 next = n + NODE_SZ_STR(n);
2318 /* Now we can overwrite *n : */
2319 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2327 #ifdef EXPERIMENTAL_INPLACESCAN
2328 if (flags && !NEXT_OFF(n)) {
2329 DEBUG_PEEP("atch", val, depth);
2330 if (reg_off_by_arg[OP(n)]) {
2331 ARG_SET(n, val - n);
2334 NEXT_OFF(n) = val - n;
2341 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2343 Two problematic code points in Unicode casefolding of EXACT nodes:
2345 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2346 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2352 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2353 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2355 This means that in case-insensitive matching (or "loose matching",
2356 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2357 length of the above casefolded versions) can match a target string
2358 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2359 This would rather mess up the minimum length computation.
2361 What we'll do is to look for the tail four bytes, and then peek
2362 at the preceding two bytes to see whether we need to decrease
2363 the minimum length by four (six minus two).
2365 Thanks to the design of UTF-8, there cannot be false matches:
2366 A sequence of valid UTF-8 bytes cannot be a subsequence of
2367 another valid sequence of UTF-8 bytes.
2370 char * const s0 = STRING(scan), *s, *t;
2371 char * const s1 = s0 + STR_LEN(scan) - 1;
2372 char * const s2 = s1 - 4;
2373 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2374 const char t0[] = "\xaf\x49\xaf\x42";
2376 const char t0[] = "\xcc\x88\xcc\x81";
2378 const char * const t1 = t0 + 3;
2381 s < s2 && (t = ninstr(s, s1, t0, t1));
2384 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2385 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2387 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2388 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2396 n = scan + NODE_SZ_STR(scan);
2398 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2405 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2409 /* REx optimizer. Converts nodes into quickier variants "in place".
2410 Finds fixed substrings. */
2412 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2413 to the position after last scanned or to NULL. */
2415 #define INIT_AND_WITHP \
2416 assert(!and_withp); \
2417 Newx(and_withp,1,struct regnode_charclass_class); \
2418 SAVEFREEPV(and_withp)
2420 /* this is a chain of data about sub patterns we are processing that
2421 need to be handled seperately/specially in study_chunk. Its so
2422 we can simulate recursion without losing state. */
2424 typedef struct scan_frame {
2425 regnode *last; /* last node to process in this frame */
2426 regnode *next; /* next node to process when last is reached */
2427 struct scan_frame *prev; /*previous frame*/
2428 I32 stop; /* what stopparen do we use */
2432 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2434 #define CASE_SYNST_FNC(nAmE) \
2436 if (flags & SCF_DO_STCLASS_AND) { \
2437 for (value = 0; value < 256; value++) \
2438 if (!is_ ## nAmE ## _cp(value)) \
2439 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2442 for (value = 0; value < 256; value++) \
2443 if (is_ ## nAmE ## _cp(value)) \
2444 ANYOF_BITMAP_SET(data->start_class, value); \
2448 if (flags & SCF_DO_STCLASS_AND) { \
2449 for (value = 0; value < 256; value++) \
2450 if (is_ ## nAmE ## _cp(value)) \
2451 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2454 for (value = 0; value < 256; value++) \
2455 if (!is_ ## nAmE ## _cp(value)) \
2456 ANYOF_BITMAP_SET(data->start_class, value); \
2463 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2464 I32 *minlenp, I32 *deltap,
2469 struct regnode_charclass_class *and_withp,
2470 U32 flags, U32 depth)
2471 /* scanp: Start here (read-write). */
2472 /* deltap: Write maxlen-minlen here. */
2473 /* last: Stop before this one. */
2474 /* data: string data about the pattern */
2475 /* stopparen: treat close N as END */
2476 /* recursed: which subroutines have we recursed into */
2477 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2480 I32 min = 0, pars = 0, code;
2481 regnode *scan = *scanp, *next;
2483 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2484 int is_inf_internal = 0; /* The studied chunk is infinite */
2485 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2486 scan_data_t data_fake;
2487 SV *re_trie_maxbuff = NULL;
2488 regnode *first_non_open = scan;
2489 I32 stopmin = I32_MAX;
2490 scan_frame *frame = NULL;
2492 GET_RE_DEBUG_FLAGS_DECL;
2495 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2499 while (first_non_open && OP(first_non_open) == OPEN)
2500 first_non_open=regnext(first_non_open);
2505 while ( scan && OP(scan) != END && scan < last ){
2506 /* Peephole optimizer: */
2507 DEBUG_STUDYDATA("Peep:", data,depth);
2508 DEBUG_PEEP("Peep",scan,depth);
2509 JOIN_EXACT(scan,&min,0);
2511 /* Follow the next-chain of the current node and optimize
2512 away all the NOTHINGs from it. */
2513 if (OP(scan) != CURLYX) {
2514 const int max = (reg_off_by_arg[OP(scan)]
2516 /* I32 may be smaller than U16 on CRAYs! */
2517 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2518 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2522 /* Skip NOTHING and LONGJMP. */
2523 while ((n = regnext(n))
2524 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2525 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2526 && off + noff < max)
2528 if (reg_off_by_arg[OP(scan)])
2531 NEXT_OFF(scan) = off;
2536 /* The principal pseudo-switch. Cannot be a switch, since we
2537 look into several different things. */
2538 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2539 || OP(scan) == IFTHEN) {
2540 next = regnext(scan);
2542 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2544 if (OP(next) == code || code == IFTHEN) {
2545 /* NOTE - There is similar code to this block below for handling
2546 TRIE nodes on a re-study. If you change stuff here check there
2548 I32 max1 = 0, min1 = I32_MAX, num = 0;
2549 struct regnode_charclass_class accum;
2550 regnode * const startbranch=scan;
2552 if (flags & SCF_DO_SUBSTR)
2553 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2554 if (flags & SCF_DO_STCLASS)
2555 cl_init_zero(pRExC_state, &accum);
2557 while (OP(scan) == code) {
2558 I32 deltanext, minnext, f = 0, fake;
2559 struct regnode_charclass_class this_class;
2562 data_fake.flags = 0;
2564 data_fake.whilem_c = data->whilem_c;
2565 data_fake.last_closep = data->last_closep;
2568 data_fake.last_closep = &fake;
2570 data_fake.pos_delta = delta;
2571 next = regnext(scan);
2572 scan = NEXTOPER(scan);
2574 scan = NEXTOPER(scan);
2575 if (flags & SCF_DO_STCLASS) {
2576 cl_init(pRExC_state, &this_class);
2577 data_fake.start_class = &this_class;
2578 f = SCF_DO_STCLASS_AND;
2580 if (flags & SCF_WHILEM_VISITED_POS)
2581 f |= SCF_WHILEM_VISITED_POS;
2583 /* we suppose the run is continuous, last=next...*/
2584 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2586 stopparen, recursed, NULL, f,depth+1);
2589 if (max1 < minnext + deltanext)
2590 max1 = minnext + deltanext;
2591 if (deltanext == I32_MAX)
2592 is_inf = is_inf_internal = 1;
2594 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2596 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2597 if ( stopmin > minnext)
2598 stopmin = min + min1;
2599 flags &= ~SCF_DO_SUBSTR;
2601 data->flags |= SCF_SEEN_ACCEPT;
2604 if (data_fake.flags & SF_HAS_EVAL)
2605 data->flags |= SF_HAS_EVAL;
2606 data->whilem_c = data_fake.whilem_c;
2608 if (flags & SCF_DO_STCLASS)
2609 cl_or(pRExC_state, &accum, &this_class);
2611 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2613 if (flags & SCF_DO_SUBSTR) {
2614 data->pos_min += min1;
2615 data->pos_delta += max1 - min1;
2616 if (max1 != min1 || is_inf)
2617 data->longest = &(data->longest_float);
2620 delta += max1 - min1;
2621 if (flags & SCF_DO_STCLASS_OR) {
2622 cl_or(pRExC_state, data->start_class, &accum);
2624 cl_and(data->start_class, and_withp);
2625 flags &= ~SCF_DO_STCLASS;
2628 else if (flags & SCF_DO_STCLASS_AND) {
2630 cl_and(data->start_class, &accum);
2631 flags &= ~SCF_DO_STCLASS;
2634 /* Switch to OR mode: cache the old value of
2635 * data->start_class */
2637 StructCopy(data->start_class, and_withp,
2638 struct regnode_charclass_class);
2639 flags &= ~SCF_DO_STCLASS_AND;
2640 StructCopy(&accum, data->start_class,
2641 struct regnode_charclass_class);
2642 flags |= SCF_DO_STCLASS_OR;
2643 data->start_class->flags |= ANYOF_EOS;
2647 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2650 Assuming this was/is a branch we are dealing with: 'scan' now
2651 points at the item that follows the branch sequence, whatever
2652 it is. We now start at the beginning of the sequence and look
2659 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2661 If we can find such a subseqence we need to turn the first
2662 element into a trie and then add the subsequent branch exact
2663 strings to the trie.
2667 1. patterns where the whole set of branch can be converted.
2669 2. patterns where only a subset can be converted.
2671 In case 1 we can replace the whole set with a single regop
2672 for the trie. In case 2 we need to keep the start and end
2675 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2676 becomes BRANCH TRIE; BRANCH X;
2678 There is an additional case, that being where there is a
2679 common prefix, which gets split out into an EXACT like node
2680 preceding the TRIE node.
2682 If x(1..n)==tail then we can do a simple trie, if not we make
2683 a "jump" trie, such that when we match the appropriate word
2684 we "jump" to the appopriate tail node. Essentailly we turn
2685 a nested if into a case structure of sorts.
2690 if (!re_trie_maxbuff) {
2691 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2692 if (!SvIOK(re_trie_maxbuff))
2693 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2695 if ( SvIV(re_trie_maxbuff)>=0 ) {
2697 regnode *first = (regnode *)NULL;
2698 regnode *last = (regnode *)NULL;
2699 regnode *tail = scan;
2704 SV * const mysv = sv_newmortal(); /* for dumping */
2706 /* var tail is used because there may be a TAIL
2707 regop in the way. Ie, the exacts will point to the
2708 thing following the TAIL, but the last branch will
2709 point at the TAIL. So we advance tail. If we
2710 have nested (?:) we may have to move through several
2714 while ( OP( tail ) == TAIL ) {
2715 /* this is the TAIL generated by (?:) */
2716 tail = regnext( tail );
2721 regprop(RExC_rx, mysv, tail );
2722 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2723 (int)depth * 2 + 2, "",
2724 "Looking for TRIE'able sequences. Tail node is: ",
2725 SvPV_nolen_const( mysv )
2731 step through the branches, cur represents each
2732 branch, noper is the first thing to be matched
2733 as part of that branch and noper_next is the
2734 regnext() of that node. if noper is an EXACT
2735 and noper_next is the same as scan (our current
2736 position in the regex) then the EXACT branch is
2737 a possible optimization target. Once we have
2738 two or more consequetive such branches we can
2739 create a trie of the EXACT's contents and stich
2740 it in place. If the sequence represents all of
2741 the branches we eliminate the whole thing and
2742 replace it with a single TRIE. If it is a
2743 subsequence then we need to stitch it in. This
2744 means the first branch has to remain, and needs
2745 to be repointed at the item on the branch chain
2746 following the last branch optimized. This could
2747 be either a BRANCH, in which case the
2748 subsequence is internal, or it could be the
2749 item following the branch sequence in which
2750 case the subsequence is at the end.
2754 /* dont use tail as the end marker for this traverse */
2755 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2756 regnode * const noper = NEXTOPER( cur );
2757 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
2758 regnode * const noper_next = regnext( noper );
2762 regprop(RExC_rx, mysv, cur);
2763 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2764 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2766 regprop(RExC_rx, mysv, noper);
2767 PerlIO_printf( Perl_debug_log, " -> %s",
2768 SvPV_nolen_const(mysv));
2771 regprop(RExC_rx, mysv, noper_next );
2772 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2773 SvPV_nolen_const(mysv));
2775 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2776 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2778 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2779 : PL_regkind[ OP( noper ) ] == EXACT )
2780 || OP(noper) == NOTHING )
2782 && noper_next == tail
2787 if ( !first || optype == NOTHING ) {
2788 if (!first) first = cur;
2789 optype = OP( noper );
2795 Currently we assume that the trie can handle unicode and ascii
2796 matches fold cased matches. If this proves true then the following
2797 define will prevent tries in this situation.
2799 #define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
2801 #define TRIE_TYPE_IS_SAFE 1
2802 if ( last && TRIE_TYPE_IS_SAFE ) {
2803 make_trie( pRExC_state,
2804 startbranch, first, cur, tail, count,
2807 if ( PL_regkind[ OP( noper ) ] == EXACT
2809 && noper_next == tail
2814 optype = OP( noper );
2824 regprop(RExC_rx, mysv, cur);
2825 PerlIO_printf( Perl_debug_log,
2826 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2827 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2831 if ( last && TRIE_TYPE_IS_SAFE ) {
2832 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
2833 #ifdef TRIE_STUDY_OPT
2834 if ( ((made == MADE_EXACT_TRIE &&
2835 startbranch == first)
2836 || ( first_non_open == first )) &&
2838 flags |= SCF_TRIE_RESTUDY;
2839 if ( startbranch == first
2842 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2852 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2853 scan = NEXTOPER(NEXTOPER(scan));
2854 } else /* single branch is optimized. */
2855 scan = NEXTOPER(scan);
2857 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2858 scan_frame *newframe = NULL;
2863 if (OP(scan) != SUSPEND) {
2864 /* set the pointer */
2865 if (OP(scan) == GOSUB) {
2867 RExC_recurse[ARG2L(scan)] = scan;
2868 start = RExC_open_parens[paren-1];
2869 end = RExC_close_parens[paren-1];
2872 start = RExC_rxi->program + 1;
2876 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2877 SAVEFREEPV(recursed);
2879 if (!PAREN_TEST(recursed,paren+1)) {
2880 PAREN_SET(recursed,paren+1);
2881 Newx(newframe,1,scan_frame);
2883 if (flags & SCF_DO_SUBSTR) {
2884 SCAN_COMMIT(pRExC_state,data,minlenp);
2885 data->longest = &(data->longest_float);
2887 is_inf = is_inf_internal = 1;
2888 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2889 cl_anything(pRExC_state, data->start_class);
2890 flags &= ~SCF_DO_STCLASS;
2893 Newx(newframe,1,scan_frame);
2896 end = regnext(scan);
2901 SAVEFREEPV(newframe);
2902 newframe->next = regnext(scan);
2903 newframe->last = last;
2904 newframe->stop = stopparen;
2905 newframe->prev = frame;
2915 else if (OP(scan) == EXACT) {
2916 I32 l = STR_LEN(scan);
2919 const U8 * const s = (U8*)STRING(scan);
2920 l = utf8_length(s, s + l);
2921 uc = utf8_to_uvchr(s, NULL);
2923 uc = *((U8*)STRING(scan));
2926 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2927 /* The code below prefers earlier match for fixed
2928 offset, later match for variable offset. */
2929 if (data->last_end == -1) { /* Update the start info. */
2930 data->last_start_min = data->pos_min;
2931 data->last_start_max = is_inf
2932 ? I32_MAX : data->pos_min + data->pos_delta;
2934 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2936 SvUTF8_on(data->last_found);
2938 SV * const sv = data->last_found;
2939 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2940 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2941 if (mg && mg->mg_len >= 0)
2942 mg->mg_len += utf8_length((U8*)STRING(scan),
2943 (U8*)STRING(scan)+STR_LEN(scan));
2945 data->last_end = data->pos_min + l;
2946 data->pos_min += l; /* As in the first entry. */
2947 data->flags &= ~SF_BEFORE_EOL;
2949 if (flags & SCF_DO_STCLASS_AND) {
2950 /* Check whether it is compatible with what we know already! */
2954 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2955 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2956 && (!(data->start_class->flags & ANYOF_FOLD)
2957 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2960 ANYOF_CLASS_ZERO(data->start_class);
2961 ANYOF_BITMAP_ZERO(data->start_class);
2963 ANYOF_BITMAP_SET(data->start_class, uc);
2964 data->start_class->flags &= ~ANYOF_EOS;
2966 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2968 else if (flags & SCF_DO_STCLASS_OR) {
2969 /* false positive possible if the class is case-folded */
2971 ANYOF_BITMAP_SET(data->start_class, uc);
2973 data->start_class->flags |= ANYOF_UNICODE_ALL;
2974 data->start_class->flags &= ~ANYOF_EOS;
2975 cl_and(data->start_class, and_withp);
2977 flags &= ~SCF_DO_STCLASS;
2979 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
2980 I32 l = STR_LEN(scan);
2981 UV uc = *((U8*)STRING(scan));
2983 /* Search for fixed substrings supports EXACT only. */
2984 if (flags & SCF_DO_SUBSTR) {
2986 SCAN_COMMIT(pRExC_state, data, minlenp);
2989 const U8 * const s = (U8 *)STRING(scan);
2990 l = utf8_length(s, s + l);
2991 uc = utf8_to_uvchr(s, NULL);
2994 if (flags & SCF_DO_SUBSTR)
2996 if (flags & SCF_DO_STCLASS_AND) {
2997 /* Check whether it is compatible with what we know already! */
3001 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3002 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3003 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3005 ANYOF_CLASS_ZERO(data->start_class);
3006 ANYOF_BITMAP_ZERO(data->start_class);
3008 ANYOF_BITMAP_SET(data->start_class, uc);
3009 data->start_class->flags &= ~ANYOF_EOS;
3010 data->start_class->flags |= ANYOF_FOLD;
3011 if (OP(scan) == EXACTFL)
3012 data->start_class->flags |= ANYOF_LOCALE;
3015 else if (flags & SCF_DO_STCLASS_OR) {
3016 if (data->start_class->flags & ANYOF_FOLD) {
3017 /* false positive possible if the class is case-folded.
3018 Assume that the locale settings are the same... */
3020 ANYOF_BITMAP_SET(data->start_class, uc);
3021 data->start_class->flags &= ~ANYOF_EOS;
3023 cl_and(data->start_class, and_withp);
3025 flags &= ~SCF_DO_STCLASS;
3027 else if (strchr((const char*)PL_varies,OP(scan))) {
3028 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3029 I32 f = flags, pos_before = 0;
3030 regnode * const oscan = scan;
3031 struct regnode_charclass_class this_class;
3032 struct regnode_charclass_class *oclass = NULL;
3033 I32 next_is_eval = 0;
3035 switch (PL_regkind[OP(scan)]) {
3036 case WHILEM: /* End of (?:...)* . */
3037 scan = NEXTOPER(scan);
3040 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3041 next = NEXTOPER(scan);
3042 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3044 maxcount = REG_INFTY;
3045 next = regnext(scan);
3046 scan = NEXTOPER(scan);
3050 if (flags & SCF_DO_SUBSTR)
3055 if (flags & SCF_DO_STCLASS) {
3057 maxcount = REG_INFTY;
3058 next = regnext(scan);
3059 scan = NEXTOPER(scan);
3062 is_inf = is_inf_internal = 1;
3063 scan = regnext(scan);
3064 if (flags & SCF_DO_SUBSTR) {
3065 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3066 data->longest = &(data->longest_float);
3068 goto optimize_curly_tail;
3070 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3071 && (scan->flags == stopparen))
3076 mincount = ARG1(scan);
3077 maxcount = ARG2(scan);
3079 next = regnext(scan);
3080 if (OP(scan) == CURLYX) {
3081 I32 lp = (data ? *(data->last_closep) : 0);
3082 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3084 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3085 next_is_eval = (OP(scan) == EVAL);
3087 if (flags & SCF_DO_SUBSTR) {
3088 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3089 pos_before = data->pos_min;
3093 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3095 data->flags |= SF_IS_INF;
3097 if (flags & SCF_DO_STCLASS) {
3098 cl_init(pRExC_state, &this_class);
3099 oclass = data->start_class;
3100 data->start_class = &this_class;
3101 f |= SCF_DO_STCLASS_AND;
3102 f &= ~SCF_DO_STCLASS_OR;
3104 /* These are the cases when once a subexpression
3105 fails at a particular position, it cannot succeed
3106 even after backtracking at the enclosing scope.
3108 XXXX what if minimal match and we are at the
3109 initial run of {n,m}? */
3110 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
3111 f &= ~SCF_WHILEM_VISITED_POS;
3113 /* This will finish on WHILEM, setting scan, or on NULL: */
3114 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3115 last, data, stopparen, recursed, NULL,
3117 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3119 if (flags & SCF_DO_STCLASS)
3120 data->start_class = oclass;
3121 if (mincount == 0 || minnext == 0) {
3122 if (flags & SCF_DO_STCLASS_OR) {
3123 cl_or(pRExC_state, data->start_class, &this_class);
3125 else if (flags & SCF_DO_STCLASS_AND) {
3126 /* Switch to OR mode: cache the old value of
3127 * data->start_class */
3129 StructCopy(data->start_class, and_withp,
3130 struct regnode_charclass_class);
3131 flags &= ~SCF_DO_STCLASS_AND;
3132 StructCopy(&this_class, data->start_class,
3133 struct regnode_charclass_class);
3134 flags |= SCF_DO_STCLASS_OR;
3135 data->start_class->flags |= ANYOF_EOS;
3137 } else { /* Non-zero len */
3138 if (flags & SCF_DO_STCLASS_OR) {
3139 cl_or(pRExC_state, data->start_class, &this_class);
3140 cl_and(data->start_class, and_withp);
3142 else if (flags & SCF_DO_STCLASS_AND)
3143 cl_and(data->start_class, &this_class);
3144 flags &= ~SCF_DO_STCLASS;
3146 if (!scan) /* It was not CURLYX, but CURLY. */
3148 if ( /* ? quantifier ok, except for (?{ ... }) */
3149 (next_is_eval || !(mincount == 0 && maxcount == 1))
3150 && (minnext == 0) && (deltanext == 0)
3151 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3152 && maxcount <= REG_INFTY/3 /* Complement check for big count */
3153 && ckWARN(WARN_REGEXP))
3156 "Quantifier unexpected on zero-length expression");
3159 min += minnext * mincount;
3160 is_inf_internal |= ((maxcount == REG_INFTY
3161 && (minnext + deltanext) > 0)
3162 || deltanext == I32_MAX);
3163 is_inf |= is_inf_internal;
3164 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3166 /* Try powerful optimization CURLYX => CURLYN. */
3167 if ( OP(oscan) == CURLYX && data
3168 && data->flags & SF_IN_PAR
3169 && !(data->flags & SF_HAS_EVAL)
3170 && !deltanext && minnext == 1 ) {
3171 /* Try to optimize to CURLYN. */
3172 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3173 regnode * const nxt1 = nxt;
3180 if (!strchr((const char*)PL_simple,OP(nxt))
3181 && !(PL_regkind[OP(nxt)] == EXACT
3182 && STR_LEN(nxt) == 1))
3188 if (OP(nxt) != CLOSE)
3190 if (RExC_open_parens) {
3191 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3192 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3194 /* Now we know that nxt2 is the only contents: */
3195 oscan->flags = (U8)ARG(nxt);
3197 OP(nxt1) = NOTHING; /* was OPEN. */
3200 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3201 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
3202 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
3203 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3204 OP(nxt + 1) = OPTIMIZED; /* was count. */
3205 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
3210 /* Try optimization CURLYX => CURLYM. */
3211 if ( OP(oscan) == CURLYX && data
3212 && !(data->flags & SF_HAS_PAR)
3213 && !(data->flags & SF_HAS_EVAL)
3214 && !deltanext /* atom is fixed width */
3215 && minnext != 0 /* CURLYM can't handle zero width */
3217 /* XXXX How to optimize if data == 0? */
3218 /* Optimize to a simpler form. */
3219 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3223 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3224 && (OP(nxt2) != WHILEM))
3226 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3227 /* Need to optimize away parenths. */
3228 if (data->flags & SF_IN_PAR) {
3229 /* Set the parenth number. */
3230 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3232 if (OP(nxt) != CLOSE)
3233 FAIL("Panic opt close");
3234 oscan->flags = (U8)ARG(nxt);
3235 if (RExC_open_parens) {
3236 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3237 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3239 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3240 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3243 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3244 OP(nxt + 1) = OPTIMIZED; /* was count. */
3245 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3246 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
3249 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3250 regnode *nnxt = regnext(nxt1);
3253 if (reg_off_by_arg[OP(nxt1)])
3254 ARG_SET(nxt1, nxt2 - nxt1);
3255 else if (nxt2 - nxt1 < U16_MAX)
3256 NEXT_OFF(nxt1) = nxt2 - nxt1;
3258 OP(nxt) = NOTHING; /* Cannot beautify */
3263 /* Optimize again: */
3264 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3265 NULL, stopparen, recursed, NULL, 0,depth+1);
3270 else if ((OP(oscan) == CURLYX)
3271 && (flags & SCF_WHILEM_VISITED_POS)
3272 /* See the comment on a similar expression above.
3273 However, this time it not a subexpression
3274 we care about, but the expression itself. */
3275 && (maxcount == REG_INFTY)
3276 && data && ++data->whilem_c < 16) {
3277 /* This stays as CURLYX, we can put the count/of pair. */
3278 /* Find WHILEM (as in regexec.c) */
3279 regnode *nxt = oscan + NEXT_OFF(oscan);
3281 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3283 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3284 | (RExC_whilem_seen << 4)); /* On WHILEM */
3286 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3288 if (flags & SCF_DO_SUBSTR) {
3289 SV *last_str = NULL;
3290 int counted = mincount != 0;
3292 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3293 #if defined(SPARC64_GCC_WORKAROUND)
3296 const char *s = NULL;
3299 if (pos_before >= data->last_start_min)
3302 b = data->last_start_min;
3305 s = SvPV_const(data->last_found, l);
3306 old = b - data->last_start_min;
3309 I32 b = pos_before >= data->last_start_min
3310 ? pos_before : data->last_start_min;
3312 const char * const s = SvPV_const(data->last_found, l);
3313 I32 old = b - data->last_start_min;
3317 old = utf8_hop((U8*)s, old) - (U8*)s;
3320 /* Get the added string: */
3321 last_str = newSVpvn(s + old, l);
3323 SvUTF8_on(last_str);
3324 if (deltanext == 0 && pos_before == b) {
3325 /* What was added is a constant string */
3327 SvGROW(last_str, (mincount * l) + 1);
3328 repeatcpy(SvPVX(last_str) + l,
3329 SvPVX_const(last_str), l, mincount - 1);
3330 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3331 /* Add additional parts. */
3332 SvCUR_set(data->last_found,
3333 SvCUR(data->last_found) - l);
3334 sv_catsv(data->last_found, last_str);
3336 SV * sv = data->last_found;
3338 SvUTF8(sv) && SvMAGICAL(sv) ?
3339 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3340 if (mg && mg->mg_len >= 0)
3341 mg->mg_len += CHR_SVLEN(last_str) - l;
3343 data->last_end += l * (mincount - 1);
3346 /* start offset must point into the last copy */
3347 data->last_start_min += minnext * (mincount - 1);
3348 data->last_start_max += is_inf ? I32_MAX
3349 : (maxcount - 1) * (minnext + data->pos_delta);
3352 /* It is counted once already... */
3353 data->pos_min += minnext * (mincount - counted);
3354 data->pos_delta += - counted * deltanext +
3355 (minnext + deltanext) * maxcount - minnext * mincount;
3356 if (mincount != maxcount) {
3357 /* Cannot extend fixed substrings found inside
3359 SCAN_COMMIT(pRExC_state,data,minlenp);
3360 if (mincount && last_str) {
3361 SV * const sv = data->last_found;
3362 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3363 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3367 sv_setsv(sv, last_str);
3368 data->last_end = data->pos_min;
3369 data->last_start_min =
3370 data->pos_min - CHR_SVLEN(last_str);
3371 data->last_start_max = is_inf
3373 : data->pos_min + data->pos_delta
3374 - CHR_SVLEN(last_str);
3376 data->longest = &(data->longest_float);
3378 SvREFCNT_dec(last_str);
3380 if (data && (fl & SF_HAS_EVAL))
3381 data->flags |= SF_HAS_EVAL;
3382 optimize_curly_tail:
3383 if (OP(oscan) != CURLYX) {
3384 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3386 NEXT_OFF(oscan) += NEXT_OFF(next);
3389 default: /* REF and CLUMP only? */
3390 if (flags & SCF_DO_SUBSTR) {
3391 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3392 data->longest = &(data->longest_float);
3394 is_inf = is_inf_internal = 1;
3395 if (flags & SCF_DO_STCLASS_OR)
3396 cl_anything(pRExC_state, data->start_class);
3397 flags &= ~SCF_DO_STCLASS;
3401 else if (OP(scan) == LNBREAK) {
3402 if (flags & SCF_DO_STCLASS) {
3404 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3405 if (flags & SCF_DO_STCLASS_AND) {
3406 for (value = 0; value < 256; value++)
3407 if (!is_VERTWS_cp(value))
3408 ANYOF_BITMAP_CLEAR(data->start_class, value);
3411 for (value = 0; value < 256; value++)
3412 if (is_VERTWS_cp(value))
3413 ANYOF_BITMAP_SET(data->start_class, value);
3415 if (flags & SCF_DO_STCLASS_OR)
3416 cl_and(data->start_class, and_withp);
3417 flags &= ~SCF_DO_STCLASS;
3421 if (flags & SCF_DO_SUBSTR) {
3422 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3424 data->pos_delta += 1;
3425 data->longest = &(data->longest_float);
3429 else if (OP(scan) == FOLDCHAR) {
3430 int d = ARG(scan)==0xDF ? 1 : 2;
3431 flags &= ~SCF_DO_STCLASS;
3434 if (flags & SCF_DO_SUBSTR) {
3435 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3437 data->pos_delta += d;
3438 data->longest = &(data->longest_float);
3441 else if (strchr((const char*)PL_simple,OP(scan))) {
3444 if (flags & SCF_DO_SUBSTR) {
3445 SCAN_COMMIT(pRExC_state,data,minlenp);
3449 if (flags & SCF_DO_STCLASS) {
3450 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3452 /* Some of the logic below assumes that switching
3453 locale on will only add false positives. */
3454 switch (PL_regkind[OP(scan)]) {
3458 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3459 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3460 cl_anything(pRExC_state, data->start_class);
3463 if (OP(scan) == SANY)
3465 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3466 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3467 || (data->start_class->flags & ANYOF_CLASS));
3468 cl_anything(pRExC_state, data->start_class);
3470 if (flags & SCF_DO_STCLASS_AND || !value)
3471 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3474 if (flags & SCF_DO_STCLASS_AND)
3475 cl_and(data->start_class,
3476 (struct regnode_charclass_class*)scan);
3478 cl_or(pRExC_state, data->start_class,
3479 (struct regnode_charclass_class*)scan);
3482 if (flags & SCF_DO_STCLASS_AND) {
3483 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3484 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3485 for (value = 0; value < 256; value++)
3486 if (!isALNUM(value))
3487 ANYOF_BITMAP_CLEAR(data->start_class, value);
3491 if (data->start_class->flags & ANYOF_LOCALE)
3492 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3494 for (value = 0; value < 256; value++)
3496 ANYOF_BITMAP_SET(data->start_class, value);
3501 if (flags & SCF_DO_STCLASS_AND) {
3502 if (data->start_class->flags & ANYOF_LOCALE)
3503 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3506 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3507 data->start_class->flags |= ANYOF_LOCALE;
3511 if (flags & SCF_DO_STCLASS_AND) {
3512 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3513 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3514 for (value = 0; value < 256; value++)
3516 ANYOF_BITMAP_CLEAR(data->start_class, value);
3520 if (data->start_class->flags & ANYOF_LOCALE)
3521 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3523 for (value = 0; value < 256; value++)
3524 if (!isALNUM(value))
3525 ANYOF_BITMAP_SET(data->start_class, value);
3530 if (flags & SCF_DO_STCLASS_AND) {
3531 if (data->start_class->flags & ANYOF_LOCALE)
3532 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3535 data->start_class->flags |= ANYOF_LOCALE;
3536 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3540 if (flags & SCF_DO_STCLASS_AND) {
3541 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3542 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3543 for (value = 0; value < 256; value++)
3544 if (!isSPACE(value))
3545 ANYOF_BITMAP_CLEAR(data->start_class, value);
3549 if (data->start_class->flags & ANYOF_LOCALE)
3550 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3552 for (value = 0; value < 256; value++)
3554 ANYOF_BITMAP_SET(data->start_class, value);
3559 if (flags & SCF_DO_STCLASS_AND) {
3560 if (data->start_class->flags & ANYOF_LOCALE)
3561 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3564 data->start_class->flags |= ANYOF_LOCALE;
3565 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3569 if (flags & SCF_DO_STCLASS_AND) {
3570 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3571 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3572 for (value = 0; value < 256; value++)
3574 ANYOF_BITMAP_CLEAR(data->start_class, value);
3578 if (data->start_class->flags & ANYOF_LOCALE)
3579 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3581 for (value = 0; value < 256; value++)
3582 if (!isSPACE(value))
3583 ANYOF_BITMAP_SET(data->start_class, value);
3588 if (flags & SCF_DO_STCLASS_AND) {
3589 if (data->start_class->flags & ANYOF_LOCALE) {
3590 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3591 for (value = 0; value < 256; value++)
3592 if (!isSPACE(value))
3593 ANYOF_BITMAP_CLEAR(data->start_class, value);
3597 data->start_class->flags |= ANYOF_LOCALE;
3598 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3602 if (flags & SCF_DO_STCLASS_AND) {
3603 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3604 for (value = 0; value < 256; value++)
3605 if (!isDIGIT(value))
3606 ANYOF_BITMAP_CLEAR(data->start_class, value);
3609 if (data->start_class->flags & ANYOF_LOCALE)
3610 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3612 for (value = 0; value < 256; value++)
3614 ANYOF_BITMAP_SET(data->start_class, value);
3619 if (flags & SCF_DO_STCLASS_AND) {
3620 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3621 for (value = 0; value < 256; value++)
3623 ANYOF_BITMAP_CLEAR(data->start_class, value);
3626 if (data->start_class->flags & ANYOF_LOCALE)
3627 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3629 for (value = 0; value < 256; value++)
3630 if (!isDIGIT(value))
3631 ANYOF_BITMAP_SET(data->start_class, value);
3635 CASE_SYNST_FNC(VERTWS);
3636 CASE_SYNST_FNC(HORIZWS);
3639 if (flags & SCF_DO_STCLASS_OR)
3640 cl_and(data->start_class, and_withp);
3641 flags &= ~SCF_DO_STCLASS;
3644 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3645 data->flags |= (OP(scan) == MEOL
3649 else if ( PL_regkind[OP(scan)] == BRANCHJ
3650 /* Lookbehind, or need to calculate parens/evals/stclass: */
3651 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3652 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3653 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3654 || OP(scan) == UNLESSM )
3656 /* Negative Lookahead/lookbehind
3657 In this case we can't do fixed string optimisation.
3660 I32 deltanext, minnext, fake = 0;
3662 struct regnode_charclass_class intrnl;
3665 data_fake.flags = 0;
3667 data_fake.whilem_c = data->whilem_c;
3668 data_fake.last_closep = data->last_closep;
3671 data_fake.last_closep = &fake;
3672 data_fake.pos_delta = delta;
3673 if ( flags & SCF_DO_STCLASS && !scan->flags
3674 && OP(scan) == IFMATCH ) { /* Lookahead */
3675 cl_init(pRExC_state, &intrnl);
3676 data_fake.start_class = &intrnl;
3677 f |= SCF_DO_STCLASS_AND;
3679 if (flags & SCF_WHILEM_VISITED_POS)
3680 f |= SCF_WHILEM_VISITED_POS;
3681 next = regnext(scan);
3682 nscan = NEXTOPER(NEXTOPER(scan));
3683 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3684 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3687 FAIL("Variable length lookbehind not implemented");
3689 else if (minnext > (I32)U8_MAX) {
3690 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3692 scan->flags = (U8)minnext;
3695 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3697 if (data_fake.flags & SF_HAS_EVAL)
3698 data->flags |= SF_HAS_EVAL;
3699 data->whilem_c = data_fake.whilem_c;
3701 if (f & SCF_DO_STCLASS_AND) {
3702 const int was = (data->start_class->flags & ANYOF_EOS);
3704 cl_and(data->start_class, &intrnl);
3706 data->start_class->flags |= ANYOF_EOS;
3709 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3711 /* Positive Lookahead/lookbehind
3712 In this case we can do fixed string optimisation,
3713 but we must be careful about it. Note in the case of
3714 lookbehind the positions will be offset by the minimum
3715 length of the pattern, something we won't know about
3716 until after the recurse.
3718 I32 deltanext, fake = 0;
3720 struct regnode_charclass_class intrnl;
3722 /* We use SAVEFREEPV so that when the full compile
3723 is finished perl will clean up the allocated
3724 minlens when its all done. This was we don't
3725 have to worry about freeing them when we know
3726 they wont be used, which would be a pain.
3729 Newx( minnextp, 1, I32 );
3730 SAVEFREEPV(minnextp);
3733 StructCopy(data, &data_fake, scan_data_t);
3734 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3737 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
3738 data_fake.last_found=newSVsv(data->last_found);
3742 data_fake.last_closep = &fake;
3743 data_fake.flags = 0;
3744 data_fake.pos_delta = delta;
3746 data_fake.flags |= SF_IS_INF;
3747 if ( flags & SCF_DO_STCLASS && !scan->flags
3748 && OP(scan) == IFMATCH ) { /* Lookahead */
3749 cl_init(pRExC_state, &intrnl);
3750 data_fake.start_class = &intrnl;
3751 f |= SCF_DO_STCLASS_AND;
3753 if (flags & SCF_WHILEM_VISITED_POS)
3754 f |= SCF_WHILEM_VISITED_POS;
3755 next = regnext(scan);
3756 nscan = NEXTOPER(NEXTOPER(scan));
3758 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3759 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3762 FAIL("Variable length lookbehind not implemented");
3764 else if (*minnextp > (I32)U8_MAX) {
3765 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3767 scan->flags = (U8)*minnextp;
3772 if (f & SCF_DO_STCLASS_AND) {
3773 const int was = (data->start_class->flags & ANYOF_EOS);
3775 cl_and(data->start_class, &intrnl);
3777 data->start_class->flags |= ANYOF_EOS;
3780 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3782 if (data_fake.flags & SF_HAS_EVAL)
3783 data->flags |= SF_HAS_EVAL;
3784 data->whilem_c = data_fake.whilem_c;
3785 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3786 if (RExC_rx->minlen<*minnextp)
3787 RExC_rx->minlen=*minnextp;
3788 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
3789 SvREFCNT_dec(data_fake.last_found);
3791 if ( data_fake.minlen_fixed != minlenp )
3793 data->offset_fixed= data_fake.offset_fixed;
3794 data->minlen_fixed= data_fake.minlen_fixed;
3795 data->lookbehind_fixed+= scan->flags;
3797 if ( data_fake.minlen_float != minlenp )
3799 data->minlen_float= data_fake.minlen_float;
3800 data->offset_float_min=data_fake.offset_float_min;
3801 data->offset_float_max=data_fake.offset_float_max;
3802 data->lookbehind_float+= scan->flags;
3811 else if (OP(scan) == OPEN) {
3812 if (stopparen != (I32)ARG(scan))
3815 else if (OP(scan) == CLOSE) {
3816 if (stopparen == (I32)ARG(scan)) {
3819 if ((I32)ARG(scan) == is_par) {
3820 next = regnext(scan);
3822 if ( next && (OP(next) != WHILEM) && next < last)
3823 is_par = 0; /* Disable optimization */
3826 *(data->last_closep) = ARG(scan);
3828 else if (OP(scan) == EVAL) {
3830 data->flags |= SF_HAS_EVAL;
3832 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3833 if (flags & SCF_DO_SUBSTR) {
3834 SCAN_COMMIT(pRExC_state,data,minlenp);
3835 flags &= ~SCF_DO_SUBSTR;
3837 if (data && OP(scan)==ACCEPT) {
3838 data->flags |= SCF_SEEN_ACCEPT;
3843 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3845 if (flags & SCF_DO_SUBSTR) {
3846 SCAN_COMMIT(pRExC_state,data,minlenp);
3847 data->longest = &(data->longest_float);
3849 is_inf = is_inf_internal = 1;
3850 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3851 cl_anything(pRExC_state, data->start_class);
3852 flags &= ~SCF_DO_STCLASS;
3854 else if (OP(scan) == GPOS) {
3855 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
3856 !(delta || is_inf || (data && data->pos_delta)))
3858 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
3859 RExC_rx->extflags |= RXf_ANCH_GPOS;
3860 if (RExC_rx->gofs < (U32)min)
3861 RExC_rx->gofs = min;
3863 RExC_rx->extflags |= RXf_GPOS_FLOAT;
3867 #ifdef TRIE_STUDY_OPT
3868 #ifdef FULL_TRIE_STUDY
3869 else if (PL_regkind[OP(scan)] == TRIE) {
3870 /* NOTE - There is similar code to this block above for handling
3871 BRANCH nodes on the initial study. If you change stuff here
3873 regnode *trie_node= scan;
3874 regnode *tail= regnext(scan);
3875 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
3876 I32 max1 = 0, min1 = I32_MAX;
3877 struct regnode_charclass_class accum;
3879 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
3880 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
3881 if (flags & SCF_DO_STCLASS)
3882 cl_init_zero(pRExC_state, &accum);
3888 const regnode *nextbranch= NULL;
3891 for ( word=1 ; word <= trie->wordcount ; word++)
3893 I32 deltanext=0, minnext=0, f = 0, fake;
3894 struct regnode_charclass_class this_class;
3896 data_fake.flags = 0;
3898 data_fake.whilem_c = data->whilem_c;
3899 data_fake.last_closep = data->last_closep;
3902 data_fake.last_closep = &fake;
3903 data_fake.pos_delta = delta;
3904 if (flags & SCF_DO_STCLASS) {
3905 cl_init(pRExC_state, &this_class);
3906 data_fake.start_class = &this_class;
3907 f = SCF_DO_STCLASS_AND;
3909 if (flags & SCF_WHILEM_VISITED_POS)
3910 f |= SCF_WHILEM_VISITED_POS;
3912 if (trie->jump[word]) {
3914 nextbranch = trie_node + trie->jump[0];
3915 scan= trie_node + trie->jump[word];
3916 /* We go from the jump point to the branch that follows
3917 it. Note this means we need the vestigal unused branches
3918 even though they arent otherwise used.
3920 minnext = study_chunk(pRExC_state, &scan, minlenp,
3921 &deltanext, (regnode *)nextbranch, &data_fake,
3922 stopparen, recursed, NULL, f,depth+1);
3924 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3925 nextbranch= regnext((regnode*)nextbranch);
3927 if (min1 > (I32)(minnext + trie->minlen))
3928 min1 = minnext + trie->minlen;
3929 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3930 max1 = minnext + deltanext + trie->maxlen;
3931 if (deltanext == I32_MAX)
3932 is_inf = is_inf_internal = 1;
3934 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3936 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3937 if ( stopmin > min + min1)
3938 stopmin = min + min1;
3939 flags &= ~SCF_DO_SUBSTR;
3941 data->flags |= SCF_SEEN_ACCEPT;
3944 if (data_fake.flags & SF_HAS_EVAL)
3945 data->flags |= SF_HAS_EVAL;
3946 data->whilem_c = data_fake.whilem_c;
3948 if (flags & SCF_DO_STCLASS)
3949 cl_or(pRExC_state, &accum, &this_class);
3952 if (flags & SCF_DO_SUBSTR) {
3953 data->pos_min += min1;
3954 data->pos_delta += max1 - min1;
3955 if (max1 != min1 || is_inf)
3956 data->longest = &(data->longest_float);
3959 delta += max1 - min1;
3960 if (flags & SCF_DO_STCLASS_OR) {
3961 cl_or(pRExC_state, data->start_class, &accum);
3963 cl_and(data->start_class, and_withp);
3964 flags &= ~SCF_DO_STCLASS;
3967 else if (flags & SCF_DO_STCLASS_AND) {
3969 cl_and(data->start_class, &accum);
3970 flags &= ~SCF_DO_STCLASS;
3973 /* Switch to OR mode: cache the old value of
3974 * data->start_class */
3976 StructCopy(data->start_class, and_withp,
3977 struct regnode_charclass_class);
3978 flags &= ~SCF_DO_STCLASS_AND;
3979 StructCopy(&accum, data->start_class,
3980 struct regnode_charclass_class);
3981 flags |= SCF_DO_STCLASS_OR;
3982 data->start_class->flags |= ANYOF_EOS;
3989 else if (PL_regkind[OP(scan)] == TRIE) {
3990 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
3993 min += trie->minlen;
3994 delta += (trie->maxlen - trie->minlen);
3995 flags &= ~SCF_DO_STCLASS; /* xxx */
3996 if (flags & SCF_DO_SUBSTR) {
3997 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3998 data->pos_min += trie->minlen;
3999 data->pos_delta += (trie->maxlen - trie->minlen);
4000 if (trie->maxlen != trie->minlen)
4001 data->longest = &(data->longest_float);
4003 if (trie->jump) /* no more substrings -- for now /grr*/
4004 flags &= ~SCF_DO_SUBSTR;
4006 #endif /* old or new */
4007 #endif /* TRIE_STUDY_OPT */
4009 /* Else: zero-length, ignore. */
4010 scan = regnext(scan);
4015 stopparen = frame->stop;
4016 frame = frame->prev;
4017 goto fake_study_recurse;
4022 DEBUG_STUDYDATA("pre-fin:",data,depth);
4025 *deltap = is_inf_internal ? I32_MAX : delta;
4026 if (flags & SCF_DO_SUBSTR && is_inf)
4027 data->pos_delta = I32_MAX - data->pos_min;
4028 if (is_par > (I32)U8_MAX)
4030 if (is_par && pars==1 && data) {
4031 data->flags |= SF_IN_PAR;
4032 data->flags &= ~SF_HAS_PAR;
4034 else if (pars && data) {
4035 data->flags |= SF_HAS_PAR;
4036 data->flags &= ~SF_IN_PAR;
4038 if (flags & SCF_DO_STCLASS_OR)
4039 cl_and(data->start_class, and_withp);
4040 if (flags & SCF_TRIE_RESTUDY)
4041 data->flags |= SCF_TRIE_RESTUDY;
4043 DEBUG_STUDYDATA("post-fin:",data,depth);
4045 return min < stopmin ? min : stopmin;
4049 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4051 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4053 Renewc(RExC_rxi->data,
4054 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4055 char, struct reg_data);
4057 Renew(RExC_rxi->data->what, count + n, U8);
4059 Newx(RExC_rxi->data->what, n, U8);
4060 RExC_rxi->data->count = count + n;
4061 Copy(s, RExC_rxi->data->what + count, n, U8);
4065 /*XXX: todo make this not included in a non debugging perl */
4066 #ifndef PERL_IN_XSUB_RE
4068 Perl_reginitcolors(pTHX)
4071 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4073 char *t = savepv(s);
4077 t = strchr(t, '\t');
4083 PL_colors[i] = t = (char *)"";
4088 PL_colors[i++] = (char *)"";
4095 #ifdef TRIE_STUDY_OPT
4096 #define CHECK_RESTUDY_GOTO \
4098 (data.flags & SCF_TRIE_RESTUDY) \
4102 #define CHECK_RESTUDY_GOTO
4106 - pregcomp - compile a regular expression into internal code
4108 * We can't allocate space until we know how big the compiled form will be,
4109 * but we can't compile it (and thus know how big it is) until we've got a
4110 * place to put the code. So we cheat: we compile it twice, once with code
4111 * generation turned off and size counting turned on, and once "for real".
4112 * This also means that we don't allocate space until we are sure that the
4113 * thing really will compile successfully, and we never have to move the
4114 * code and thus invalidate pointers into it. (Note that it has to be in
4115 * one piece because free() must be able to free it all.) [NB: not true in perl]
4117 * Beware that the optimization-preparation code in here knows about some
4118 * of the structure of the compiled regexp. [I'll say.]
4123 #ifndef PERL_IN_XSUB_RE
4124 #define RE_ENGINE_PTR &PL_core_reg_engine
4126 extern const struct regexp_engine my_reg_engine;
4127 #define RE_ENGINE_PTR &my_reg_engine
4130 #ifndef PERL_IN_XSUB_RE
4132 Perl_pregcomp(pTHX_ const SV * const pattern, const U32 flags)
4135 HV * const table = GvHV(PL_hintgv);
4136 /* Dispatch a request to compile a regexp to correct
4139 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
4140 GET_RE_DEBUG_FLAGS_DECL;
4141 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
4142 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4144 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4147 return CALLREGCOMP_ENG(eng, pattern, flags);
4150 return Perl_re_compile(aTHX_ pattern, flags);
4155 Perl_re_compile(pTHX_ const SV * const pattern, const U32 pm_flags)
4159 register regexp_internal *ri;
4161 char* exp = SvPV((SV*)pattern, plen);
4162 char* xend = exp + plen;
4169 RExC_state_t RExC_state;
4170 RExC_state_t * const pRExC_state = &RExC_state;
4171 #ifdef TRIE_STUDY_OPT
4173 RExC_state_t copyRExC_state;
4175 GET_RE_DEBUG_FLAGS_DECL;
4176 DEBUG_r(if (!PL_colorset) reginitcolors());
4178 RExC_utf8 = RExC_orig_utf8 = pm_flags & RXf_UTF8;
4181 SV *dsv= sv_newmortal();
4182 RE_PV_QUOTED_DECL(s, RExC_utf8,
4183 dsv, exp, plen, 60);
4184 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4185 PL_colors[4],PL_colors[5],s);
4190 RExC_flags = pm_flags;
4194 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4195 RExC_seen_evals = 0;
4198 /* First pass: determine size, legality. */
4206 RExC_emit = &PL_regdummy;
4207 RExC_whilem_seen = 0;
4208 RExC_charnames = NULL;
4209 RExC_open_parens = NULL;
4210 RExC_close_parens = NULL;
4212 RExC_paren_names = NULL;
4214 RExC_paren_name_list = NULL;
4216 RExC_recurse = NULL;
4217 RExC_recurse_count = 0;
4219 #if 0 /* REGC() is (currently) a NOP at the first pass.
4220 * Clever compilers notice this and complain. --jhi */
4221 REGC((U8)REG_MAGIC, (char*)RExC_emit);
4223 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4224 if (reg(pRExC_state, 0, &flags,1) == NULL) {
4225 RExC_precomp = NULL;
4228 if (RExC_utf8 && !RExC_orig_utf8) {
4229 /* It's possible to write a regexp in ascii that represents Unicode
4230 codepoints outside of the byte range, such as via \x{100}. If we
4231 detect such a sequence we have to convert the entire pattern to utf8
4232 and then recompile, as our sizing calculation will have been based
4233 on 1 byte == 1 character, but we will need to use utf8 to encode
4234 at least some part of the pattern, and therefore must convert the whole
4236 XXX: somehow figure out how to make this less expensive...
4239 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4240 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4241 exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)exp, &len);
4243 RExC_orig_utf8 = RExC_utf8;
4245 goto redo_first_pass;
4248 PerlIO_printf(Perl_debug_log,
4249 "Required size %"IVdf" nodes\n"
4250 "Starting second pass (creation)\n",
4253 RExC_lastparse=NULL;
4255 /* Small enough for pointer-storage convention?
4256 If extralen==0, this means that we will not need long jumps. */
4257 if (RExC_size >= 0x10000L && RExC_extralen)
4258 RExC_size += RExC_extralen;
4261 if (RExC_whilem_seen > 15)
4262 RExC_whilem_seen = 15;
4264 /* Allocate space and zero-initialize. Note, the two step process
4265 of zeroing when in debug mode, thus anything assigned has to
4266 happen after that */
4267 Newxz(r, 1, regexp);
4268 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4269 char, regexp_internal);
4270 if ( r == NULL || ri == NULL )
4271 FAIL("Regexp out of space");
4273 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
4274 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
4276 /* bulk initialize base fields with 0. */
4277 Zero(ri, sizeof(regexp_internal), char);
4280 /* non-zero initialization begins here */
4282 r->engine= RE_ENGINE_PTR;
4285 r->extflags = pm_flags;
4287 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
4288 bool has_minus = ((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD);
4289 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
4290 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD) >> 12);
4291 const char *fptr = STD_PAT_MODS; /*"msix"*/
4293 r->wraplen = r->prelen + has_minus + has_p + has_runon
4294 + (sizeof(STD_PAT_MODS) - 1)
4295 + (sizeof("(?:)") - 1);
4297 Newx(r->wrapped, r->wraplen + 1, char );
4301 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
4303 char *r = p + (sizeof(STD_PAT_MODS) - 1) + has_minus - 1;
4304 char *colon = r + 1;
4307 while((ch = *fptr++)) {
4321 Copy(RExC_precomp, p, r->prelen, char);
4331 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4333 if (RExC_seen & REG_SEEN_RECURSE) {
4334 Newxz(RExC_open_parens, RExC_npar,regnode *);
4335 SAVEFREEPV(RExC_open_parens);
4336 Newxz(RExC_close_parens,RExC_npar,regnode *);
4337 SAVEFREEPV(RExC_close_parens);
4340 /* Useful during FAIL. */
4341 #ifdef RE_TRACK_PATTERN_OFFSETS
4342 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4343 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
4344 "%s %"UVuf" bytes for offset annotations.\n",
4345 ri->u.offsets ? "Got" : "Couldn't get",
4346 (UV)((2*RExC_size+1) * sizeof(U32))));
4348 SetProgLen(ri,RExC_size);
4352 /* Second pass: emit code. */
4353 RExC_flags = pm_flags; /* don't let top level (?i) bleed */
4358 RExC_emit_start = ri->program;
4359 RExC_emit = ri->program;
4360 RExC_emit_bound = ri->program + RExC_size + 1;
4362 /* Store the count of eval-groups for security checks: */
4363 RExC_rx->seen_evals = RExC_seen_evals;
4364 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
4365 if (reg(pRExC_state, 0, &flags,1) == NULL) {
4369 /* XXXX To minimize changes to RE engine we always allocate
4370 3-units-long substrs field. */
4371 Newx(r->substrs, 1, struct reg_substr_data);
4372 if (RExC_recurse_count) {
4373 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4374 SAVEFREEPV(RExC_recurse);
4378 r->minlen = minlen = sawplus = sawopen = 0;
4379 Zero(r->substrs, 1, struct reg_substr_data);
4381 #ifdef TRIE_STUDY_OPT
4384 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
4386 RExC_state = copyRExC_state;
4387 if (seen & REG_TOP_LEVEL_BRANCHES)
4388 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4390 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
4391 if (data.last_found) {
4392 SvREFCNT_dec(data.longest_fixed);
4393 SvREFCNT_dec(data.longest_float);
4394 SvREFCNT_dec(data.last_found);
4396 StructCopy(&zero_scan_data, &data, scan_data_t);
4398 StructCopy(&zero_scan_data, &data, scan_data_t);
4399 copyRExC_state = RExC_state;
4402 StructCopy(&zero_scan_data, &data, scan_data_t);
4405 /* Dig out information for optimizations. */
4406 r->extflags = RExC_flags; /* was pm_op */
4407 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
4410 r->extflags |= RXf_UTF8; /* Unicode in it? */
4411 ri->regstclass = NULL;
4412 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
4413 r->intflags |= PREGf_NAUGHTY;
4414 scan = ri->program + 1; /* First BRANCH. */
4416 /* testing for BRANCH here tells us whether there is "must appear"
4417 data in the pattern. If there is then we can use it for optimisations */
4418 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
4420 STRLEN longest_float_length, longest_fixed_length;
4421 struct regnode_charclass_class ch_class; /* pointed to by data */
4423 I32 last_close = 0; /* pointed to by data */
4424 regnode *first= scan;
4425 regnode *first_next= regnext(first);
4427 /* Skip introductions and multiplicators >= 1. */
4428 while ((OP(first) == OPEN && (sawopen = 1)) ||
4429 /* An OR of *one* alternative - should not happen now. */
4430 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
4431 /* for now we can't handle lookbehind IFMATCH*/
4432 (OP(first) == IFMATCH && !first->flags) ||
4433 (OP(first) == PLUS) ||
4434 (OP(first) == MINMOD) ||
4435 /* An {n,m} with n>0 */
4436 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
4437 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
4440 if (OP(first) == PLUS)
4443 first += regarglen[OP(first)];
4444 if (OP(first) == IFMATCH) {
4445 first = NEXTOPER(first);
4446 first += EXTRA_STEP_2ARGS;
4447 } else /* XXX possible optimisation for /(?=)/ */
4448 first = NEXTOPER(first);
4449 first_next= regnext(first);
4452 /* Starting-point info. */
4454 DEBUG_PEEP("first:",first,0);
4455 /* Ignore EXACT as we deal with it later. */
4456 if (PL_regkind[OP(first)] == EXACT) {
4457 if (OP(first) == EXACT)
4458 NOOP; /* Empty, get anchored substr later. */
4459 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
4460 ri->regstclass = first;
4463 else if (PL_regkind[OP(first)] == TRIE &&
4464 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
4467 /* this can happen only on restudy */
4468 if ( OP(first) == TRIE ) {
4469 struct regnode_1 *trieop = (struct regnode_1 *)
4470 PerlMemShared_calloc(1, sizeof(struct regnode_1));
4471 StructCopy(first,trieop,struct regnode_1);
4472 trie_op=(regnode *)trieop;
4474 struct regnode_charclass *trieop = (struct regnode_charclass *)
4475 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
4476 StructCopy(first,trieop,struct regnode_charclass);
4477 trie_op=(regnode *)trieop;
4480 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4481 ri->regstclass = trie_op;
4484 else if (strchr((const char*)PL_simple,OP(first)))
4485 ri->regstclass = first;
4486 else if (PL_regkind[OP(first)] == BOUND ||
4487 PL_regkind[OP(first)] == NBOUND)
4488 ri->regstclass = first;
4489 else if (PL_regkind[OP(first)] == BOL) {
4490 r->extflags |= (OP(first) == MBOL
4492 : (OP(first) == SBOL
4495 first = NEXTOPER(first);
4498 else if (OP(first) == GPOS) {
4499 r->extflags |= RXf_ANCH_GPOS;
4500 first = NEXTOPER(first);
4503 else if ((!sawopen || !RExC_sawback) &&
4504 (OP(first) == STAR &&
4505 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
4506 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
4508 /* turn .* into ^.* with an implied $*=1 */
4510 (OP(NEXTOPER(first)) == REG_ANY)
4513 r->extflags |= type;
4514 r->intflags |= PREGf_IMPLICIT;
4515 first = NEXTOPER(first);
4518 if (sawplus && (!sawopen || !RExC_sawback)
4519 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
4520 /* x+ must match at the 1st pos of run of x's */
4521 r->intflags |= PREGf_SKIP;
4523 /* Scan is after the zeroth branch, first is atomic matcher. */
4524 #ifdef TRIE_STUDY_OPT
4527 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4528 (IV)(first - scan + 1))
4532 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4533 (IV)(first - scan + 1))
4539 * If there's something expensive in the r.e., find the
4540 * longest literal string that must appear and make it the
4541 * regmust. Resolve ties in favor of later strings, since
4542 * the regstart check works with the beginning of the r.e.
4543 * and avoiding duplication strengthens checking. Not a
4544 * strong reason, but sufficient in the absence of others.
4545 * [Now we resolve ties in favor of the earlier string if
4546 * it happens that c_offset_min has been invalidated, since the
4547 * earlier string may buy us something the later one won't.]
4550 data.longest_fixed = newSVpvs("");
4551 data.longest_float = newSVpvs("");
4552 data.last_found = newSVpvs("");
4553 data.longest = &(data.longest_fixed);
4555 if (!ri->regstclass) {
4556 cl_init(pRExC_state, &ch_class);
4557 data.start_class = &ch_class;
4558 stclass_flag = SCF_DO_STCLASS_AND;
4559 } else /* XXXX Check for BOUND? */
4561 data.last_closep = &last_close;
4563 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
4564 &data, -1, NULL, NULL,
4565 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
4571 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
4572 && data.last_start_min == 0 && data.last_end > 0
4573 && !RExC_seen_zerolen
4574 && !(RExC_seen & REG_SEEN_VERBARG)
4575 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4576 r->extflags |= RXf_CHECK_ALL;
4577 scan_commit(pRExC_state, &data,&minlen,0);
4578 SvREFCNT_dec(data.last_found);
4580 /* Note that code very similar to this but for anchored string
4581 follows immediately below, changes may need to be made to both.
4584 longest_float_length = CHR_SVLEN(data.longest_float);
4585 if (longest_float_length
4586 || (data.flags & SF_FL_BEFORE_EOL
4587 && (!(data.flags & SF_FL_BEFORE_MEOL)
4588 || (RExC_flags & RXf_PMf_MULTILINE))))
4592 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
4593 && data.offset_fixed == data.offset_float_min
4594 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4595 goto remove_float; /* As in (a)+. */
4597 /* copy the information about the longest float from the reg_scan_data
4598 over to the program. */
4599 if (SvUTF8(data.longest_float)) {
4600 r->float_utf8 = data.longest_float;
4601 r->float_substr = NULL;
4603 r->float_substr = data.longest_float;
4604 r->float_utf8 = NULL;
4606 /* float_end_shift is how many chars that must be matched that
4607 follow this item. We calculate it ahead of time as once the
4608 lookbehind offset is added in we lose the ability to correctly
4610 ml = data.minlen_float ? *(data.minlen_float)
4611 : (I32)longest_float_length;
4612 r->float_end_shift = ml - data.offset_float_min
4613 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4614 + data.lookbehind_float;
4615 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
4616 r->float_max_offset = data.offset_float_max;
4617 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
4618 r->float_max_offset -= data.lookbehind_float;
4620 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4621 && (!(data.flags & SF_FL_BEFORE_MEOL)
4622 || (RExC_flags & RXf_PMf_MULTILINE)));
4623 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
4627 r->float_substr = r->float_utf8 = NULL;
4628 SvREFCNT_dec(data.longest_float);
4629 longest_float_length = 0;
4632 /* Note that code very similar to this but for floating string
4633 is immediately above, changes may need to be made to both.
4636 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
4637 if (longest_fixed_length
4638 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4639 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4640 || (RExC_flags & RXf_PMf_MULTILINE))))
4644 /* copy the information about the longest fixed
4645 from the reg_scan_data over to the program. */
4646 if (SvUTF8(data.longest_fixed)) {
4647 r->anchored_utf8 = data.longest_fixed;
4648 r->anchored_substr = NULL;
4650 r->anchored_substr = data.longest_fixed;
4651 r->anchored_utf8 = NULL;
4653 /* fixed_end_shift is how many chars that must be matched that
4654 follow this item. We calculate it ahead of time as once the
4655 lookbehind offset is added in we lose the ability to correctly
4657 ml = data.minlen_fixed ? *(data.minlen_fixed)
4658 : (I32)longest_fixed_length;
4659 r->anchored_end_shift = ml - data.offset_fixed
4660 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4661 + data.lookbehind_fixed;
4662 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4664 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4665 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4666 || (RExC_flags & RXf_PMf_MULTILINE)));
4667 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
4670 r->anchored_substr = r->anchored_utf8 = NULL;
4671 SvREFCNT_dec(data.longest_fixed);
4672 longest_fixed_length = 0;
4675 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4676 ri->regstclass = NULL;
4677 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4679 && !(data.start_class->flags & ANYOF_EOS)
4680 && !cl_is_anything(data.start_class))
4682 const U32 n = add_data(pRExC_state, 1, "f");
4684 Newx(RExC_rxi->data->data[n], 1,
4685 struct regnode_charclass_class);
4686 StructCopy(data.start_class,
4687 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4688 struct regnode_charclass_class);
4689 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
4690 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
4691 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
4692 regprop(r, sv, (regnode*)data.start_class);
4693 PerlIO_printf(Perl_debug_log,
4694 "synthetic stclass \"%s\".\n",
4695 SvPVX_const(sv));});
4698 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
4699 if (longest_fixed_length > longest_float_length) {
4700 r->check_end_shift = r->anchored_end_shift;
4701 r->check_substr = r->anchored_substr;
4702 r->check_utf8 = r->anchored_utf8;
4703 r->check_offset_min = r->check_offset_max = r->anchored_offset;
4704 if (r->extflags & RXf_ANCH_SINGLE)
4705 r->extflags |= RXf_NOSCAN;
4708 r->check_end_shift = r->float_end_shift;
4709 r->check_substr = r->float_substr;
4710 r->check_utf8 = r->float_utf8;
4711 r->check_offset_min = r->float_min_offset;
4712 r->check_offset_max = r->float_max_offset;
4714 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4715 This should be changed ASAP! */
4716 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
4717 r->extflags |= RXf_USE_INTUIT;
4718 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
4719 r->extflags |= RXf_INTUIT_TAIL;
4721 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4722 if ( (STRLEN)minlen < longest_float_length )
4723 minlen= longest_float_length;
4724 if ( (STRLEN)minlen < longest_fixed_length )
4725 minlen= longest_fixed_length;
4729 /* Several toplevels. Best we can is to set minlen. */
4731 struct regnode_charclass_class ch_class;
4734 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
4736 scan = ri->program + 1;
4737 cl_init(pRExC_state, &ch_class);
4738 data.start_class = &ch_class;
4739 data.last_closep = &last_close;
4742 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
4743 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
4747 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
4748 = r->float_substr = r->float_utf8 = NULL;
4749 if (!(data.start_class->flags & ANYOF_EOS)
4750 && !cl_is_anything(data.start_class))
4752 const U32 n = add_data(pRExC_state, 1, "f");
4754 Newx(RExC_rxi->data->data[n], 1,
4755 struct regnode_charclass_class);
4756 StructCopy(data.start_class,
4757 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4758 struct regnode_charclass_class);
4759 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
4760 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
4761 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
4762 regprop(r, sv, (regnode*)data.start_class);
4763 PerlIO_printf(Perl_debug_log,
4764 "synthetic stclass \"%s\".\n",
4765 SvPVX_const(sv));});
4769 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4770 the "real" pattern. */
4772 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
4773 (IV)minlen, (IV)r->minlen);
4775 r->minlenret = minlen;
4776 if (r->minlen < minlen)
4779 if (RExC_seen & REG_SEEN_GPOS)
4780 r->extflags |= RXf_GPOS_SEEN;
4781 if (RExC_seen & REG_SEEN_LOOKBEHIND)
4782 r->extflags |= RXf_LOOKBEHIND_SEEN;
4783 if (RExC_seen & REG_SEEN_EVAL)
4784 r->extflags |= RXf_EVAL_SEEN;
4785 if (RExC_seen & REG_SEEN_CANY)
4786 r->extflags |= RXf_CANY_SEEN;
4787 if (RExC_seen & REG_SEEN_VERBARG)
4788 r->intflags |= PREGf_VERBARG_SEEN;
4789 if (RExC_seen & REG_SEEN_CUTGROUP)
4790 r->intflags |= PREGf_CUTGROUP_SEEN;
4791 if (RExC_paren_names)
4792 r->paren_names = (HV*)SvREFCNT_inc(RExC_paren_names);
4794 r->paren_names = NULL;
4796 #ifdef STUPID_PATTERN_CHECKS
4798 r->extflags |= RXf_NULL;
4799 if (r->extflags & RXf_SPLIT && r->prelen == 1 && r->precomp[0] == ' ')
4800 /* XXX: this should happen BEFORE we compile */
4801 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
4802 else if (r->prelen == 3 && memEQ("\\s+", r->precomp, 3))
4803 r->extflags |= RXf_WHITE;
4804 else if (r->prelen == 1 && r->precomp[0] == '^')
4805 r->extflags |= RXf_START_ONLY;
4807 if (r->extflags & RXf_SPLIT && r->prelen == 1 && r->precomp[0] == ' ')
4808 /* XXX: this should happen BEFORE we compile */
4809 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
4811 regnode *first = ri->program + 1;
4813 U8 nop = OP(NEXTOPER(first));
4815 if (PL_regkind[fop] == NOTHING && nop == END)
4816 r->extflags |= RXf_NULL;
4817 else if (PL_regkind[fop] == BOL && nop == END)
4818 r->extflags |= RXf_START_ONLY;
4819 else if (fop == PLUS && nop ==SPACE && OP(regnext(first))==END)
4820 r->extflags |= RXf_WHITE;
4824 if (RExC_paren_names) {
4825 ri->name_list_idx = add_data( pRExC_state, 1, "p" );
4826 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
4829 ri->name_list_idx = 0;
4831 if (RExC_recurse_count) {
4832 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
4833 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
4834 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
4837 Newxz(r->offs, RExC_npar, regexp_paren_pair);
4838 /* assume we don't need to swap parens around before we match */
4841 PerlIO_printf(Perl_debug_log,"Final program:\n");
4844 #ifdef RE_TRACK_PATTERN_OFFSETS
4845 DEBUG_OFFSETS_r(if (ri->u.offsets) {
4846 const U32 len = ri->u.offsets[0];
4848 GET_RE_DEBUG_FLAGS_DECL;
4849 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
4850 for (i = 1; i <= len; i++) {
4851 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
4852 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
4853 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
4855 PerlIO_printf(Perl_debug_log, "\n");
4861 #undef RE_ENGINE_PTR
4865 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
4868 PERL_UNUSED_ARG(value);
4870 if (flags & RXapif_FETCH) {
4871 return reg_named_buff_fetch(rx, key, flags);
4872 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
4873 Perl_croak(aTHX_ PL_no_modify);
4875 } else if (flags & RXapif_EXISTS) {
4876 return reg_named_buff_exists(rx, key, flags)
4879 } else if (flags & RXapif_REGNAMES) {
4880 return reg_named_buff_all(rx, flags);
4881 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
4882 return reg_named_buff_scalar(rx, flags);
4884 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
4890 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
4893 PERL_UNUSED_ARG(lastkey);
4895 if (flags & RXapif_FIRSTKEY)
4896 return reg_named_buff_firstkey(rx, flags);
4897 else if (flags & RXapif_NEXTKEY)
4898 return reg_named_buff_nextkey(rx, flags);
4900 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
4906 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const rx, SV * const namesv, const U32 flags)
4908 AV *retarray = NULL;
4910 if (flags & RXapif_ALL)
4913 if (rx && rx->paren_names) {
4914 HE *he_str = hv_fetch_ent( rx->paren_names, namesv, 0, 0 );
4917 SV* sv_dat=HeVAL(he_str);
4918 I32 *nums=(I32*)SvPVX(sv_dat);
4919 for ( i=0; i<SvIVX(sv_dat); i++ ) {
4920 if ((I32)(rx->nparens) >= nums[i]
4921 && rx->offs[nums[i]].start != -1
4922 && rx->offs[nums[i]].end != -1)
4925 CALLREG_NUMBUF_FETCH(rx,nums[i],ret);
4929 ret = newSVsv(&PL_sv_undef);
4932 SvREFCNT_inc_simple_void(ret);
4933 av_push(retarray, ret);
4937 return newRV((SV*)retarray);
4944 Perl_reg_named_buff_exists(pTHX_ REGEXP * const rx, SV * const key,
4947 if (rx && rx->paren_names) {
4948 if (flags & RXapif_ALL) {
4949 return hv_exists_ent(rx->paren_names, key, 0);
4951 SV *sv = CALLREG_NAMED_BUFF_FETCH(rx, key, flags);
4965 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const rx, const U32 flags)
4967 if ( rx && rx->paren_names ) {
4968 (void)hv_iterinit(rx->paren_names);
4970 return CALLREG_NAMED_BUFF_NEXTKEY(rx, NULL, flags & ~RXapif_FIRSTKEY);
4977 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const rx, const U32 flags)
4979 if (rx && rx->paren_names) {
4980 HV *hv = rx->paren_names;
4982 while ( (temphe = hv_iternext_flags(hv,0)) ) {
4985 SV* sv_dat = HeVAL(temphe);
4986 I32 *nums = (I32*)SvPVX(sv_dat);
4987 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
4988 if ((I32)(rx->lastcloseparen) >= nums[i] &&
4989 rx->offs[nums[i]].start != -1 &&
4990 rx->offs[nums[i]].end != -1)
4996 if (parno || flags & RXapif_ALL) {
4998 char *pv = HePV(temphe, len);
4999 return newSVpvn(pv,len);
5007 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const rx, const U32 flags)
5013 if (rx && rx->paren_names) {
5014 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5015 return newSViv(HvTOTALKEYS(rx->paren_names));
5016 } else if (flags & RXapif_ONE) {
5017 ret = CALLREG_NAMED_BUFF_ALL(rx, (flags | RXapif_REGNAMES));
5018 av = (AV*)SvRV(ret);
5019 length = av_len(av);
5020 return newSViv(length + 1);
5022 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5026 return &PL_sv_undef;
5030 Perl_reg_named_buff_all(pTHX_ REGEXP * const rx, const U32 flags)
5034 if (rx && rx->paren_names) {
5035 HV *hv= rx->paren_names;
5037 (void)hv_iterinit(hv);
5038 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5041 SV* sv_dat = HeVAL(temphe);
5042 I32 *nums = (I32*)SvPVX(sv_dat);
5043 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5044 if ((I32)(rx->lastcloseparen) >= nums[i] &&
5045 rx->offs[nums[i]].start != -1 &&
5046 rx->offs[nums[i]].end != -1)
5052 if (parno || flags & RXapif_ALL) {
5054 char *pv = HePV(temphe, len);
5055 av_push(av, newSVpvn(pv,len));
5060 return newRV((SV*)av);
5064 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const rx, const I32 paren, SV * const sv)
5071 sv_setsv(sv,&PL_sv_undef);
5075 if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
5077 i = rx->offs[0].start;
5081 if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
5083 s = rx->subbeg + rx->offs[0].end;
5084 i = rx->sublen - rx->offs[0].end;
5087 if ( 0 <= paren && paren <= (I32)rx->nparens &&
5088 (s1 = rx->offs[paren].start) != -1 &&
5089 (t1 = rx->offs[paren].end) != -1)
5093 s = rx->subbeg + s1;
5095 sv_setsv(sv,&PL_sv_undef);
5098 assert(rx->sublen >= (s - rx->subbeg) + i );
5100 const int oldtainted = PL_tainted;
5102 sv_setpvn(sv, s, i);
5103 PL_tainted = oldtainted;
5104 if ( (rx->extflags & RXf_CANY_SEEN)
5105 ? (RX_MATCH_UTF8(rx)
5106 && (!i || is_utf8_string((U8*)s, i)))
5107 : (RX_MATCH_UTF8(rx)) )
5114 if (RX_MATCH_TAINTED(rx)) {
5115 if (SvTYPE(sv) >= SVt_PVMG) {
5116 MAGIC* const mg = SvMAGIC(sv);
5119 SvMAGIC_set(sv, mg->mg_moremagic);
5121 if ((mgt = SvMAGIC(sv))) {
5122 mg->mg_moremagic = mgt;
5123 SvMAGIC_set(sv, mg);
5133 sv_setsv(sv,&PL_sv_undef);
5139 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5140 SV const * const value)
5142 PERL_UNUSED_ARG(rx);
5143 PERL_UNUSED_ARG(paren);
5144 PERL_UNUSED_ARG(value);
5147 Perl_croak(aTHX_ PL_no_modify);
5151 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const rx, const SV * const sv,
5157 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5159 /* $` / ${^PREMATCH} */
5160 case RX_BUFF_IDX_PREMATCH:
5161 if (rx->offs[0].start != -1) {
5162 i = rx->offs[0].start;
5170 /* $' / ${^POSTMATCH} */
5171 case RX_BUFF_IDX_POSTMATCH:
5172 if (rx->offs[0].end != -1) {
5173 i = rx->sublen - rx->offs[0].end;
5175 s1 = rx->offs[0].end;
5181 /* $& / ${^MATCH}, $1, $2, ... */
5183 if (paren <= (I32)rx->nparens &&
5184 (s1 = rx->offs[paren].start) != -1 &&
5185 (t1 = rx->offs[paren].end) != -1)
5190 if (ckWARN(WARN_UNINITIALIZED))
5191 report_uninit((SV*)sv);
5196 if (i > 0 && RX_MATCH_UTF8(rx)) {
5197 const char * const s = rx->subbeg + s1;
5202 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5209 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
5211 PERL_UNUSED_ARG(rx);
5212 return newSVpvs("Regexp");
5215 /* Scans the name of a named buffer from the pattern.
5216 * If flags is REG_RSN_RETURN_NULL returns null.
5217 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
5218 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
5219 * to the parsed name as looked up in the RExC_paren_names hash.
5220 * If there is an error throws a vFAIL().. type exception.
5223 #define REG_RSN_RETURN_NULL 0
5224 #define REG_RSN_RETURN_NAME 1
5225 #define REG_RSN_RETURN_DATA 2
5228 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) {
5229 char *name_start = RExC_parse;
5231 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
5232 /* skip IDFIRST by using do...while */
5235 RExC_parse += UTF8SKIP(RExC_parse);
5236 } while (isALNUM_utf8((U8*)RExC_parse));
5240 } while (isALNUM(*RExC_parse));
5244 SV* sv_name = sv_2mortal(Perl_newSVpvn(aTHX_ name_start,
5245 (int)(RExC_parse - name_start)));
5248 if ( flags == REG_RSN_RETURN_NAME)
5250 else if (flags==REG_RSN_RETURN_DATA) {
5253 if ( ! sv_name ) /* should not happen*/
5254 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
5255 if (RExC_paren_names)
5256 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
5258 sv_dat = HeVAL(he_str);
5260 vFAIL("Reference to nonexistent named group");
5264 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
5271 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
5272 int rem=(int)(RExC_end - RExC_parse); \
5281 if (RExC_lastparse!=RExC_parse) \
5282 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
5285 iscut ? "..." : "<" \
5288 PerlIO_printf(Perl_debug_log,"%16s",""); \
5291 num = RExC_size + 1; \
5293 num=REG_NODE_NUM(RExC_emit); \
5294 if (RExC_lastnum!=num) \
5295 PerlIO_printf(Perl_debug_log,"|%4d",num); \
5297 PerlIO_printf(Perl_debug_log,"|%4s",""); \
5298 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
5299 (int)((depth*2)), "", \
5303 RExC_lastparse=RExC_parse; \
5308 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
5309 DEBUG_PARSE_MSG((funcname)); \
5310 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
5312 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
5313 DEBUG_PARSE_MSG((funcname)); \
5314 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
5317 - reg - regular expression, i.e. main body or parenthesized thing
5319 * Caller must absorb opening parenthesis.
5321 * Combining parenthesis handling with the base level of regular expression
5322 * is a trifle forced, but the need to tie the tails of the branches to what
5323 * follows makes it hard to avoid.
5325 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
5327 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
5329 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
5333 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
5334 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
5337 register regnode *ret; /* Will be the head of the group. */
5338 register regnode *br;
5339 register regnode *lastbr;
5340 register regnode *ender = NULL;
5341 register I32 parno = 0;
5343 U32 oregflags = RExC_flags;
5344 bool have_branch = 0;
5346 I32 freeze_paren = 0;
5347 I32 after_freeze = 0;
5349 /* for (?g), (?gc), and (?o) warnings; warning
5350 about (?c) will warn about (?g) -- japhy */
5352 #define WASTED_O 0x01
5353 #define WASTED_G 0x02
5354 #define WASTED_C 0x04
5355 #define WASTED_GC (0x02|0x04)
5356 I32 wastedflags = 0x00;
5358 char * parse_start = RExC_parse; /* MJD */
5359 char * const oregcomp_parse = RExC_parse;
5361 GET_RE_DEBUG_FLAGS_DECL;
5362 DEBUG_PARSE("reg ");
5364 *flagp = 0; /* Tentatively. */
5367 /* Make an OPEN node, if parenthesized. */
5369 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
5370 char *start_verb = RExC_parse;
5371 STRLEN verb_len = 0;
5372 char *start_arg = NULL;
5373 unsigned char op = 0;
5375 int internal_argval = 0; /* internal_argval is only useful if !argok */
5376 while ( *RExC_parse && *RExC_parse != ')' ) {
5377 if ( *RExC_parse == ':' ) {
5378 start_arg = RExC_parse + 1;
5384 verb_len = RExC_parse - start_verb;
5387 while ( *RExC_parse && *RExC_parse != ')' )
5389 if ( *RExC_parse != ')' )
5390 vFAIL("Unterminated verb pattern argument");
5391 if ( RExC_parse == start_arg )
5394 if ( *RExC_parse != ')' )
5395 vFAIL("Unterminated verb pattern");
5398 switch ( *start_verb ) {
5399 case 'A': /* (*ACCEPT) */
5400 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
5402 internal_argval = RExC_nestroot;
5405 case 'C': /* (*COMMIT) */
5406 if ( memEQs(start_verb,verb_len,"COMMIT") )
5409 case 'F': /* (*FAIL) */
5410 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
5415 case ':': /* (*:NAME) */
5416 case 'M': /* (*MARK:NAME) */
5417 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
5422 case 'P': /* (*PRUNE) */
5423 if ( memEQs(start_verb,verb_len,"PRUNE") )
5426 case 'S': /* (*SKIP) */
5427 if ( memEQs(start_verb,verb_len,"SKIP") )
5430 case 'T': /* (*THEN) */
5431 /* [19:06] <TimToady> :: is then */
5432 if ( memEQs(start_verb,verb_len,"THEN") ) {
5434 RExC_seen |= REG_SEEN_CUTGROUP;
5440 vFAIL3("Unknown verb pattern '%.*s'",
5441 verb_len, start_verb);
5444 if ( start_arg && internal_argval ) {
5445 vFAIL3("Verb pattern '%.*s' may not have an argument",
5446 verb_len, start_verb);
5447 } else if ( argok < 0 && !start_arg ) {
5448 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
5449 verb_len, start_verb);
5451 ret = reganode(pRExC_state, op, internal_argval);
5452 if ( ! internal_argval && ! SIZE_ONLY ) {
5454 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
5455 ARG(ret) = add_data( pRExC_state, 1, "S" );
5456 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
5463 if (!internal_argval)
5464 RExC_seen |= REG_SEEN_VERBARG;
5465 } else if ( start_arg ) {
5466 vFAIL3("Verb pattern '%.*s' may not have an argument",
5467 verb_len, start_verb);
5469 ret = reg_node(pRExC_state, op);
5471 nextchar(pRExC_state);
5474 if (*RExC_parse == '?') { /* (?...) */
5475 bool is_logical = 0;
5476 const char * const seqstart = RExC_parse;
5479 paren = *RExC_parse++;
5480 ret = NULL; /* For look-ahead/behind. */
5483 case 'P': /* (?P...) variants for those used to PCRE/Python */
5484 paren = *RExC_parse++;
5485 if ( paren == '<') /* (?P<...>) named capture */
5487 else if (paren == '>') { /* (?P>name) named recursion */
5488 goto named_recursion;
5490 else if (paren == '=') { /* (?P=...) named backref */
5491 /* this pretty much dupes the code for \k<NAME> in regatom(), if
5492 you change this make sure you change that */
5493 char* name_start = RExC_parse;
5495 SV *sv_dat = reg_scan_name(pRExC_state,
5496 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5497 if (RExC_parse == name_start || *RExC_parse != ')')
5498 vFAIL2("Sequence %.3s... not terminated",parse_start);
5501 num = add_data( pRExC_state, 1, "S" );
5502 RExC_rxi->data->data[num]=(void*)sv_dat;
5503 SvREFCNT_inc_simple_void(sv_dat);
5506 ret = reganode(pRExC_state,
5507 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
5511 Set_Node_Offset(ret, parse_start+1);
5512 Set_Node_Cur_Length(ret); /* MJD */
5514 nextchar(pRExC_state);
5518 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5520 case '<': /* (?<...) */
5521 if (*RExC_parse == '!')
5523 else if (*RExC_parse != '=')
5529 case '\'': /* (?'...') */
5530 name_start= RExC_parse;
5531 svname = reg_scan_name(pRExC_state,
5532 SIZE_ONLY ? /* reverse test from the others */
5533 REG_RSN_RETURN_NAME :
5534 REG_RSN_RETURN_NULL);
5535 if (RExC_parse == name_start) {
5537 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5540 if (*RExC_parse != paren)
5541 vFAIL2("Sequence (?%c... not terminated",
5542 paren=='>' ? '<' : paren);
5546 if (!svname) /* shouldnt happen */
5548 "panic: reg_scan_name returned NULL");
5549 if (!RExC_paren_names) {
5550 RExC_paren_names= newHV();
5551 sv_2mortal((SV*)RExC_paren_names);
5553 RExC_paren_name_list= newAV();
5554 sv_2mortal((SV*)RExC_paren_name_list);
5557 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
5559 sv_dat = HeVAL(he_str);
5561 /* croak baby croak */
5563 "panic: paren_name hash element allocation failed");
5564 } else if ( SvPOK(sv_dat) ) {
5565 /* (?|...) can mean we have dupes so scan to check
5566 its already been stored. Maybe a flag indicating
5567 we are inside such a construct would be useful,
5568 but the arrays are likely to be quite small, so
5569 for now we punt -- dmq */
5570 IV count = SvIV(sv_dat);
5571 I32 *pv = (I32*)SvPVX(sv_dat);
5573 for ( i = 0 ; i < count ; i++ ) {
5574 if ( pv[i] == RExC_npar ) {
5580 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
5581 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
5582 pv[count] = RExC_npar;
5586 (void)SvUPGRADE(sv_dat,SVt_PVNV);
5587 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
5592 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
5593 SvREFCNT_dec(svname);
5596 /*sv_dump(sv_dat);*/
5598 nextchar(pRExC_state);
5600 goto capturing_parens;
5602 RExC_seen |= REG_SEEN_LOOKBEHIND;
5604 case '=': /* (?=...) */
5605 case '!': /* (?!...) */
5606 RExC_seen_zerolen++;
5607 if (*RExC_parse == ')') {
5608 ret=reg_node(pRExC_state, OPFAIL);
5609 nextchar(pRExC_state);
5613 case '|': /* (?|...) */
5614 /* branch reset, behave like a (?:...) except that
5615 buffers in alternations share the same numbers */
5617 after_freeze = freeze_paren = RExC_npar;
5619 case ':': /* (?:...) */
5620 case '>': /* (?>...) */
5622 case '$': /* (?$...) */
5623 case '@': /* (?@...) */
5624 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
5626 case '#': /* (?#...) */
5627 while (*RExC_parse && *RExC_parse != ')')
5629 if (*RExC_parse != ')')
5630 FAIL("Sequence (?#... not terminated");
5631 nextchar(pRExC_state);
5634 case '0' : /* (?0) */
5635 case 'R' : /* (?R) */
5636 if (*RExC_parse != ')')
5637 FAIL("Sequence (?R) not terminated");
5638 ret = reg_node(pRExC_state, GOSTART);
5639 *flagp |= POSTPONED;
5640 nextchar(pRExC_state);
5643 { /* named and numeric backreferences */
5645 case '&': /* (?&NAME) */
5646 parse_start = RExC_parse - 1;
5649 SV *sv_dat = reg_scan_name(pRExC_state,
5650 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5651 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5653 goto gen_recurse_regop;
5656 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5658 vFAIL("Illegal pattern");
5660 goto parse_recursion;
5662 case '-': /* (?-1) */
5663 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5664 RExC_parse--; /* rewind to let it be handled later */
5668 case '1': case '2': case '3': case '4': /* (?1) */
5669 case '5': case '6': case '7': case '8': case '9':
5672 num = atoi(RExC_parse);
5673 parse_start = RExC_parse - 1; /* MJD */
5674 if (*RExC_parse == '-')
5676 while (isDIGIT(*RExC_parse))
5678 if (*RExC_parse!=')')
5679 vFAIL("Expecting close bracket");
5682 if ( paren == '-' ) {
5684 Diagram of capture buffer numbering.
5685 Top line is the normal capture buffer numbers
5686 Botton line is the negative indexing as from
5690 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
5694 num = RExC_npar + num;
5697 vFAIL("Reference to nonexistent group");
5699 } else if ( paren == '+' ) {
5700 num = RExC_npar + num - 1;
5703 ret = reganode(pRExC_state, GOSUB, num);
5705 if (num > (I32)RExC_rx->nparens) {
5707 vFAIL("Reference to nonexistent group");
5709 ARG2L_SET( ret, RExC_recurse_count++);
5711 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5712 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
5716 RExC_seen |= REG_SEEN_RECURSE;
5717 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
5718 Set_Node_Offset(ret, parse_start); /* MJD */
5720 *flagp |= POSTPONED;
5721 nextchar(pRExC_state);
5723 } /* named and numeric backreferences */
5726 case '?': /* (??...) */
5728 if (*RExC_parse != '{') {
5730 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5733 *flagp |= POSTPONED;
5734 paren = *RExC_parse++;
5736 case '{': /* (?{...}) */
5741 char *s = RExC_parse;
5743 RExC_seen_zerolen++;
5744 RExC_seen |= REG_SEEN_EVAL;
5745 while (count && (c = *RExC_parse)) {
5756 if (*RExC_parse != ')') {
5758 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
5762 OP_4tree *sop, *rop;
5763 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
5766 Perl_save_re_context(aTHX);
5767 rop = sv_compile_2op(sv, &sop, "re", &pad);
5768 sop->op_private |= OPpREFCOUNTED;
5769 /* re_dup will OpREFCNT_inc */
5770 OpREFCNT_set(sop, 1);
5773 n = add_data(pRExC_state, 3, "nop");
5774 RExC_rxi->data->data[n] = (void*)rop;
5775 RExC_rxi->data->data[n+1] = (void*)sop;
5776 RExC_rxi->data->data[n+2] = (void*)pad;
5779 else { /* First pass */
5780 if (PL_reginterp_cnt < ++RExC_seen_evals
5782 /* No compiled RE interpolated, has runtime
5783 components ===> unsafe. */
5784 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5785 if (PL_tainting && PL_tainted)
5786 FAIL("Eval-group in insecure regular expression");
5787 #if PERL_VERSION > 8
5788 if (IN_PERL_COMPILETIME)
5793 nextchar(pRExC_state);
5795 ret = reg_node(pRExC_state, LOGICAL);
5798 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
5799 /* deal with the length of this later - MJD */
5802 ret = reganode(pRExC_state, EVAL, n);
5803 Set_Node_Length(ret, RExC_parse - parse_start + 1);
5804 Set_Node_Offset(ret, parse_start);
5807 case '(': /* (?(?{...})...) and (?(?=...)...) */
5810 if (RExC_parse[0] == '?') { /* (?(?...)) */
5811 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
5812 || RExC_parse[1] == '<'
5813 || RExC_parse[1] == '{') { /* Lookahead or eval. */
5816 ret = reg_node(pRExC_state, LOGICAL);
5819 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
5823 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
5824 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
5826 char ch = RExC_parse[0] == '<' ? '>' : '\'';
5827 char *name_start= RExC_parse++;
5829 SV *sv_dat=reg_scan_name(pRExC_state,
5830 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5831 if (RExC_parse == name_start || *RExC_parse != ch)
5832 vFAIL2("Sequence (?(%c... not terminated",
5833 (ch == '>' ? '<' : ch));
5836 num = add_data( pRExC_state, 1, "S" );
5837 RExC_rxi->data->data[num]=(void*)sv_dat;
5838 SvREFCNT_inc_simple_void(sv_dat);
5840 ret = reganode(pRExC_state,NGROUPP,num);
5841 goto insert_if_check_paren;
5843 else if (RExC_parse[0] == 'D' &&
5844 RExC_parse[1] == 'E' &&
5845 RExC_parse[2] == 'F' &&
5846 RExC_parse[3] == 'I' &&
5847 RExC_parse[4] == 'N' &&
5848 RExC_parse[5] == 'E')
5850 ret = reganode(pRExC_state,DEFINEP,0);
5853 goto insert_if_check_paren;
5855 else if (RExC_parse[0] == 'R') {
5858 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5859 parno = atoi(RExC_parse++);
5860 while (isDIGIT(*RExC_parse))
5862 } else if (RExC_parse[0] == '&') {
5865 sv_dat = reg_scan_name(pRExC_state,
5866 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5867 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5869 ret = reganode(pRExC_state,INSUBP,parno);
5870 goto insert_if_check_paren;
5872 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5875 parno = atoi(RExC_parse++);
5877 while (isDIGIT(*RExC_parse))
5879 ret = reganode(pRExC_state, GROUPP, parno);
5881 insert_if_check_paren:
5882 if ((c = *nextchar(pRExC_state)) != ')')
5883 vFAIL("Switch condition not recognized");
5885 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
5886 br = regbranch(pRExC_state, &flags, 1,depth+1);
5888 br = reganode(pRExC_state, LONGJMP, 0);
5890 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
5891 c = *nextchar(pRExC_state);
5896 vFAIL("(?(DEFINE)....) does not allow branches");
5897 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
5898 regbranch(pRExC_state, &flags, 1,depth+1);
5899 REGTAIL(pRExC_state, ret, lastbr);
5902 c = *nextchar(pRExC_state);
5907 vFAIL("Switch (?(condition)... contains too many branches");
5908 ender = reg_node(pRExC_state, TAIL);
5909 REGTAIL(pRExC_state, br, ender);
5911 REGTAIL(pRExC_state, lastbr, ender);
5912 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
5915 REGTAIL(pRExC_state, ret, ender);
5916 RExC_size++; /* XXX WHY do we need this?!!
5917 For large programs it seems to be required
5918 but I can't figure out why. -- dmq*/
5922 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
5926 RExC_parse--; /* for vFAIL to print correctly */
5927 vFAIL("Sequence (? incomplete");
5931 parse_flags: /* (?i) */
5933 U32 posflags = 0, negflags = 0;
5934 U32 *flagsp = &posflags;
5936 while (*RExC_parse) {
5937 /* && strchr("iogcmsx", *RExC_parse) */
5938 /* (?g), (?gc) and (?o) are useless here
5939 and must be globally applied -- japhy */
5940 switch (*RExC_parse) {
5941 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
5942 case ONCE_PAT_MOD: /* 'o' */
5943 case GLOBAL_PAT_MOD: /* 'g' */
5944 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5945 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
5946 if (! (wastedflags & wflagbit) ) {
5947 wastedflags |= wflagbit;
5950 "Useless (%s%c) - %suse /%c modifier",
5951 flagsp == &negflags ? "?-" : "?",
5953 flagsp == &negflags ? "don't " : "",
5960 case CONTINUE_PAT_MOD: /* 'c' */
5961 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5962 if (! (wastedflags & WASTED_C) ) {
5963 wastedflags |= WASTED_GC;
5966 "Useless (%sc) - %suse /gc modifier",
5967 flagsp == &negflags ? "?-" : "?",
5968 flagsp == &negflags ? "don't " : ""
5973 case KEEPCOPY_PAT_MOD: /* 'p' */
5974 if (flagsp == &negflags) {
5975 if (SIZE_ONLY && ckWARN(WARN_REGEXP))
5976 vWARN(RExC_parse + 1,"Useless use of (?-p)");
5978 *flagsp |= RXf_PMf_KEEPCOPY;
5982 if (flagsp == &negflags) {
5984 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5988 wastedflags = 0; /* reset so (?g-c) warns twice */
5994 RExC_flags |= posflags;
5995 RExC_flags &= ~negflags;
5997 oregflags |= posflags;
5998 oregflags &= ~negflags;
6000 nextchar(pRExC_state);
6011 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6016 }} /* one for the default block, one for the switch */
6023 ret = reganode(pRExC_state, OPEN, parno);
6026 RExC_nestroot = parno;
6027 if (RExC_seen & REG_SEEN_RECURSE
6028 && !RExC_open_parens[parno-1])
6030 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6031 "Setting open paren #%"IVdf" to %d\n",
6032 (IV)parno, REG_NODE_NUM(ret)));
6033 RExC_open_parens[parno-1]= ret;
6036 Set_Node_Length(ret, 1); /* MJD */
6037 Set_Node_Offset(ret, RExC_parse); /* MJD */
6045 /* Pick up the branches, linking them together. */
6046 parse_start = RExC_parse; /* MJD */
6047 br = regbranch(pRExC_state, &flags, 1,depth+1);
6048 /* branch_len = (paren != 0); */
6052 if (*RExC_parse == '|') {
6053 if (!SIZE_ONLY && RExC_extralen) {
6054 reginsert(pRExC_state, BRANCHJ, br, depth+1);
6057 reginsert(pRExC_state, BRANCH, br, depth+1);
6058 Set_Node_Length(br, paren != 0);
6059 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
6063 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
6065 else if (paren == ':') {
6066 *flagp |= flags&SIMPLE;
6068 if (is_open) { /* Starts with OPEN. */
6069 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
6071 else if (paren != '?') /* Not Conditional */
6073 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6075 while (*RExC_parse == '|') {
6076 if (!SIZE_ONLY && RExC_extralen) {
6077 ender = reganode(pRExC_state, LONGJMP,0);
6078 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
6081 RExC_extralen += 2; /* Account for LONGJMP. */
6082 nextchar(pRExC_state);
6084 if (RExC_npar > after_freeze)
6085 after_freeze = RExC_npar;
6086 RExC_npar = freeze_paren;
6088 br = regbranch(pRExC_state, &flags, 0, depth+1);
6092 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
6094 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6097 if (have_branch || paren != ':') {
6098 /* Make a closing node, and hook it on the end. */
6101 ender = reg_node(pRExC_state, TAIL);
6104 ender = reganode(pRExC_state, CLOSE, parno);
6105 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
6106 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6107 "Setting close paren #%"IVdf" to %d\n",
6108 (IV)parno, REG_NODE_NUM(ender)));
6109 RExC_close_parens[parno-1]= ender;
6110 if (RExC_nestroot == parno)
6113 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
6114 Set_Node_Length(ender,1); /* MJD */
6120 *flagp &= ~HASWIDTH;
6123 ender = reg_node(pRExC_state, SUCCEED);
6126 ender = reg_node(pRExC_state, END);
6128 assert(!RExC_opend); /* there can only be one! */
6133 REGTAIL(pRExC_state, lastbr, ender);
6135 if (have_branch && !SIZE_ONLY) {
6137 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6139 /* Hook the tails of the branches to the closing node. */
6140 for (br = ret; br; br = regnext(br)) {
6141 const U8 op = PL_regkind[OP(br)];
6143 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
6145 else if (op == BRANCHJ) {
6146 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
6154 static const char parens[] = "=!<,>";
6156 if (paren && (p = strchr(parens, paren))) {
6157 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
6158 int flag = (p - parens) > 1;
6161 node = SUSPEND, flag = 0;
6162 reginsert(pRExC_state, node,ret, depth+1);
6163 Set_Node_Cur_Length(ret);
6164 Set_Node_Offset(ret, parse_start + 1);
6166 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
6170 /* Check for proper termination. */
6172 RExC_flags = oregflags;
6173 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
6174 RExC_parse = oregcomp_parse;
6175 vFAIL("Unmatched (");
6178 else if (!paren && RExC_parse < RExC_end) {
6179 if (*RExC_parse == ')') {
6181 vFAIL("Unmatched )");
6184 FAIL("Junk on end of regexp"); /* "Can't happen". */
6188 RExC_npar = after_freeze;
6193 - regbranch - one alternative of an | operator
6195 * Implements the concatenation operator.
6198 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
6201 register regnode *ret;
6202 register regnode *chain = NULL;
6203 register regnode *latest;
6204 I32 flags = 0, c = 0;
6205 GET_RE_DEBUG_FLAGS_DECL;
6206 DEBUG_PARSE("brnc");
6211 if (!SIZE_ONLY && RExC_extralen)
6212 ret = reganode(pRExC_state, BRANCHJ,0);
6214 ret = reg_node(pRExC_state, BRANCH);
6215 Set_Node_Length(ret, 1);
6219 if (!first && SIZE_ONLY)
6220 RExC_extralen += 1; /* BRANCHJ */
6222 *flagp = WORST; /* Tentatively. */
6225 nextchar(pRExC_state);
6226 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
6228 latest = regpiece(pRExC_state, &flags,depth+1);
6229 if (latest == NULL) {
6230 if (flags & TRYAGAIN)
6234 else if (ret == NULL)
6236 *flagp |= flags&(HASWIDTH|POSTPONED);
6237 if (chain == NULL) /* First piece. */
6238 *flagp |= flags&SPSTART;
6241 REGTAIL(pRExC_state, chain, latest);
6246 if (chain == NULL) { /* Loop ran zero times. */
6247 chain = reg_node(pRExC_state, NOTHING);
6252 *flagp |= flags&SIMPLE;
6259 - regpiece - something followed by possible [*+?]
6261 * Note that the branching code sequences used for ? and the general cases
6262 * of * and + are somewhat optimized: they use the same NOTHING node as
6263 * both the endmarker for their branch list and the body of the last branch.
6264 * It might seem that this node could be dispensed with entirely, but the
6265 * endmarker role is not redundant.
6268 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6271 register regnode *ret;
6273 register char *next;
6275 const char * const origparse = RExC_parse;
6277 I32 max = REG_INFTY;
6279 const char *maxpos = NULL;
6280 GET_RE_DEBUG_FLAGS_DECL;
6281 DEBUG_PARSE("piec");
6283 ret = regatom(pRExC_state, &flags,depth+1);
6285 if (flags & TRYAGAIN)
6292 if (op == '{' && regcurly(RExC_parse)) {
6294 parse_start = RExC_parse; /* MJD */
6295 next = RExC_parse + 1;
6296 while (isDIGIT(*next) || *next == ',') {
6305 if (*next == '}') { /* got one */
6309 min = atoi(RExC_parse);
6313 maxpos = RExC_parse;
6315 if (!max && *maxpos != '0')
6316 max = REG_INFTY; /* meaning "infinity" */
6317 else if (max >= REG_INFTY)
6318 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
6320 nextchar(pRExC_state);
6323 if ((flags&SIMPLE)) {
6324 RExC_naughty += 2 + RExC_naughty / 2;
6325 reginsert(pRExC_state, CURLY, ret, depth+1);
6326 Set_Node_Offset(ret, parse_start+1); /* MJD */
6327 Set_Node_Cur_Length(ret);
6330 regnode * const w = reg_node(pRExC_state, WHILEM);
6333 REGTAIL(pRExC_state, ret, w);
6334 if (!SIZE_ONLY && RExC_extralen) {
6335 reginsert(pRExC_state, LONGJMP,ret, depth+1);
6336 reginsert(pRExC_state, NOTHING,ret, depth+1);
6337 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
6339 reginsert(pRExC_state, CURLYX,ret, depth+1);
6341 Set_Node_Offset(ret, parse_start+1);
6342 Set_Node_Length(ret,
6343 op == '{' ? (RExC_parse - parse_start) : 1);
6345 if (!SIZE_ONLY && RExC_extralen)
6346 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
6347 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
6349 RExC_whilem_seen++, RExC_extralen += 3;
6350 RExC_naughty += 4 + RExC_naughty; /* compound interest */
6358 if (max && max < min)
6359 vFAIL("Can't do {n,m} with n > m");
6361 ARG1_SET(ret, (U16)min);
6362 ARG2_SET(ret, (U16)max);
6374 #if 0 /* Now runtime fix should be reliable. */
6376 /* if this is reinstated, don't forget to put this back into perldiag:
6378 =item Regexp *+ operand could be empty at {#} in regex m/%s/
6380 (F) The part of the regexp subject to either the * or + quantifier
6381 could match an empty string. The {#} shows in the regular
6382 expression about where the problem was discovered.
6386 if (!(flags&HASWIDTH) && op != '?')
6387 vFAIL("Regexp *+ operand could be empty");
6390 parse_start = RExC_parse;
6391 nextchar(pRExC_state);
6393 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
6395 if (op == '*' && (flags&SIMPLE)) {
6396 reginsert(pRExC_state, STAR, ret, depth+1);
6400 else if (op == '*') {
6404 else if (op == '+' && (flags&SIMPLE)) {
6405 reginsert(pRExC_state, PLUS, ret, depth+1);
6409 else if (op == '+') {
6413 else if (op == '?') {
6418 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
6420 "%.*s matches null string many times",
6421 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
6425 if (RExC_parse < RExC_end && *RExC_parse == '?') {
6426 nextchar(pRExC_state);
6427 reginsert(pRExC_state, MINMOD, ret, depth+1);
6428 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
6430 #ifndef REG_ALLOW_MINMOD_SUSPEND
6433 if (RExC_parse < RExC_end && *RExC_parse == '+') {
6435 nextchar(pRExC_state);
6436 ender = reg_node(pRExC_state, SUCCEED);
6437 REGTAIL(pRExC_state, ret, ender);
6438 reginsert(pRExC_state, SUSPEND, ret, depth+1);
6440 ender = reg_node(pRExC_state, TAIL);
6441 REGTAIL(pRExC_state, ret, ender);
6445 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
6447 vFAIL("Nested quantifiers");
6454 /* reg_namedseq(pRExC_state,UVp)
6456 This is expected to be called by a parser routine that has
6457 recognized'\N' and needs to handle the rest. RExC_parse is
6458 expected to point at the first char following the N at the time
6461 If valuep is non-null then it is assumed that we are parsing inside
6462 of a charclass definition and the first codepoint in the resolved
6463 string is returned via *valuep and the routine will return NULL.
6464 In this mode if a multichar string is returned from the charnames
6465 handler a warning will be issued, and only the first char in the
6466 sequence will be examined. If the string returned is zero length
6467 then the value of *valuep is undefined and NON-NULL will
6468 be returned to indicate failure. (This will NOT be a valid pointer
6471 If value is null then it is assumed that we are parsing normal text
6472 and inserts a new EXACT node into the program containing the resolved
6473 string and returns a pointer to the new node. If the string is
6474 zerolength a NOTHING node is emitted.
6476 On success RExC_parse is set to the char following the endbrace.
6477 Parsing failures will generate a fatal errorvia vFAIL(...)
6479 NOTE: We cache all results from the charnames handler locally in
6480 the RExC_charnames hash (created on first use) to prevent a charnames
6481 handler from playing silly-buggers and returning a short string and
6482 then a long string for a given pattern. Since the regexp program
6483 size is calculated during an initial parse this would result
6484 in a buffer overrun so we cache to prevent the charname result from
6485 changing during the course of the parse.
6489 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep)
6491 char * name; /* start of the content of the name */
6492 char * endbrace; /* endbrace following the name */
6495 STRLEN len; /* this has various purposes throughout the code */
6496 bool cached = 0; /* if this is true then we shouldn't refcount dev sv_str */
6497 regnode *ret = NULL;
6499 if (*RExC_parse != '{') {
6500 vFAIL("Missing braces on \\N{}");
6502 name = RExC_parse+1;
6503 endbrace = strchr(RExC_parse, '}');
6506 vFAIL("Missing right brace on \\N{}");
6508 RExC_parse = endbrace + 1;
6511 /* RExC_parse points at the beginning brace,
6512 endbrace points at the last */
6513 if ( name[0]=='U' && name[1]=='+' ) {
6514 /* its a "Unicode hex" notation {U+89AB} */
6515 I32 fl = PERL_SCAN_ALLOW_UNDERSCORES
6516 | PERL_SCAN_DISALLOW_PREFIX
6517 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
6520 len = (STRLEN)(endbrace - name - 2);
6521 cp = grok_hex(name + 2, &len, &fl, NULL);
6522 if ( len != (STRLEN)(endbrace - name - 2) ) {
6532 sv_str= newSVpvn(&string, 1);
6534 /* fetch the charnames handler for this scope */
6535 HV * const table = GvHV(PL_hintgv);
6537 hv_fetchs(table, "charnames", FALSE) :
6539 SV *cv= cvp ? *cvp : NULL;
6542 /* create an SV with the name as argument */
6543 sv_name = newSVpvn(name, endbrace - name);
6545 if (!table || !(PL_hints & HINT_LOCALIZE_HH)) {
6546 vFAIL2("Constant(\\N{%s}) unknown: "
6547 "(possibly a missing \"use charnames ...\")",
6550 if (!cvp || !SvOK(*cvp)) { /* when $^H{charnames} = undef; */
6551 vFAIL2("Constant(\\N{%s}): "
6552 "$^H{charnames} is not defined",SvPVX(sv_name));
6557 if (!RExC_charnames) {
6558 /* make sure our cache is allocated */
6559 RExC_charnames = newHV();
6560 sv_2mortal((SV*)RExC_charnames);
6562 /* see if we have looked this one up before */
6563 he_str = hv_fetch_ent( RExC_charnames, sv_name, 0, 0 );
6565 sv_str = HeVAL(he_str);
6578 count= call_sv(cv, G_SCALAR);
6580 if (count == 1) { /* XXXX is this right? dmq */
6582 SvREFCNT_inc_simple_void(sv_str);
6590 if ( !sv_str || !SvOK(sv_str) ) {
6591 vFAIL2("Constant(\\N{%s}): Call to &{$^H{charnames}} "
6592 "did not return a defined value",SvPVX(sv_name));
6594 if (hv_store_ent( RExC_charnames, sv_name, sv_str, 0))
6599 char *p = SvPV(sv_str, len);
6602 if ( SvUTF8(sv_str) ) {
6603 *valuep = utf8_to_uvchr((U8*)p, &numlen);
6607 We have to turn on utf8 for high bit chars otherwise
6608 we get failures with
6610 "ss" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
6611 "SS" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
6613 This is different from what \x{} would do with the same
6614 codepoint, where the condition is > 0xFF.
6621 /* warn if we havent used the whole string? */
6623 if (numlen<len && SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6625 "Ignoring excess chars from \\N{%s} in character class",
6629 } else if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6631 "Ignoring zero length \\N{%s} in character class",
6636 SvREFCNT_dec(sv_name);
6638 SvREFCNT_dec(sv_str);
6639 return len ? NULL : (regnode *)&len;
6640 } else if(SvCUR(sv_str)) {
6646 char * parse_start = name-3; /* needed for the offsets */
6648 GET_RE_DEBUG_FLAGS_DECL; /* needed for the offsets */
6650 ret = reg_node(pRExC_state,
6651 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
6654 if ( RExC_utf8 && !SvUTF8(sv_str) ) {
6655 sv_utf8_upgrade(sv_str);
6656 } else if ( !RExC_utf8 && SvUTF8(sv_str) ) {
6660 p = SvPV(sv_str, len);
6662 /* len is the length written, charlen is the size the char read */
6663 for ( len = 0; p < pend; p += charlen ) {
6665 UV uvc = utf8_to_uvchr((U8*)p, &charlen);
6667 STRLEN foldlen,numlen;
6668 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
6669 uvc = toFOLD_uni(uvc, tmpbuf, &foldlen);
6670 /* Emit all the Unicode characters. */
6672 for (foldbuf = tmpbuf;
6676 uvc = utf8_to_uvchr(foldbuf, &numlen);
6678 const STRLEN unilen = reguni(pRExC_state, uvc, s);
6681 /* In EBCDIC the numlen
6682 * and unilen can differ. */
6684 if (numlen >= foldlen)
6688 break; /* "Can't happen." */
6691 const STRLEN unilen = reguni(pRExC_state, uvc, s);
6703 RExC_size += STR_SZ(len);
6706 RExC_emit += STR_SZ(len);
6708 Set_Node_Cur_Length(ret); /* MJD */
6710 nextchar(pRExC_state);
6712 ret = reg_node(pRExC_state,NOTHING);
6715 SvREFCNT_dec(sv_str);
6718 SvREFCNT_dec(sv_name);
6728 * It returns the code point in utf8 for the value in *encp.
6729 * value: a code value in the source encoding
6730 * encp: a pointer to an Encode object
6732 * If the result from Encode is not a single character,
6733 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
6736 S_reg_recode(pTHX_ const char value, SV **encp)
6739 SV * const sv = sv_2mortal(newSVpvn(&value, numlen));
6740 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
6741 const STRLEN newlen = SvCUR(sv);
6742 UV uv = UNICODE_REPLACEMENT;
6746 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
6749 if (!newlen || numlen != newlen) {
6750 uv = UNICODE_REPLACEMENT;
6758 - regatom - the lowest level
6760 Try to identify anything special at the start of the pattern. If there
6761 is, then handle it as required. This may involve generating a single regop,
6762 such as for an assertion; or it may involve recursing, such as to
6763 handle a () structure.
6765 If the string doesn't start with something special then we gobble up
6766 as much literal text as we can.
6768 Once we have been able to handle whatever type of thing started the
6769 sequence, we return.
6771 Note: we have to be careful with escapes, as they can be both literal
6772 and special, and in the case of \10 and friends can either, depending
6773 on context. Specifically there are two seperate switches for handling
6774 escape sequences, with the one for handling literal escapes requiring
6775 a dummy entry for all of the special escapes that are actually handled
6780 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6783 register regnode *ret = NULL;
6785 char *parse_start = RExC_parse;
6786 GET_RE_DEBUG_FLAGS_DECL;
6787 DEBUG_PARSE("atom");
6788 *flagp = WORST; /* Tentatively. */
6792 switch ((U8)*RExC_parse) {
6794 RExC_seen_zerolen++;
6795 nextchar(pRExC_state);
6796 if (RExC_flags & RXf_PMf_MULTILINE)
6797 ret = reg_node(pRExC_state, MBOL);
6798 else if (RExC_flags & RXf_PMf_SINGLELINE)
6799 ret = reg_node(pRExC_state, SBOL);
6801 ret = reg_node(pRExC_state, BOL);
6802 Set_Node_Length(ret, 1); /* MJD */
6805 nextchar(pRExC_state);
6807 RExC_seen_zerolen++;
6808 if (RExC_flags & RXf_PMf_MULTILINE)
6809 ret = reg_node(pRExC_state, MEOL);
6810 else if (RExC_flags & RXf_PMf_SINGLELINE)
6811 ret = reg_node(pRExC_state, SEOL);
6813 ret = reg_node(pRExC_state, EOL);
6814 Set_Node_Length(ret, 1); /* MJD */
6817 nextchar(pRExC_state);
6818 if (RExC_flags & RXf_PMf_SINGLELINE)
6819 ret = reg_node(pRExC_state, SANY);
6821 ret = reg_node(pRExC_state, REG_ANY);
6822 *flagp |= HASWIDTH|SIMPLE;
6824 Set_Node_Length(ret, 1); /* MJD */
6828 char * const oregcomp_parse = ++RExC_parse;
6829 ret = regclass(pRExC_state,depth+1);
6830 if (*RExC_parse != ']') {
6831 RExC_parse = oregcomp_parse;
6832 vFAIL("Unmatched [");
6834 nextchar(pRExC_state);
6835 *flagp |= HASWIDTH|SIMPLE;
6836 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
6840 nextchar(pRExC_state);
6841 ret = reg(pRExC_state, 1, &flags,depth+1);
6843 if (flags & TRYAGAIN) {
6844 if (RExC_parse == RExC_end) {
6845 /* Make parent create an empty node if needed. */
6853 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
6857 if (flags & TRYAGAIN) {
6861 vFAIL("Internal urp");
6862 /* Supposed to be caught earlier. */
6865 if (!regcurly(RExC_parse)) {
6874 vFAIL("Quantifier follows nothing");
6882 len=0; /* silence a spurious compiler warning */
6883 if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) {
6884 *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */
6885 RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */
6886 ret = reganode(pRExC_state, FOLDCHAR, cp);
6887 Set_Node_Length(ret, 1); /* MJD */
6888 nextchar(pRExC_state); /* kill whitespace under /x */
6896 This switch handles escape sequences that resolve to some kind
6897 of special regop and not to literal text. Escape sequnces that
6898 resolve to literal text are handled below in the switch marked
6901 Every entry in this switch *must* have a corresponding entry
6902 in the literal escape switch. However, the opposite is not
6903 required, as the default for this switch is to jump to the
6904 literal text handling code.
6906 switch ((U8)*++RExC_parse) {
6911 /* Special Escapes */
6913 RExC_seen_zerolen++;
6914 ret = reg_node(pRExC_state, SBOL);
6916 goto finish_meta_pat;
6918 ret = reg_node(pRExC_state, GPOS);
6919 RExC_seen |= REG_SEEN_GPOS;
6921 goto finish_meta_pat;
6923 RExC_seen_zerolen++;
6924 ret = reg_node(pRExC_state, KEEPS);
6926 /* XXX:dmq : disabling in-place substitution seems to
6927 * be necessary here to avoid cases of memory corruption, as
6928 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
6930 RExC_seen |= REG_SEEN_LOOKBEHIND;
6931 goto finish_meta_pat;
6933 ret = reg_node(pRExC_state, SEOL);
6935 RExC_seen_zerolen++; /* Do not optimize RE away */
6936 goto finish_meta_pat;
6938 ret = reg_node(pRExC_state, EOS);
6940 RExC_seen_zerolen++; /* Do not optimize RE away */
6941 goto finish_meta_pat;
6943 ret = reg_node(pRExC_state, CANY);
6944 RExC_seen |= REG_SEEN_CANY;
6945 *flagp |= HASWIDTH|SIMPLE;
6946 goto finish_meta_pat;
6948 ret = reg_node(pRExC_state, CLUMP);
6950 goto finish_meta_pat;
6952 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
6953 *flagp |= HASWIDTH|SIMPLE;
6954 goto finish_meta_pat;
6956 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
6957 *flagp |= HASWIDTH|SIMPLE;
6958 goto finish_meta_pat;
6960 RExC_seen_zerolen++;
6961 RExC_seen |= REG_SEEN_LOOKBEHIND;
6962 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
6964 goto finish_meta_pat;
6966 RExC_seen_zerolen++;
6967 RExC_seen |= REG_SEEN_LOOKBEHIND;
6968 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
6970 goto finish_meta_pat;
6972 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
6973 *flagp |= HASWIDTH|SIMPLE;
6974 goto finish_meta_pat;
6976 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
6977 *flagp |= HASWIDTH|SIMPLE;
6978 goto finish_meta_pat;
6980 ret = reg_node(pRExC_state, DIGIT);
6981 *flagp |= HASWIDTH|SIMPLE;
6982 goto finish_meta_pat;
6984 ret = reg_node(pRExC_state, NDIGIT);
6985 *flagp |= HASWIDTH|SIMPLE;
6986 goto finish_meta_pat;
6988 ret = reg_node(pRExC_state, LNBREAK);
6989 *flagp |= HASWIDTH|SIMPLE;
6990 goto finish_meta_pat;
6992 ret = reg_node(pRExC_state, HORIZWS);
6993 *flagp |= HASWIDTH|SIMPLE;
6994 goto finish_meta_pat;
6996 ret = reg_node(pRExC_state, NHORIZWS);
6997 *flagp |= HASWIDTH|SIMPLE;
6998 goto finish_meta_pat;
7000 ret = reg_node(pRExC_state, VERTWS);
7001 *flagp |= HASWIDTH|SIMPLE;
7002 goto finish_meta_pat;
7004 ret = reg_node(pRExC_state, NVERTWS);
7005 *flagp |= HASWIDTH|SIMPLE;
7007 nextchar(pRExC_state);
7008 Set_Node_Length(ret, 2); /* MJD */
7013 char* const oldregxend = RExC_end;
7015 char* parse_start = RExC_parse - 2;
7018 if (RExC_parse[1] == '{') {
7019 /* a lovely hack--pretend we saw [\pX] instead */
7020 RExC_end = strchr(RExC_parse, '}');
7022 const U8 c = (U8)*RExC_parse;
7024 RExC_end = oldregxend;
7025 vFAIL2("Missing right brace on \\%c{}", c);
7030 RExC_end = RExC_parse + 2;
7031 if (RExC_end > oldregxend)
7032 RExC_end = oldregxend;
7036 ret = regclass(pRExC_state,depth+1);
7038 RExC_end = oldregxend;
7041 Set_Node_Offset(ret, parse_start + 2);
7042 Set_Node_Cur_Length(ret);
7043 nextchar(pRExC_state);
7044 *flagp |= HASWIDTH|SIMPLE;
7048 /* Handle \N{NAME} here and not below because it can be
7049 multicharacter. join_exact() will join them up later on.
7050 Also this makes sure that things like /\N{BLAH}+/ and
7051 \N{BLAH} being multi char Just Happen. dmq*/
7053 ret= reg_namedseq(pRExC_state, NULL);
7055 case 'k': /* Handle \k<NAME> and \k'NAME' */
7058 char ch= RExC_parse[1];
7059 if (ch != '<' && ch != '\'' && ch != '{') {
7061 vFAIL2("Sequence %.2s... not terminated",parse_start);
7063 /* this pretty much dupes the code for (?P=...) in reg(), if
7064 you change this make sure you change that */
7065 char* name_start = (RExC_parse += 2);
7067 SV *sv_dat = reg_scan_name(pRExC_state,
7068 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7069 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
7070 if (RExC_parse == name_start || *RExC_parse != ch)
7071 vFAIL2("Sequence %.3s... not terminated",parse_start);
7074 num = add_data( pRExC_state, 1, "S" );
7075 RExC_rxi->data->data[num]=(void*)sv_dat;
7076 SvREFCNT_inc_simple_void(sv_dat);
7080 ret = reganode(pRExC_state,
7081 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
7085 /* override incorrect value set in reganode MJD */
7086 Set_Node_Offset(ret, parse_start+1);
7087 Set_Node_Cur_Length(ret); /* MJD */
7088 nextchar(pRExC_state);
7094 case '1': case '2': case '3': case '4':
7095 case '5': case '6': case '7': case '8': case '9':
7098 bool isg = *RExC_parse == 'g';
7103 if (*RExC_parse == '{') {
7107 if (*RExC_parse == '-') {
7111 if (hasbrace && !isDIGIT(*RExC_parse)) {
7112 if (isrel) RExC_parse--;
7114 goto parse_named_seq;
7116 num = atoi(RExC_parse);
7117 if (isg && num == 0)
7118 vFAIL("Reference to invalid group 0");
7120 num = RExC_npar - num;
7122 vFAIL("Reference to nonexistent or unclosed group");
7124 if (!isg && num > 9 && num >= RExC_npar)
7127 char * const parse_start = RExC_parse - 1; /* MJD */
7128 while (isDIGIT(*RExC_parse))
7130 if (parse_start == RExC_parse - 1)
7131 vFAIL("Unterminated \\g... pattern");
7133 if (*RExC_parse != '}')
7134 vFAIL("Unterminated \\g{...} pattern");
7138 if (num > (I32)RExC_rx->nparens)
7139 vFAIL("Reference to nonexistent group");
7142 ret = reganode(pRExC_state,
7143 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
7147 /* override incorrect value set in reganode MJD */
7148 Set_Node_Offset(ret, parse_start+1);
7149 Set_Node_Cur_Length(ret); /* MJD */
7151 nextchar(pRExC_state);
7156 if (RExC_parse >= RExC_end)
7157 FAIL("Trailing \\");
7160 /* Do not generate "unrecognized" warnings here, we fall
7161 back into the quick-grab loop below */
7168 if (RExC_flags & RXf_PMf_EXTENDED) {
7169 if ( reg_skipcomment( pRExC_state ) )
7176 register STRLEN len;
7181 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7183 parse_start = RExC_parse - 1;
7189 ret = reg_node(pRExC_state,
7190 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
7192 for (len = 0, p = RExC_parse - 1;
7193 len < 127 && p < RExC_end;
7196 char * const oldp = p;
7198 if (RExC_flags & RXf_PMf_EXTENDED)
7199 p = regwhite( pRExC_state, p );
7204 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7205 goto normal_default;
7215 /* Literal Escapes Switch
7217 This switch is meant to handle escape sequences that
7218 resolve to a literal character.
7220 Every escape sequence that represents something
7221 else, like an assertion or a char class, is handled
7222 in the switch marked 'Special Escapes' above in this
7223 routine, but also has an entry here as anything that
7224 isn't explicitly mentioned here will be treated as
7225 an unescaped equivalent literal.
7229 /* These are all the special escapes. */
7233 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7234 goto normal_default;
7235 case 'A': /* Start assertion */
7236 case 'b': case 'B': /* Word-boundary assertion*/
7237 case 'C': /* Single char !DANGEROUS! */
7238 case 'd': case 'D': /* digit class */
7239 case 'g': case 'G': /* generic-backref, pos assertion */
7240 case 'h': case 'H': /* HORIZWS */
7241 case 'k': case 'K': /* named backref, keep marker */
7242 case 'N': /* named char sequence */
7243 case 'p': case 'P': /* Unicode property */
7244 case 'R': /* LNBREAK */
7245 case 's': case 'S': /* space class */
7246 case 'v': case 'V': /* VERTWS */
7247 case 'w': case 'W': /* word class */
7248 case 'X': /* eXtended Unicode "combining character sequence" */
7249 case 'z': case 'Z': /* End of line/string assertion */
7253 /* Anything after here is an escape that resolves to a
7254 literal. (Except digits, which may or may not)
7273 ender = ASCII_TO_NATIVE('\033');
7277 ender = ASCII_TO_NATIVE('\007');
7282 char* const e = strchr(p, '}');
7286 vFAIL("Missing right brace on \\x{}");
7289 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7290 | PERL_SCAN_DISALLOW_PREFIX;
7291 STRLEN numlen = e - p - 1;
7292 ender = grok_hex(p + 1, &numlen, &flags, NULL);
7299 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
7301 ender = grok_hex(p, &numlen, &flags, NULL);
7304 if (PL_encoding && ender < 0x100)
7305 goto recode_encoding;
7309 ender = UCHARAT(p++);
7310 ender = toCTRL(ender);
7312 case '0': case '1': case '2': case '3':case '4':
7313 case '5': case '6': case '7': case '8':case '9':
7315 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
7318 ender = grok_oct(p, &numlen, &flags, NULL);
7325 if (PL_encoding && ender < 0x100)
7326 goto recode_encoding;
7330 SV* enc = PL_encoding;
7331 ender = reg_recode((const char)(U8)ender, &enc);
7332 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
7333 vWARN(p, "Invalid escape in the specified encoding");
7339 FAIL("Trailing \\");
7342 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
7343 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
7344 goto normal_default;
7349 if (UTF8_IS_START(*p) && UTF) {
7351 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
7352 &numlen, UTF8_ALLOW_DEFAULT);
7359 if ( RExC_flags & RXf_PMf_EXTENDED)
7360 p = regwhite( pRExC_state, p );
7362 /* Prime the casefolded buffer. */
7363 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
7365 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
7370 /* Emit all the Unicode characters. */
7372 for (foldbuf = tmpbuf;
7374 foldlen -= numlen) {
7375 ender = utf8_to_uvchr(foldbuf, &numlen);
7377 const STRLEN unilen = reguni(pRExC_state, ender, s);
7380 /* In EBCDIC the numlen
7381 * and unilen can differ. */
7383 if (numlen >= foldlen)
7387 break; /* "Can't happen." */
7391 const STRLEN unilen = reguni(pRExC_state, ender, s);
7400 REGC((char)ender, s++);
7406 /* Emit all the Unicode characters. */
7408 for (foldbuf = tmpbuf;
7410 foldlen -= numlen) {
7411 ender = utf8_to_uvchr(foldbuf, &numlen);
7413 const STRLEN unilen = reguni(pRExC_state, ender, s);
7416 /* In EBCDIC the numlen
7417 * and unilen can differ. */
7419 if (numlen >= foldlen)
7427 const STRLEN unilen = reguni(pRExC_state, ender, s);
7436 REGC((char)ender, s++);
7440 Set_Node_Cur_Length(ret); /* MJD */
7441 nextchar(pRExC_state);
7443 /* len is STRLEN which is unsigned, need to copy to signed */
7446 vFAIL("Internal disaster");
7450 if (len == 1 && UNI_IS_INVARIANT(ender))
7454 RExC_size += STR_SZ(len);
7457 RExC_emit += STR_SZ(len);
7467 S_regwhite( RExC_state_t *pRExC_state, char *p )
7469 const char *e = RExC_end;
7473 else if (*p == '#') {
7482 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
7490 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
7491 Character classes ([:foo:]) can also be negated ([:^foo:]).
7492 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
7493 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
7494 but trigger failures because they are currently unimplemented. */
7496 #define POSIXCC_DONE(c) ((c) == ':')
7497 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
7498 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
7501 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
7504 I32 namedclass = OOB_NAMEDCLASS;
7506 if (value == '[' && RExC_parse + 1 < RExC_end &&
7507 /* I smell either [: or [= or [. -- POSIX has been here, right? */
7508 POSIXCC(UCHARAT(RExC_parse))) {
7509 const char c = UCHARAT(RExC_parse);
7510 char* const s = RExC_parse++;
7512 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
7514 if (RExC_parse == RExC_end)
7515 /* Grandfather lone [:, [=, [. */
7518 const char* const t = RExC_parse++; /* skip over the c */
7521 if (UCHARAT(RExC_parse) == ']') {
7522 const char *posixcc = s + 1;
7523 RExC_parse++; /* skip over the ending ] */
7526 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
7527 const I32 skip = t - posixcc;
7529 /* Initially switch on the length of the name. */
7532 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
7533 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
7536 /* Names all of length 5. */
7537 /* alnum alpha ascii blank cntrl digit graph lower
7538 print punct space upper */
7539 /* Offset 4 gives the best switch position. */
7540 switch (posixcc[4]) {
7542 if (memEQ(posixcc, "alph", 4)) /* alpha */
7543 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
7546 if (memEQ(posixcc, "spac", 4)) /* space */
7547 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
7550 if (memEQ(posixcc, "grap", 4)) /* graph */
7551 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
7554 if (memEQ(posixcc, "asci", 4)) /* ascii */
7555 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
7558 if (memEQ(posixcc, "blan", 4)) /* blank */
7559 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
7562 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
7563 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
7566 if (memEQ(posixcc, "alnu", 4)) /* alnum */
7567 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
7570 if (memEQ(posixcc, "lowe", 4)) /* lower */
7571 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
7572 else if (memEQ(posixcc, "uppe", 4)) /* upper */
7573 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
7576 if (memEQ(posixcc, "digi", 4)) /* digit */
7577 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
7578 else if (memEQ(posixcc, "prin", 4)) /* print */
7579 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
7580 else if (memEQ(posixcc, "punc", 4)) /* punct */
7581 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
7586 if (memEQ(posixcc, "xdigit", 6))
7587 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
7591 if (namedclass == OOB_NAMEDCLASS)
7592 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
7594 assert (posixcc[skip] == ':');
7595 assert (posixcc[skip+1] == ']');
7596 } else if (!SIZE_ONLY) {
7597 /* [[=foo=]] and [[.foo.]] are still future. */
7599 /* adjust RExC_parse so the warning shows after
7601 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
7603 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7606 /* Maternal grandfather:
7607 * "[:" ending in ":" but not in ":]" */
7617 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
7620 if (POSIXCC(UCHARAT(RExC_parse))) {
7621 const char *s = RExC_parse;
7622 const char c = *s++;
7626 if (*s && c == *s && s[1] == ']') {
7627 if (ckWARN(WARN_REGEXP))
7629 "POSIX syntax [%c %c] belongs inside character classes",
7632 /* [[=foo=]] and [[.foo.]] are still future. */
7633 if (POSIXCC_NOTYET(c)) {
7634 /* adjust RExC_parse so the error shows after
7636 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
7638 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7645 #define _C_C_T_(NAME,TEST,WORD) \
7648 ANYOF_CLASS_SET(ret, ANYOF_##NAME); \
7650 for (value = 0; value < 256; value++) \
7652 ANYOF_BITMAP_SET(ret, value); \
7657 case ANYOF_N##NAME: \
7659 ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \
7661 for (value = 0; value < 256; value++) \
7663 ANYOF_BITMAP_SET(ret, value); \
7669 #define _C_C_T_NOLOC_(NAME,TEST,WORD) \
7671 for (value = 0; value < 256; value++) \
7673 ANYOF_BITMAP_SET(ret, value); \
7677 case ANYOF_N##NAME: \
7678 for (value = 0; value < 256; value++) \
7680 ANYOF_BITMAP_SET(ret, value); \
7686 parse a class specification and produce either an ANYOF node that
7687 matches the pattern or if the pattern matches a single char only and
7688 that char is < 256 and we are case insensitive then we produce an
7693 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
7696 register UV nextvalue;
7697 register IV prevvalue = OOB_UNICODE;
7698 register IV range = 0;
7699 UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
7700 register regnode *ret;
7703 char *rangebegin = NULL;
7704 bool need_class = 0;
7707 bool optimize_invert = TRUE;
7708 AV* unicode_alternate = NULL;
7710 UV literal_endpoint = 0;
7712 UV stored = 0; /* number of chars stored in the class */
7714 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7715 case we need to change the emitted regop to an EXACT. */
7716 const char * orig_parse = RExC_parse;
7717 GET_RE_DEBUG_FLAGS_DECL;
7719 PERL_UNUSED_ARG(depth);
7722 DEBUG_PARSE("clas");
7724 /* Assume we are going to generate an ANYOF node. */
7725 ret = reganode(pRExC_state, ANYOF, 0);
7728 ANYOF_FLAGS(ret) = 0;
7730 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
7734 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
7738 RExC_size += ANYOF_SKIP;
7739 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
7742 RExC_emit += ANYOF_SKIP;
7744 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
7746 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
7747 ANYOF_BITMAP_ZERO(ret);
7748 listsv = newSVpvs("# comment\n");
7751 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
7753 if (!SIZE_ONLY && POSIXCC(nextvalue))
7754 checkposixcc(pRExC_state);
7756 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
7757 if (UCHARAT(RExC_parse) == ']')
7761 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
7765 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
7768 rangebegin = RExC_parse;
7770 value = utf8n_to_uvchr((U8*)RExC_parse,
7771 RExC_end - RExC_parse,
7772 &numlen, UTF8_ALLOW_DEFAULT);
7773 RExC_parse += numlen;
7776 value = UCHARAT(RExC_parse++);
7778 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
7779 if (value == '[' && POSIXCC(nextvalue))
7780 namedclass = regpposixcc(pRExC_state, value);
7781 else if (value == '\\') {
7783 value = utf8n_to_uvchr((U8*)RExC_parse,
7784 RExC_end - RExC_parse,
7785 &numlen, UTF8_ALLOW_DEFAULT);
7786 RExC_parse += numlen;
7789 value = UCHARAT(RExC_parse++);
7790 /* Some compilers cannot handle switching on 64-bit integer
7791 * values, therefore value cannot be an UV. Yes, this will
7792 * be a problem later if we want switch on Unicode.
7793 * A similar issue a little bit later when switching on
7794 * namedclass. --jhi */
7795 switch ((I32)value) {
7796 case 'w': namedclass = ANYOF_ALNUM; break;
7797 case 'W': namedclass = ANYOF_NALNUM; break;
7798 case 's': namedclass = ANYOF_SPACE; break;
7799 case 'S': namedclass = ANYOF_NSPACE; break;
7800 case 'd': namedclass = ANYOF_DIGIT; break;
7801 case 'D': namedclass = ANYOF_NDIGIT; break;
7802 case 'v': namedclass = ANYOF_VERTWS; break;
7803 case 'V': namedclass = ANYOF_NVERTWS; break;
7804 case 'h': namedclass = ANYOF_HORIZWS; break;
7805 case 'H': namedclass = ANYOF_NHORIZWS; break;
7806 case 'N': /* Handle \N{NAME} in class */
7808 /* We only pay attention to the first char of
7809 multichar strings being returned. I kinda wonder
7810 if this makes sense as it does change the behaviour
7811 from earlier versions, OTOH that behaviour was broken
7813 UV v; /* value is register so we cant & it /grrr */
7814 if (reg_namedseq(pRExC_state, &v)) {
7824 if (RExC_parse >= RExC_end)
7825 vFAIL2("Empty \\%c{}", (U8)value);
7826 if (*RExC_parse == '{') {
7827 const U8 c = (U8)value;
7828 e = strchr(RExC_parse++, '}');
7830 vFAIL2("Missing right brace on \\%c{}", c);
7831 while (isSPACE(UCHARAT(RExC_parse)))
7833 if (e == RExC_parse)
7834 vFAIL2("Empty \\%c{}", c);
7836 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
7844 if (UCHARAT(RExC_parse) == '^') {
7847 value = value == 'p' ? 'P' : 'p'; /* toggle */
7848 while (isSPACE(UCHARAT(RExC_parse))) {
7853 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
7854 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
7857 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
7858 namedclass = ANYOF_MAX; /* no official name, but it's named */
7861 case 'n': value = '\n'; break;
7862 case 'r': value = '\r'; break;
7863 case 't': value = '\t'; break;
7864 case 'f': value = '\f'; break;
7865 case 'b': value = '\b'; break;
7866 case 'e': value = ASCII_TO_NATIVE('\033');break;
7867 case 'a': value = ASCII_TO_NATIVE('\007');break;
7869 if (*RExC_parse == '{') {
7870 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7871 | PERL_SCAN_DISALLOW_PREFIX;
7872 char * const e = strchr(RExC_parse++, '}');
7874 vFAIL("Missing right brace on \\x{}");
7876 numlen = e - RExC_parse;
7877 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
7881 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
7883 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
7884 RExC_parse += numlen;
7886 if (PL_encoding && value < 0x100)
7887 goto recode_encoding;
7890 value = UCHARAT(RExC_parse++);
7891 value = toCTRL(value);
7893 case '0': case '1': case '2': case '3': case '4':
7894 case '5': case '6': case '7': case '8': case '9':
7898 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
7899 RExC_parse += numlen;
7900 if (PL_encoding && value < 0x100)
7901 goto recode_encoding;
7906 SV* enc = PL_encoding;
7907 value = reg_recode((const char)(U8)value, &enc);
7908 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
7910 "Invalid escape in the specified encoding");
7914 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
7916 "Unrecognized escape \\%c in character class passed through",
7920 } /* end of \blah */
7926 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
7928 if (!SIZE_ONLY && !need_class)
7929 ANYOF_CLASS_ZERO(ret);
7933 /* a bad range like a-\d, a-[:digit:] ? */
7936 if (ckWARN(WARN_REGEXP)) {
7938 RExC_parse >= rangebegin ?
7939 RExC_parse - rangebegin : 0;
7941 "False [] range \"%*.*s\"",
7944 if (prevvalue < 256) {
7945 ANYOF_BITMAP_SET(ret, prevvalue);
7946 ANYOF_BITMAP_SET(ret, '-');
7949 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
7950 Perl_sv_catpvf(aTHX_ listsv,
7951 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
7955 range = 0; /* this was not a true range */
7961 const char *what = NULL;
7964 if (namedclass > OOB_NAMEDCLASS)
7965 optimize_invert = FALSE;
7966 /* Possible truncation here but in some 64-bit environments
7967 * the compiler gets heartburn about switch on 64-bit values.
7968 * A similar issue a little earlier when switching on value.
7970 switch ((I32)namedclass) {
7971 case _C_C_T_(ALNUM, isALNUM(value), "Word");
7972 case _C_C_T_(ALNUMC, isALNUMC(value), "Alnum");
7973 case _C_C_T_(ALPHA, isALPHA(value), "Alpha");
7974 case _C_C_T_(BLANK, isBLANK(value), "Blank");
7975 case _C_C_T_(CNTRL, isCNTRL(value), "Cntrl");
7976 case _C_C_T_(GRAPH, isGRAPH(value), "Graph");
7977 case _C_C_T_(LOWER, isLOWER(value), "Lower");
7978 case _C_C_T_(PRINT, isPRINT(value), "Print");
7979 case _C_C_T_(PSXSPC, isPSXSPC(value), "Space");
7980 case _C_C_T_(PUNCT, isPUNCT(value), "Punct");
7981 case _C_C_T_(SPACE, isSPACE(value), "SpacePerl");
7982 case _C_C_T_(UPPER, isUPPER(value), "Upper");
7983 case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit");
7984 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
7985 case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
7988 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
7991 for (value = 0; value < 128; value++)
7992 ANYOF_BITMAP_SET(ret, value);
7994 for (value = 0; value < 256; value++) {
7996 ANYOF_BITMAP_SET(ret, value);
8005 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
8008 for (value = 128; value < 256; value++)
8009 ANYOF_BITMAP_SET(ret, value);
8011 for (value = 0; value < 256; value++) {
8012 if (!isASCII(value))
8013 ANYOF_BITMAP_SET(ret, value);
8022 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
8024 /* consecutive digits assumed */
8025 for (value = '0'; value <= '9'; value++)
8026 ANYOF_BITMAP_SET(ret, value);
8033 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
8035 /* consecutive digits assumed */
8036 for (value = 0; value < '0'; value++)
8037 ANYOF_BITMAP_SET(ret, value);
8038 for (value = '9' + 1; value < 256; value++)
8039 ANYOF_BITMAP_SET(ret, value);
8045 /* this is to handle \p and \P */
8048 vFAIL("Invalid [::] class");
8052 /* Strings such as "+utf8::isWord\n" */
8053 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
8056 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
8059 } /* end of namedclass \blah */
8062 if (prevvalue > (IV)value) /* b-a */ {
8063 const int w = RExC_parse - rangebegin;
8064 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
8065 range = 0; /* not a valid range */
8069 prevvalue = value; /* save the beginning of the range */
8070 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
8071 RExC_parse[1] != ']') {
8074 /* a bad range like \w-, [:word:]- ? */
8075 if (namedclass > OOB_NAMEDCLASS) {
8076 if (ckWARN(WARN_REGEXP)) {
8078 RExC_parse >= rangebegin ?
8079 RExC_parse - rangebegin : 0;
8081 "False [] range \"%*.*s\"",
8085 ANYOF_BITMAP_SET(ret, '-');
8087 range = 1; /* yeah, it's a range! */
8088 continue; /* but do it the next time */
8092 /* now is the next time */
8093 /*stored += (value - prevvalue + 1);*/
8095 if (prevvalue < 256) {
8096 const IV ceilvalue = value < 256 ? value : 255;
8099 /* In EBCDIC [\x89-\x91] should include
8100 * the \x8e but [i-j] should not. */
8101 if (literal_endpoint == 2 &&
8102 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
8103 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
8105 if (isLOWER(prevvalue)) {
8106 for (i = prevvalue; i <= ceilvalue; i++)
8107 if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8109 ANYOF_BITMAP_SET(ret, i);
8112 for (i = prevvalue; i <= ceilvalue; i++)
8113 if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8115 ANYOF_BITMAP_SET(ret, i);
8121 for (i = prevvalue; i <= ceilvalue; i++) {
8122 if (!ANYOF_BITMAP_TEST(ret,i)) {
8124 ANYOF_BITMAP_SET(ret, i);
8128 if (value > 255 || UTF) {
8129 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
8130 const UV natvalue = NATIVE_TO_UNI(value);
8131 stored+=2; /* can't optimize this class */
8132 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8133 if (prevnatvalue < natvalue) { /* what about > ? */
8134 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
8135 prevnatvalue, natvalue);
8137 else if (prevnatvalue == natvalue) {
8138 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
8140 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
8142 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
8144 #ifdef EBCDIC /* RD t/uni/fold ff and 6b */
8145 if (RExC_precomp[0] == ':' &&
8146 RExC_precomp[1] == '[' &&
8147 (f == 0xDF || f == 0x92)) {
8148 f = NATIVE_TO_UNI(f);
8151 /* If folding and foldable and a single
8152 * character, insert also the folded version
8153 * to the charclass. */
8155 #ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
8156 if ((RExC_precomp[0] == ':' &&
8157 RExC_precomp[1] == '[' &&
8159 (value == 0xFB05 || value == 0xFB06))) ?
8160 foldlen == ((STRLEN)UNISKIP(f) - 1) :
8161 foldlen == (STRLEN)UNISKIP(f) )
8163 if (foldlen == (STRLEN)UNISKIP(f))
8165 Perl_sv_catpvf(aTHX_ listsv,
8168 /* Any multicharacter foldings
8169 * require the following transform:
8170 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
8171 * where E folds into "pq" and F folds
8172 * into "rst", all other characters
8173 * fold to single characters. We save
8174 * away these multicharacter foldings,
8175 * to be later saved as part of the
8176 * additional "s" data. */
8179 if (!unicode_alternate)
8180 unicode_alternate = newAV();
8181 sv = newSVpvn((char*)foldbuf, foldlen);
8183 av_push(unicode_alternate, sv);
8187 /* If folding and the value is one of the Greek
8188 * sigmas insert a few more sigmas to make the
8189 * folding rules of the sigmas to work right.
8190 * Note that not all the possible combinations
8191 * are handled here: some of them are handled
8192 * by the standard folding rules, and some of
8193 * them (literal or EXACTF cases) are handled
8194 * during runtime in regexec.c:S_find_byclass(). */
8195 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
8196 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8197 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
8198 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8199 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8201 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
8202 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8203 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8208 literal_endpoint = 0;
8212 range = 0; /* this range (if it was one) is done now */
8216 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
8218 RExC_size += ANYOF_CLASS_ADD_SKIP;
8220 RExC_emit += ANYOF_CLASS_ADD_SKIP;
8226 /****** !SIZE_ONLY AFTER HERE *********/
8228 if( stored == 1 && (value < 128 || (value < 256 && !UTF))
8229 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
8231 /* optimize single char class to an EXACT node
8232 but *only* when its not a UTF/high char */
8233 const char * cur_parse= RExC_parse;
8234 RExC_emit = (regnode *)orig_emit;
8235 RExC_parse = (char *)orig_parse;
8236 ret = reg_node(pRExC_state,
8237 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
8238 RExC_parse = (char *)cur_parse;
8239 *STRING(ret)= (char)value;
8241 RExC_emit += STR_SZ(1);
8244 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
8245 if ( /* If the only flag is folding (plus possibly inversion). */
8246 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
8248 for (value = 0; value < 256; ++value) {
8249 if (ANYOF_BITMAP_TEST(ret, value)) {
8250 UV fold = PL_fold[value];
8253 ANYOF_BITMAP_SET(ret, fold);
8256 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
8259 /* optimize inverted simple patterns (e.g. [^a-z]) */
8260 if (optimize_invert &&
8261 /* If the only flag is inversion. */
8262 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
8263 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
8264 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
8265 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
8268 AV * const av = newAV();
8270 /* The 0th element stores the character class description
8271 * in its textual form: used later (regexec.c:Perl_regclass_swash())
8272 * to initialize the appropriate swash (which gets stored in
8273 * the 1st element), and also useful for dumping the regnode.
8274 * The 2nd element stores the multicharacter foldings,
8275 * used later (regexec.c:S_reginclass()). */
8276 av_store(av, 0, listsv);
8277 av_store(av, 1, NULL);
8278 av_store(av, 2, (SV*)unicode_alternate);
8279 rv = newRV_noinc((SV*)av);
8280 n = add_data(pRExC_state, 1, "s");
8281 RExC_rxi->data->data[n] = (void*)rv;
8289 /* reg_skipcomment()
8291 Absorbs an /x style # comments from the input stream.
8292 Returns true if there is more text remaining in the stream.
8293 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
8294 terminates the pattern without including a newline.
8296 Note its the callers responsibility to ensure that we are
8302 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
8305 while (RExC_parse < RExC_end)
8306 if (*RExC_parse++ == '\n') {
8311 /* we ran off the end of the pattern without ending
8312 the comment, so we have to add an \n when wrapping */
8313 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
8321 Advance that parse position, and optionall absorbs
8322 "whitespace" from the inputstream.
8324 Without /x "whitespace" means (?#...) style comments only,
8325 with /x this means (?#...) and # comments and whitespace proper.
8327 Returns the RExC_parse point from BEFORE the scan occurs.
8329 This is the /x friendly way of saying RExC_parse++.
8333 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
8335 char* const retval = RExC_parse++;
8338 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
8339 RExC_parse[2] == '#') {
8340 while (*RExC_parse != ')') {
8341 if (RExC_parse == RExC_end)
8342 FAIL("Sequence (?#... not terminated");
8348 if (RExC_flags & RXf_PMf_EXTENDED) {
8349 if (isSPACE(*RExC_parse)) {
8353 else if (*RExC_parse == '#') {
8354 if ( reg_skipcomment( pRExC_state ) )
8363 - reg_node - emit a node
8365 STATIC regnode * /* Location. */
8366 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
8369 register regnode *ptr;
8370 regnode * const ret = RExC_emit;
8371 GET_RE_DEBUG_FLAGS_DECL;
8374 SIZE_ALIGN(RExC_size);
8378 if (RExC_emit >= RExC_emit_bound)
8379 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8381 NODE_ALIGN_FILL(ret);
8383 FILL_ADVANCE_NODE(ptr, op);
8384 #ifdef RE_TRACK_PATTERN_OFFSETS
8385 if (RExC_offsets) { /* MJD */
8386 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
8387 "reg_node", __LINE__,
8389 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
8390 ? "Overwriting end of array!\n" : "OK",
8391 (UV)(RExC_emit - RExC_emit_start),
8392 (UV)(RExC_parse - RExC_start),
8393 (UV)RExC_offsets[0]));
8394 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
8402 - reganode - emit a node with an argument
8404 STATIC regnode * /* Location. */
8405 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
8408 register regnode *ptr;
8409 regnode * const ret = RExC_emit;
8410 GET_RE_DEBUG_FLAGS_DECL;
8413 SIZE_ALIGN(RExC_size);
8418 assert(2==regarglen[op]+1);
8420 Anything larger than this has to allocate the extra amount.
8421 If we changed this to be:
8423 RExC_size += (1 + regarglen[op]);
8425 then it wouldn't matter. Its not clear what side effect
8426 might come from that so its not done so far.
8431 if (RExC_emit >= RExC_emit_bound)
8432 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8434 NODE_ALIGN_FILL(ret);
8436 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
8437 #ifdef RE_TRACK_PATTERN_OFFSETS
8438 if (RExC_offsets) { /* MJD */
8439 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
8443 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
8444 "Overwriting end of array!\n" : "OK",
8445 (UV)(RExC_emit - RExC_emit_start),
8446 (UV)(RExC_parse - RExC_start),
8447 (UV)RExC_offsets[0]));
8448 Set_Cur_Node_Offset;
8456 - reguni - emit (if appropriate) a Unicode character
8459 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
8462 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
8466 - reginsert - insert an operator in front of already-emitted operand
8468 * Means relocating the operand.
8471 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
8474 register regnode *src;
8475 register regnode *dst;
8476 register regnode *place;
8477 const int offset = regarglen[(U8)op];
8478 const int size = NODE_STEP_REGNODE + offset;
8479 GET_RE_DEBUG_FLAGS_DECL;
8480 PERL_UNUSED_ARG(depth);
8481 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
8482 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
8491 if (RExC_open_parens) {
8493 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
8494 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
8495 if ( RExC_open_parens[paren] >= opnd ) {
8496 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
8497 RExC_open_parens[paren] += size;
8499 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
8501 if ( RExC_close_parens[paren] >= opnd ) {
8502 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
8503 RExC_close_parens[paren] += size;
8505 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
8510 while (src > opnd) {
8511 StructCopy(--src, --dst, regnode);
8512 #ifdef RE_TRACK_PATTERN_OFFSETS
8513 if (RExC_offsets) { /* MJD 20010112 */
8514 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
8518 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
8519 ? "Overwriting end of array!\n" : "OK",
8520 (UV)(src - RExC_emit_start),
8521 (UV)(dst - RExC_emit_start),
8522 (UV)RExC_offsets[0]));
8523 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
8524 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
8530 place = opnd; /* Op node, where operand used to be. */
8531 #ifdef RE_TRACK_PATTERN_OFFSETS
8532 if (RExC_offsets) { /* MJD */
8533 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
8537 (UV)(place - RExC_emit_start) > RExC_offsets[0]
8538 ? "Overwriting end of array!\n" : "OK",
8539 (UV)(place - RExC_emit_start),
8540 (UV)(RExC_parse - RExC_start),
8541 (UV)RExC_offsets[0]));
8542 Set_Node_Offset(place, RExC_parse);
8543 Set_Node_Length(place, 1);
8546 src = NEXTOPER(place);
8547 FILL_ADVANCE_NODE(place, op);
8548 Zero(src, offset, regnode);
8552 - regtail - set the next-pointer at the end of a node chain of p to val.
8553 - SEE ALSO: regtail_study
8555 /* TODO: All three parms should be const */
8557 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
8560 register regnode *scan;
8561 GET_RE_DEBUG_FLAGS_DECL;
8563 PERL_UNUSED_ARG(depth);
8569 /* Find last node. */
8572 regnode * const temp = regnext(scan);
8574 SV * const mysv=sv_newmortal();
8575 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
8576 regprop(RExC_rx, mysv, scan);
8577 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
8578 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
8579 (temp == NULL ? "->" : ""),
8580 (temp == NULL ? PL_reg_name[OP(val)] : "")
8588 if (reg_off_by_arg[OP(scan)]) {
8589 ARG_SET(scan, val - scan);
8592 NEXT_OFF(scan) = val - scan;
8598 - regtail_study - set the next-pointer at the end of a node chain of p to val.
8599 - Look for optimizable sequences at the same time.
8600 - currently only looks for EXACT chains.
8602 This is expermental code. The idea is to use this routine to perform
8603 in place optimizations on branches and groups as they are constructed,
8604 with the long term intention of removing optimization from study_chunk so
8605 that it is purely analytical.
8607 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
8608 to control which is which.
8611 /* TODO: All four parms should be const */
8614 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
8617 register regnode *scan;
8619 #ifdef EXPERIMENTAL_INPLACESCAN
8623 GET_RE_DEBUG_FLAGS_DECL;
8629 /* Find last node. */
8633 regnode * const temp = regnext(scan);
8634 #ifdef EXPERIMENTAL_INPLACESCAN
8635 if (PL_regkind[OP(scan)] == EXACT)
8636 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
8644 if( exact == PSEUDO )
8646 else if ( exact != OP(scan) )
8655 SV * const mysv=sv_newmortal();
8656 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
8657 regprop(RExC_rx, mysv, scan);
8658 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
8659 SvPV_nolen_const(mysv),
8661 PL_reg_name[exact]);
8668 SV * const mysv_val=sv_newmortal();
8669 DEBUG_PARSE_MSG("");
8670 regprop(RExC_rx, mysv_val, val);
8671 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
8672 SvPV_nolen_const(mysv_val),
8673 (IV)REG_NODE_NUM(val),
8677 if (reg_off_by_arg[OP(scan)]) {
8678 ARG_SET(scan, val - scan);
8681 NEXT_OFF(scan) = val - scan;
8689 - regcurly - a little FSA that accepts {\d+,?\d*}
8692 S_regcurly(register const char *s)
8711 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
8715 S_regdump_extflags(pTHX_ const char *lead, const U32 flags) {
8718 for (bit=0; bit<32; bit++) {
8719 if (flags & (1<<bit)) {
8721 PerlIO_printf(Perl_debug_log, "%s",lead);
8722 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
8727 PerlIO_printf(Perl_debug_log, "\n");
8729 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
8735 Perl_regdump(pTHX_ const regexp *r)
8739 SV * const sv = sv_newmortal();
8740 SV *dsv= sv_newmortal();
8742 GET_RE_DEBUG_FLAGS_DECL;
8744 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
8746 /* Header fields of interest. */
8747 if (r->anchored_substr) {
8748 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
8749 RE_SV_DUMPLEN(r->anchored_substr), 30);
8750 PerlIO_printf(Perl_debug_log,
8751 "anchored %s%s at %"IVdf" ",
8752 s, RE_SV_TAIL(r->anchored_substr),
8753 (IV)r->anchored_offset);
8754 } else if (r->anchored_utf8) {
8755 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
8756 RE_SV_DUMPLEN(r->anchored_utf8), 30);
8757 PerlIO_printf(Perl_debug_log,
8758 "anchored utf8 %s%s at %"IVdf" ",
8759 s, RE_SV_TAIL(r->anchored_utf8),
8760 (IV)r->anchored_offset);
8762 if (r->float_substr) {
8763 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
8764 RE_SV_DUMPLEN(r->float_substr), 30);
8765 PerlIO_printf(Perl_debug_log,
8766 "floating %s%s at %"IVdf"..%"UVuf" ",
8767 s, RE_SV_TAIL(r->float_substr),
8768 (IV)r->float_min_offset, (UV)r->float_max_offset);
8769 } else if (r->float_utf8) {
8770 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
8771 RE_SV_DUMPLEN(r->float_utf8), 30);
8772 PerlIO_printf(Perl_debug_log,
8773 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
8774 s, RE_SV_TAIL(r->float_utf8),
8775 (IV)r->float_min_offset, (UV)r->float_max_offset);
8777 if (r->check_substr || r->check_utf8)
8778 PerlIO_printf(Perl_debug_log,
8780 (r->check_substr == r->float_substr
8781 && r->check_utf8 == r->float_utf8
8782 ? "(checking floating" : "(checking anchored"));
8783 if (r->extflags & RXf_NOSCAN)
8784 PerlIO_printf(Perl_debug_log, " noscan");
8785 if (r->extflags & RXf_CHECK_ALL)
8786 PerlIO_printf(Perl_debug_log, " isall");
8787 if (r->check_substr || r->check_utf8)
8788 PerlIO_printf(Perl_debug_log, ") ");
8790 if (ri->regstclass) {
8791 regprop(r, sv, ri->regstclass);
8792 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
8794 if (r->extflags & RXf_ANCH) {
8795 PerlIO_printf(Perl_debug_log, "anchored");
8796 if (r->extflags & RXf_ANCH_BOL)
8797 PerlIO_printf(Perl_debug_log, "(BOL)");
8798 if (r->extflags & RXf_ANCH_MBOL)
8799 PerlIO_printf(Perl_debug_log, "(MBOL)");
8800 if (r->extflags & RXf_ANCH_SBOL)
8801 PerlIO_printf(Perl_debug_log, "(SBOL)");
8802 if (r->extflags & RXf_ANCH_GPOS)
8803 PerlIO_printf(Perl_debug_log, "(GPOS)");
8804 PerlIO_putc(Perl_debug_log, ' ');
8806 if (r->extflags & RXf_GPOS_SEEN)
8807 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
8808 if (r->intflags & PREGf_SKIP)
8809 PerlIO_printf(Perl_debug_log, "plus ");
8810 if (r->intflags & PREGf_IMPLICIT)
8811 PerlIO_printf(Perl_debug_log, "implicit ");
8812 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
8813 if (r->extflags & RXf_EVAL_SEEN)
8814 PerlIO_printf(Perl_debug_log, "with eval ");
8815 PerlIO_printf(Perl_debug_log, "\n");
8816 DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
8818 PERL_UNUSED_CONTEXT;
8820 #endif /* DEBUGGING */
8824 - regprop - printable representation of opcode
8827 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
8832 RXi_GET_DECL(prog,progi);
8833 GET_RE_DEBUG_FLAGS_DECL;
8836 sv_setpvn(sv, "", 0);
8838 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
8839 /* It would be nice to FAIL() here, but this may be called from
8840 regexec.c, and it would be hard to supply pRExC_state. */
8841 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
8842 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
8844 k = PL_regkind[OP(o)];
8848 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
8849 * is a crude hack but it may be the best for now since
8850 * we have no flag "this EXACTish node was UTF-8"
8852 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
8853 PERL_PV_ESCAPE_UNI_DETECT |
8854 PERL_PV_PRETTY_ELLIPSES |
8855 PERL_PV_PRETTY_LTGT |
8856 PERL_PV_PRETTY_NOCLEAR
8858 } else if (k == TRIE) {
8859 /* print the details of the trie in dumpuntil instead, as
8860 * progi->data isn't available here */
8861 const char op = OP(o);
8862 const U32 n = ARG(o);
8863 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
8864 (reg_ac_data *)progi->data->data[n] :
8866 const reg_trie_data * const trie
8867 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
8869 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
8870 DEBUG_TRIE_COMPILE_r(
8871 Perl_sv_catpvf(aTHX_ sv,
8872 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
8873 (UV)trie->startstate,
8874 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
8875 (UV)trie->wordcount,
8878 (UV)TRIE_CHARCOUNT(trie),
8879 (UV)trie->uniquecharcount
8882 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
8884 int rangestart = -1;
8885 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
8887 for (i = 0; i <= 256; i++) {
8888 if (i < 256 && BITMAP_TEST(bitmap,i)) {
8889 if (rangestart == -1)
8891 } else if (rangestart != -1) {
8892 if (i <= rangestart + 3)
8893 for (; rangestart < i; rangestart++)
8894 put_byte(sv, rangestart);
8896 put_byte(sv, rangestart);
8898 put_byte(sv, i - 1);
8906 } else if (k == CURLY) {
8907 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
8908 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
8909 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
8911 else if (k == WHILEM && o->flags) /* Ordinal/of */
8912 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
8913 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
8914 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
8915 if ( prog->paren_names ) {
8916 if ( k != REF || OP(o) < NREF) {
8917 AV *list= (AV *)progi->data->data[progi->name_list_idx];
8918 SV **name= av_fetch(list, ARG(o), 0 );
8920 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
8923 AV *list= (AV *)progi->data->data[ progi->name_list_idx ];
8924 SV *sv_dat=(SV*)progi->data->data[ ARG( o ) ];
8925 I32 *nums=(I32*)SvPVX(sv_dat);
8926 SV **name= av_fetch(list, nums[0], 0 );
8929 for ( n=0; n<SvIVX(sv_dat); n++ ) {
8930 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
8931 (n ? "," : ""), (IV)nums[n]);
8933 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
8937 } else if (k == GOSUB)
8938 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
8939 else if (k == VERB) {
8941 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
8942 SVfARG((SV*)progi->data->data[ ARG( o ) ]));
8943 } else if (k == LOGICAL)
8944 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
8945 else if (k == FOLDCHAR)
8946 Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
8947 else if (k == ANYOF) {
8948 int i, rangestart = -1;
8949 const U8 flags = ANYOF_FLAGS(o);
8951 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
8952 static const char * const anyofs[] = {
8985 if (flags & ANYOF_LOCALE)
8986 sv_catpvs(sv, "{loc}");
8987 if (flags & ANYOF_FOLD)
8988 sv_catpvs(sv, "{i}");
8989 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
8990 if (flags & ANYOF_INVERT)
8992 for (i = 0; i <= 256; i++) {
8993 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
8994 if (rangestart == -1)
8996 } else if (rangestart != -1) {
8997 if (i <= rangestart + 3)
8998 for (; rangestart < i; rangestart++)
8999 put_byte(sv, rangestart);
9001 put_byte(sv, rangestart);
9003 put_byte(sv, i - 1);
9009 if (o->flags & ANYOF_CLASS)
9010 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
9011 if (ANYOF_CLASS_TEST(o,i))
9012 sv_catpv(sv, anyofs[i]);
9014 if (flags & ANYOF_UNICODE)
9015 sv_catpvs(sv, "{unicode}");
9016 else if (flags & ANYOF_UNICODE_ALL)
9017 sv_catpvs(sv, "{unicode_all}");
9021 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
9025 U8 s[UTF8_MAXBYTES_CASE+1];
9027 for (i = 0; i <= 256; i++) { /* just the first 256 */
9028 uvchr_to_utf8(s, i);
9030 if (i < 256 && swash_fetch(sw, s, TRUE)) {
9031 if (rangestart == -1)
9033 } else if (rangestart != -1) {
9034 if (i <= rangestart + 3)
9035 for (; rangestart < i; rangestart++) {
9036 const U8 * const e = uvchr_to_utf8(s,rangestart);
9038 for(p = s; p < e; p++)
9042 const U8 *e = uvchr_to_utf8(s,rangestart);
9044 for (p = s; p < e; p++)
9047 e = uvchr_to_utf8(s, i-1);
9048 for (p = s; p < e; p++)
9055 sv_catpvs(sv, "..."); /* et cetera */
9059 char *s = savesvpv(lv);
9060 char * const origs = s;
9062 while (*s && *s != '\n')
9066 const char * const t = ++s;
9084 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
9086 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
9087 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
9089 PERL_UNUSED_CONTEXT;
9090 PERL_UNUSED_ARG(sv);
9092 PERL_UNUSED_ARG(prog);
9093 #endif /* DEBUGGING */
9097 Perl_re_intuit_string(pTHX_ REGEXP * const prog)
9098 { /* Assume that RE_INTUIT is set */
9100 GET_RE_DEBUG_FLAGS_DECL;
9101 PERL_UNUSED_CONTEXT;
9105 const char * const s = SvPV_nolen_const(prog->check_substr
9106 ? prog->check_substr : prog->check_utf8);
9108 if (!PL_colorset) reginitcolors();
9109 PerlIO_printf(Perl_debug_log,
9110 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
9112 prog->check_substr ? "" : "utf8 ",
9113 PL_colors[5],PL_colors[0],
9116 (strlen(s) > 60 ? "..." : ""));
9119 return prog->check_substr ? prog->check_substr : prog->check_utf8;
9125 handles refcounting and freeing the perl core regexp structure. When
9126 it is necessary to actually free the structure the first thing it
9127 does is call the 'free' method of the regexp_engine associated to to
9128 the regexp, allowing the handling of the void *pprivate; member
9129 first. (This routine is not overridable by extensions, which is why
9130 the extensions free is called first.)
9132 See regdupe and regdupe_internal if you change anything here.
9134 #ifndef PERL_IN_XSUB_RE
9136 Perl_pregfree(pTHX_ struct regexp *r)
9139 GET_RE_DEBUG_FLAGS_DECL;
9141 if (!r || (--r->refcnt > 0))
9144 ReREFCNT_dec(r->mother_re);
9146 CALLREGFREE_PVT(r); /* free the private data */
9148 SvREFCNT_dec(r->paren_names);
9149 Safefree(r->wrapped);
9152 if (r->anchored_substr)
9153 SvREFCNT_dec(r->anchored_substr);
9154 if (r->anchored_utf8)
9155 SvREFCNT_dec(r->anchored_utf8);
9156 if (r->float_substr)
9157 SvREFCNT_dec(r->float_substr);
9159 SvREFCNT_dec(r->float_utf8);
9160 Safefree(r->substrs);
9162 RX_MATCH_COPY_FREE(r);
9163 #ifdef PERL_OLD_COPY_ON_WRITE
9165 SvREFCNT_dec(r->saved_copy);
9174 This is a hacky workaround to the structural issue of match results
9175 being stored in the regexp structure which is in turn stored in
9176 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
9177 could be PL_curpm in multiple contexts, and could require multiple
9178 result sets being associated with the pattern simultaneously, such
9179 as when doing a recursive match with (??{$qr})
9181 The solution is to make a lightweight copy of the regexp structure
9182 when a qr// is returned from the code executed by (??{$qr}) this
9183 lightweight copy doesnt actually own any of its data except for
9184 the starp/end and the actual regexp structure itself.
9190 Perl_reg_temp_copy (pTHX_ struct regexp *r) {
9192 register const I32 npar = r->nparens+1;
9193 (void)ReREFCNT_inc(r);
9194 Newx(ret, 1, regexp);
9195 StructCopy(r, ret, regexp);
9196 Newx(ret->offs, npar, regexp_paren_pair);
9197 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
9200 Newx(ret->substrs, 1, struct reg_substr_data);
9201 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9203 SvREFCNT_inc_void(ret->anchored_substr);
9204 SvREFCNT_inc_void(ret->anchored_utf8);
9205 SvREFCNT_inc_void(ret->float_substr);
9206 SvREFCNT_inc_void(ret->float_utf8);
9208 /* check_substr and check_utf8, if non-NULL, point to either their
9209 anchored or float namesakes, and don't hold a second reference. */
9211 RX_MATCH_COPIED_off(ret);
9212 #ifdef PERL_OLD_COPY_ON_WRITE
9213 ret->saved_copy = NULL;
9222 /* regfree_internal()
9224 Free the private data in a regexp. This is overloadable by
9225 extensions. Perl takes care of the regexp structure in pregfree(),
9226 this covers the *pprivate pointer which technically perldoesnt
9227 know about, however of course we have to handle the
9228 regexp_internal structure when no extension is in use.
9230 Note this is called before freeing anything in the regexp
9235 Perl_regfree_internal(pTHX_ REGEXP * const r)
9239 GET_RE_DEBUG_FLAGS_DECL;
9245 SV *dsv= sv_newmortal();
9246 RE_PV_QUOTED_DECL(s, (r->extflags & RXf_UTF8),
9247 dsv, r->precomp, r->prelen, 60);
9248 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
9249 PL_colors[4],PL_colors[5],s);
9252 #ifdef RE_TRACK_PATTERN_OFFSETS
9254 Safefree(ri->u.offsets); /* 20010421 MJD */
9257 int n = ri->data->count;
9258 PAD* new_comppad = NULL;
9263 /* If you add a ->what type here, update the comment in regcomp.h */
9264 switch (ri->data->what[n]) {
9268 SvREFCNT_dec((SV*)ri->data->data[n]);
9271 Safefree(ri->data->data[n]);
9274 new_comppad = (AV*)ri->data->data[n];
9277 if (new_comppad == NULL)
9278 Perl_croak(aTHX_ "panic: pregfree comppad");
9279 PAD_SAVE_LOCAL(old_comppad,
9280 /* Watch out for global destruction's random ordering. */
9281 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
9284 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
9287 op_free((OP_4tree*)ri->data->data[n]);
9289 PAD_RESTORE_LOCAL(old_comppad);
9290 SvREFCNT_dec((SV*)new_comppad);
9296 { /* Aho Corasick add-on structure for a trie node.
9297 Used in stclass optimization only */
9299 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
9301 refcount = --aho->refcount;
9304 PerlMemShared_free(aho->states);
9305 PerlMemShared_free(aho->fail);
9306 /* do this last!!!! */
9307 PerlMemShared_free(ri->data->data[n]);
9308 PerlMemShared_free(ri->regstclass);
9314 /* trie structure. */
9316 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
9318 refcount = --trie->refcount;
9321 PerlMemShared_free(trie->charmap);
9322 PerlMemShared_free(trie->states);
9323 PerlMemShared_free(trie->trans);
9325 PerlMemShared_free(trie->bitmap);
9327 PerlMemShared_free(trie->wordlen);
9329 PerlMemShared_free(trie->jump);
9331 PerlMemShared_free(trie->nextword);
9332 /* do this last!!!! */
9333 PerlMemShared_free(ri->data->data[n]);
9338 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
9341 Safefree(ri->data->what);
9348 #define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t))
9349 #define av_dup_inc(s,t) (AV*)SvREFCNT_inc(sv_dup((SV*)s,t))
9350 #define hv_dup_inc(s,t) (HV*)SvREFCNT_inc(sv_dup((SV*)s,t))
9351 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
9354 re_dup - duplicate a regexp.
9356 This routine is expected to clone a given regexp structure. It is not
9357 compiler under USE_ITHREADS.
9359 After all of the core data stored in struct regexp is duplicated
9360 the regexp_engine.dupe method is used to copy any private data
9361 stored in the *pprivate pointer. This allows extensions to handle
9362 any duplication it needs to do.
9364 See pregfree() and regfree_internal() if you change anything here.
9366 #if defined(USE_ITHREADS)
9367 #ifndef PERL_IN_XSUB_RE
9369 Perl_re_dup(pTHX_ const regexp *r, CLONE_PARAMS *param)
9376 return (REGEXP *)NULL;
9378 if ((ret = (REGEXP *)ptr_table_fetch(PL_ptr_table, r)))
9382 npar = r->nparens+1;
9383 Newx(ret, 1, regexp);
9384 StructCopy(r, ret, regexp);
9385 Newx(ret->offs, npar, regexp_paren_pair);
9386 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
9388 /* no need to copy these */
9389 Newx(ret->swap, npar, regexp_paren_pair);
9393 /* Do it this way to avoid reading from *r after the StructCopy().
9394 That way, if any of the sv_dup_inc()s dislodge *r from the L1
9395 cache, it doesn't matter. */
9396 const bool anchored = r->check_substr == r->anchored_substr;
9397 Newx(ret->substrs, 1, struct reg_substr_data);
9398 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9400 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
9401 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
9402 ret->float_substr = sv_dup_inc(ret->float_substr, param);
9403 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
9405 /* check_substr and check_utf8, if non-NULL, point to either their
9406 anchored or float namesakes, and don't hold a second reference. */
9408 if (ret->check_substr) {
9410 assert(r->check_utf8 == r->anchored_utf8);
9411 ret->check_substr = ret->anchored_substr;
9412 ret->check_utf8 = ret->anchored_utf8;
9414 assert(r->check_substr == r->float_substr);
9415 assert(r->check_utf8 == r->float_utf8);
9416 ret->check_substr = ret->float_substr;
9417 ret->check_utf8 = ret->float_utf8;
9422 ret->wrapped = SAVEPVN(ret->wrapped, ret->wraplen+1);
9423 ret->precomp = ret->wrapped + (ret->precomp - ret->wrapped);
9424 ret->paren_names = hv_dup_inc(ret->paren_names, param);
9427 RXi_SET(ret,CALLREGDUPE_PVT(ret,param));
9429 if (RX_MATCH_COPIED(ret))
9430 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
9433 #ifdef PERL_OLD_COPY_ON_WRITE
9434 ret->saved_copy = NULL;
9437 ret->mother_re = NULL;
9439 ret->seen_evals = 0;
9441 ptr_table_store(PL_ptr_table, r, ret);
9444 #endif /* PERL_IN_XSUB_RE */
9449 This is the internal complement to regdupe() which is used to copy
9450 the structure pointed to by the *pprivate pointer in the regexp.
9451 This is the core version of the extension overridable cloning hook.
9452 The regexp structure being duplicated will be copied by perl prior
9453 to this and will be provided as the regexp *r argument, however
9454 with the /old/ structures pprivate pointer value. Thus this routine
9455 may override any copying normally done by perl.
9457 It returns a pointer to the new regexp_internal structure.
9461 Perl_regdupe_internal(pTHX_ REGEXP * const r, CLONE_PARAMS *param)
9464 regexp_internal *reti;
9468 npar = r->nparens+1;
9471 Newxc(reti, sizeof(regexp_internal) + (len+1)*sizeof(regnode), char, regexp_internal);
9472 Copy(ri->program, reti->program, len+1, regnode);
9475 reti->regstclass = NULL;
9479 const int count = ri->data->count;
9482 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
9483 char, struct reg_data);
9484 Newx(d->what, count, U8);
9487 for (i = 0; i < count; i++) {
9488 d->what[i] = ri->data->what[i];
9489 switch (d->what[i]) {
9490 /* legal options are one of: sSfpontTu
9491 see also regcomp.h and pregfree() */
9494 case 'p': /* actually an AV, but the dup function is identical. */
9495 case 'u': /* actually an HV, but the dup function is identical. */
9496 d->data[i] = sv_dup_inc((SV *)ri->data->data[i], param);
9499 /* This is cheating. */
9500 Newx(d->data[i], 1, struct regnode_charclass_class);
9501 StructCopy(ri->data->data[i], d->data[i],
9502 struct regnode_charclass_class);
9503 reti->regstclass = (regnode*)d->data[i];
9506 /* Compiled op trees are readonly and in shared memory,
9507 and can thus be shared without duplication. */
9509 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
9513 /* Trie stclasses are readonly and can thus be shared
9514 * without duplication. We free the stclass in pregfree
9515 * when the corresponding reg_ac_data struct is freed.
9517 reti->regstclass= ri->regstclass;
9521 ((reg_trie_data*)ri->data->data[i])->refcount++;
9525 d->data[i] = ri->data->data[i];
9528 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
9537 reti->name_list_idx = ri->name_list_idx;
9539 #ifdef RE_TRACK_PATTERN_OFFSETS
9540 if (ri->u.offsets) {
9541 Newx(reti->u.offsets, 2*len+1, U32);
9542 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
9545 SetProgLen(reti,len);
9551 #endif /* USE_ITHREADS */
9556 converts a regexp embedded in a MAGIC struct to its stringified form,
9557 caching the converted form in the struct and returns the cached
9560 If lp is nonnull then it is used to return the length of the
9563 If flags is nonnull and the returned string contains UTF8 then
9564 (*flags & 1) will be true.
9566 If haseval is nonnull then it is used to return whether the pattern
9569 Normally called via macro:
9571 CALLREG_STRINGIFY(mg,&len,&utf8);
9575 CALLREG_AS_STR(mg,&lp,&flags,&haseval)
9577 See sv_2pv_flags() in sv.c for an example of internal usage.
9580 #ifndef PERL_IN_XSUB_RE
9583 Perl_reg_stringify(pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval ) {
9585 const regexp * const re = (regexp *)mg->mg_obj;
9587 *haseval = re->seen_evals;
9589 *flags = ((re->extflags & RXf_UTF8) ? 1 : 0);
9596 - regnext - dig the "next" pointer out of a node
9599 Perl_regnext(pTHX_ register regnode *p)
9602 register I32 offset;
9607 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
9616 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
9619 STRLEN l1 = strlen(pat1);
9620 STRLEN l2 = strlen(pat2);
9623 const char *message;
9629 Copy(pat1, buf, l1 , char);
9630 Copy(pat2, buf + l1, l2 , char);
9631 buf[l1 + l2] = '\n';
9632 buf[l1 + l2 + 1] = '\0';
9634 /* ANSI variant takes additional second argument */
9635 va_start(args, pat2);
9639 msv = vmess(buf, &args);
9641 message = SvPV_const(msv,l1);
9644 Copy(message, buf, l1 , char);
9645 buf[l1-1] = '\0'; /* Overwrite \n */
9646 Perl_croak(aTHX_ "%s", buf);
9649 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
9651 #ifndef PERL_IN_XSUB_RE
9653 Perl_save_re_context(pTHX)
9657 struct re_save_state *state;
9659 SAVEVPTR(PL_curcop);
9660 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
9662 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
9663 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
9664 SSPUSHINT(SAVEt_RE_STATE);
9666 Copy(&PL_reg_state, state, 1, struct re_save_state);
9668 PL_reg_start_tmp = 0;
9669 PL_reg_start_tmpl = 0;
9670 PL_reg_oldsaved = NULL;
9671 PL_reg_oldsavedlen = 0;
9673 PL_reg_leftiter = 0;
9674 PL_reg_poscache = NULL;
9675 PL_reg_poscache_size = 0;
9676 #ifdef PERL_OLD_COPY_ON_WRITE
9680 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
9682 const REGEXP * const rx = PM_GETRE(PL_curpm);
9685 for (i = 1; i <= rx->nparens; i++) {
9686 char digits[TYPE_CHARS(long)];
9687 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
9688 GV *const *const gvp
9689 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
9692 GV * const gv = *gvp;
9693 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
9703 clear_re(pTHX_ void *r)
9706 ReREFCNT_dec((regexp *)r);
9712 S_put_byte(pTHX_ SV *sv, int c)
9714 /* Our definition of isPRINT() ignores locales, so only bytes that are
9715 not part of UTF-8 are considered printable. I assume that the same
9716 holds for UTF-EBCDIC.
9717 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
9718 which Wikipedia says:
9720 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
9721 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
9722 identical, to the ASCII delete (DEL) or rubout control character.
9723 ) So the old condition can be simplified to !isPRINT(c) */
9725 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
9727 const char string = c;
9728 if (c == '-' || c == ']' || c == '\\' || c == '^')
9729 sv_catpvs(sv, "\\");
9730 sv_catpvn(sv, &string, 1);
9735 #define CLEAR_OPTSTART \
9736 if (optstart) STMT_START { \
9737 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
9741 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
9743 STATIC const regnode *
9744 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
9745 const regnode *last, const regnode *plast,
9746 SV* sv, I32 indent, U32 depth)
9749 register U8 op = PSEUDO; /* Arbitrary non-END op. */
9750 register const regnode *next;
9751 const regnode *optstart= NULL;
9754 GET_RE_DEBUG_FLAGS_DECL;
9756 #ifdef DEBUG_DUMPUNTIL
9757 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
9758 last ? last-start : 0,plast ? plast-start : 0);
9761 if (plast && plast < last)
9764 while (PL_regkind[op] != END && (!last || node < last)) {
9765 /* While that wasn't END last time... */
9768 if (op == CLOSE || op == WHILEM)
9770 next = regnext((regnode *)node);
9773 if (OP(node) == OPTIMIZED) {
9774 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
9781 regprop(r, sv, node);
9782 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
9783 (int)(2*indent + 1), "", SvPVX_const(sv));
9785 if (OP(node) != OPTIMIZED) {
9786 if (next == NULL) /* Next ptr. */
9787 PerlIO_printf(Perl_debug_log, " (0)");
9788 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
9789 PerlIO_printf(Perl_debug_log, " (FAIL)");
9791 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
9792 (void)PerlIO_putc(Perl_debug_log, '\n');
9796 if (PL_regkind[(U8)op] == BRANCHJ) {
9799 register const regnode *nnode = (OP(next) == LONGJMP
9800 ? regnext((regnode *)next)
9802 if (last && nnode > last)
9804 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
9807 else if (PL_regkind[(U8)op] == BRANCH) {
9809 DUMPUNTIL(NEXTOPER(node), next);
9811 else if ( PL_regkind[(U8)op] == TRIE ) {
9812 const regnode *this_trie = node;
9813 const char op = OP(node);
9814 const U32 n = ARG(node);
9815 const reg_ac_data * const ac = op>=AHOCORASICK ?
9816 (reg_ac_data *)ri->data->data[n] :
9818 const reg_trie_data * const trie =
9819 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
9821 AV *const trie_words = (AV *) ri->data->data[n + TRIE_WORDS_OFFSET];
9823 const regnode *nextbranch= NULL;
9825 sv_setpvn(sv, "", 0);
9826 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
9827 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
9829 PerlIO_printf(Perl_debug_log, "%*s%s ",
9830 (int)(2*(indent+3)), "",
9831 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
9832 PL_colors[0], PL_colors[1],
9833 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
9834 PERL_PV_PRETTY_ELLIPSES |
9840 U16 dist= trie->jump[word_idx+1];
9841 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
9842 (UV)((dist ? this_trie + dist : next) - start));
9845 nextbranch= this_trie + trie->jump[0];
9846 DUMPUNTIL(this_trie + dist, nextbranch);
9848 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
9849 nextbranch= regnext((regnode *)nextbranch);
9851 PerlIO_printf(Perl_debug_log, "\n");
9854 if (last && next > last)
9859 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
9860 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
9861 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
9863 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
9865 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
9867 else if ( op == PLUS || op == STAR) {
9868 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
9870 else if (op == ANYOF) {
9871 /* arglen 1 + class block */
9872 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
9873 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
9874 node = NEXTOPER(node);
9876 else if (PL_regkind[(U8)op] == EXACT) {
9877 /* Literal string, where present. */
9878 node += NODE_SZ_STR(node) - 1;
9879 node = NEXTOPER(node);
9882 node = NEXTOPER(node);
9883 node += regarglen[(U8)op];
9885 if (op == CURLYX || op == OPEN)
9889 #ifdef DEBUG_DUMPUNTIL
9890 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
9895 #endif /* DEBUGGING */
9899 * c-indentation-style: bsd
9901 * indent-tabs-mode: t
9904 * ex: set ts=8 sts=4 sw=4 noet: