5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGCOMP_C
77 #ifndef PERL_IN_XSUB_RE
82 #ifdef PERL_IN_XSUB_RE
93 # if defined(BUGGY_MSC6)
94 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
95 # pragma optimize("a",off)
96 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
97 # pragma optimize("w",on )
98 # endif /* BUGGY_MSC6 */
102 #define STATIC static
105 typedef struct RExC_state_t {
106 U32 flags; /* are we folding, multilining? */
107 char *precomp; /* uncompiled string. */
108 REGEXP *rx_sv; /* The SV that is the regexp. */
109 regexp *rx; /* perl core regexp structure */
110 regexp_internal *rxi; /* internal data for regexp object pprivate field */
111 char *start; /* Start of input for compile */
112 char *end; /* End of input for compile */
113 char *parse; /* Input-scan pointer. */
114 I32 whilem_seen; /* number of WHILEM in this expr */
115 regnode *emit_start; /* Start of emitted-code area */
116 regnode *emit_bound; /* First regnode outside of the allocated space */
117 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
118 I32 naughty; /* How bad is this pattern? */
119 I32 sawback; /* Did we see \1, ...? */
121 I32 size; /* Code size. */
122 I32 npar; /* Capture buffer count, (OPEN). */
123 I32 cpar; /* Capture buffer count, (CLOSE). */
124 I32 nestroot; /* root parens we are in - used by accept */
128 regnode **open_parens; /* pointers to open parens */
129 regnode **close_parens; /* pointers to close parens */
130 regnode *opend; /* END node in program */
131 I32 utf8; /* whether the pattern is utf8 or not */
132 I32 orig_utf8; /* whether the pattern was originally in utf8 */
133 /* XXX use this for future optimisation of case
134 * where pattern must be upgraded to utf8. */
135 HV *charnames; /* cache of named sequences */
136 HV *paren_names; /* Paren names */
138 regnode **recurse; /* Recurse regops */
139 I32 recurse_count; /* Number of recurse regops */
141 char *starttry; /* -Dr: where regtry was called. */
142 #define RExC_starttry (pRExC_state->starttry)
145 const char *lastparse;
147 AV *paren_name_list; /* idx -> name */
148 #define RExC_lastparse (pRExC_state->lastparse)
149 #define RExC_lastnum (pRExC_state->lastnum)
150 #define RExC_paren_name_list (pRExC_state->paren_name_list)
154 #define RExC_flags (pRExC_state->flags)
155 #define RExC_precomp (pRExC_state->precomp)
156 #define RExC_rx_sv (pRExC_state->rx_sv)
157 #define RExC_rx (pRExC_state->rx)
158 #define RExC_rxi (pRExC_state->rxi)
159 #define RExC_start (pRExC_state->start)
160 #define RExC_end (pRExC_state->end)
161 #define RExC_parse (pRExC_state->parse)
162 #define RExC_whilem_seen (pRExC_state->whilem_seen)
163 #ifdef RE_TRACK_PATTERN_OFFSETS
164 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
166 #define RExC_emit (pRExC_state->emit)
167 #define RExC_emit_start (pRExC_state->emit_start)
168 #define RExC_emit_bound (pRExC_state->emit_bound)
169 #define RExC_naughty (pRExC_state->naughty)
170 #define RExC_sawback (pRExC_state->sawback)
171 #define RExC_seen (pRExC_state->seen)
172 #define RExC_size (pRExC_state->size)
173 #define RExC_npar (pRExC_state->npar)
174 #define RExC_nestroot (pRExC_state->nestroot)
175 #define RExC_extralen (pRExC_state->extralen)
176 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
177 #define RExC_seen_evals (pRExC_state->seen_evals)
178 #define RExC_utf8 (pRExC_state->utf8)
179 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
180 #define RExC_charnames (pRExC_state->charnames)
181 #define RExC_open_parens (pRExC_state->open_parens)
182 #define RExC_close_parens (pRExC_state->close_parens)
183 #define RExC_opend (pRExC_state->opend)
184 #define RExC_paren_names (pRExC_state->paren_names)
185 #define RExC_recurse (pRExC_state->recurse)
186 #define RExC_recurse_count (pRExC_state->recurse_count)
189 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
190 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
191 ((*s) == '{' && regcurly(s)))
194 #undef SPSTART /* dratted cpp namespace... */
197 * Flags to be passed up and down.
199 #define WORST 0 /* Worst case. */
200 #define HASWIDTH 0x01 /* Known to match non-null strings. */
201 #define SIMPLE 0x02 /* Simple enough to be STAR/PLUS operand. */
202 #define SPSTART 0x04 /* Starts with * or +. */
203 #define TRYAGAIN 0x08 /* Weeded out a declaration. */
204 #define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
206 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
208 /* whether trie related optimizations are enabled */
209 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
210 #define TRIE_STUDY_OPT
211 #define FULL_TRIE_STUDY
217 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
218 #define PBITVAL(paren) (1 << ((paren) & 7))
219 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
220 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
221 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
224 /* About scan_data_t.
226 During optimisation we recurse through the regexp program performing
227 various inplace (keyhole style) optimisations. In addition study_chunk
228 and scan_commit populate this data structure with information about
229 what strings MUST appear in the pattern. We look for the longest
230 string that must appear for at a fixed location, and we look for the
231 longest string that may appear at a floating location. So for instance
236 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
237 strings (because they follow a .* construct). study_chunk will identify
238 both FOO and BAR as being the longest fixed and floating strings respectively.
240 The strings can be composites, for instance
244 will result in a composite fixed substring 'foo'.
246 For each string some basic information is maintained:
248 - offset or min_offset
249 This is the position the string must appear at, or not before.
250 It also implicitly (when combined with minlenp) tells us how many
251 character must match before the string we are searching.
252 Likewise when combined with minlenp and the length of the string
253 tells us how many characters must appear after the string we have
257 Only used for floating strings. This is the rightmost point that
258 the string can appear at. Ifset to I32 max it indicates that the
259 string can occur infinitely far to the right.
262 A pointer to the minimum length of the pattern that the string
263 was found inside. This is important as in the case of positive
264 lookahead or positive lookbehind we can have multiple patterns
269 The minimum length of the pattern overall is 3, the minimum length
270 of the lookahead part is 3, but the minimum length of the part that
271 will actually match is 1. So 'FOO's minimum length is 3, but the
272 minimum length for the F is 1. This is important as the minimum length
273 is used to determine offsets in front of and behind the string being
274 looked for. Since strings can be composites this is the length of the
275 pattern at the time it was commited with a scan_commit. Note that
276 the length is calculated by study_chunk, so that the minimum lengths
277 are not known until the full pattern has been compiled, thus the
278 pointer to the value.
282 In the case of lookbehind the string being searched for can be
283 offset past the start point of the final matching string.
284 If this value was just blithely removed from the min_offset it would
285 invalidate some of the calculations for how many chars must match
286 before or after (as they are derived from min_offset and minlen and
287 the length of the string being searched for).
288 When the final pattern is compiled and the data is moved from the
289 scan_data_t structure into the regexp structure the information
290 about lookbehind is factored in, with the information that would
291 have been lost precalculated in the end_shift field for the
294 The fields pos_min and pos_delta are used to store the minimum offset
295 and the delta to the maximum offset at the current point in the pattern.
299 typedef struct scan_data_t {
300 /*I32 len_min; unused */
301 /*I32 len_delta; unused */
305 I32 last_end; /* min value, <0 unless valid. */
308 SV **longest; /* Either &l_fixed, or &l_float. */
309 SV *longest_fixed; /* longest fixed string found in pattern */
310 I32 offset_fixed; /* offset where it starts */
311 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
312 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
313 SV *longest_float; /* longest floating string found in pattern */
314 I32 offset_float_min; /* earliest point in string it can appear */
315 I32 offset_float_max; /* latest point in string it can appear */
316 I32 *minlen_float; /* pointer to the minlen relevent to the string */
317 I32 lookbehind_float; /* is the position of the string modified by LB */
321 struct regnode_charclass_class *start_class;
325 * Forward declarations for pregcomp()'s friends.
328 static const scan_data_t zero_scan_data =
329 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
331 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
332 #define SF_BEFORE_SEOL 0x0001
333 #define SF_BEFORE_MEOL 0x0002
334 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
335 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
338 # define SF_FIX_SHIFT_EOL (0+2)
339 # define SF_FL_SHIFT_EOL (0+4)
341 # define SF_FIX_SHIFT_EOL (+2)
342 # define SF_FL_SHIFT_EOL (+4)
345 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
346 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
348 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
349 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
350 #define SF_IS_INF 0x0040
351 #define SF_HAS_PAR 0x0080
352 #define SF_IN_PAR 0x0100
353 #define SF_HAS_EVAL 0x0200
354 #define SCF_DO_SUBSTR 0x0400
355 #define SCF_DO_STCLASS_AND 0x0800
356 #define SCF_DO_STCLASS_OR 0x1000
357 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
358 #define SCF_WHILEM_VISITED_POS 0x2000
360 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
361 #define SCF_SEEN_ACCEPT 0x8000
363 #define UTF (RExC_utf8 != 0)
364 #define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0)
365 #define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0)
367 #define OOB_UNICODE 12345678
368 #define OOB_NAMEDCLASS -1
370 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
371 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
374 /* length of regex to show in messages that don't mark a position within */
375 #define RegexLengthToShowInErrorMessages 127
378 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
379 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
380 * op/pragma/warn/regcomp.
382 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
383 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
385 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
388 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
389 * arg. Show regex, up to a maximum length. If it's too long, chop and add
392 #define _FAIL(code) STMT_START { \
393 const char *ellipses = ""; \
394 IV len = RExC_end - RExC_precomp; \
397 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
398 if (len > RegexLengthToShowInErrorMessages) { \
399 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
400 len = RegexLengthToShowInErrorMessages - 10; \
406 #define FAIL(msg) _FAIL( \
407 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
408 msg, (int)len, RExC_precomp, ellipses))
410 #define FAIL2(msg,arg) _FAIL( \
411 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
412 arg, (int)len, RExC_precomp, ellipses))
415 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
417 #define Simple_vFAIL(m) STMT_START { \
418 const IV offset = RExC_parse - RExC_precomp; \
419 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
420 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
424 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
426 #define vFAIL(m) STMT_START { \
428 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
433 * Like Simple_vFAIL(), but accepts two arguments.
435 #define Simple_vFAIL2(m,a1) STMT_START { \
436 const IV offset = RExC_parse - RExC_precomp; \
437 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
438 (int)offset, RExC_precomp, RExC_precomp + offset); \
442 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
444 #define vFAIL2(m,a1) STMT_START { \
446 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
447 Simple_vFAIL2(m, a1); \
452 * Like Simple_vFAIL(), but accepts three arguments.
454 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
455 const IV offset = RExC_parse - RExC_precomp; \
456 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
457 (int)offset, RExC_precomp, RExC_precomp + offset); \
461 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
463 #define vFAIL3(m,a1,a2) STMT_START { \
465 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
466 Simple_vFAIL3(m, a1, a2); \
470 * Like Simple_vFAIL(), but accepts four arguments.
472 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
473 const IV offset = RExC_parse - RExC_precomp; \
474 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
475 (int)offset, RExC_precomp, RExC_precomp + offset); \
478 #define ckWARNreg(loc,m) STMT_START { \
479 const IV offset = loc - RExC_precomp; \
480 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
481 (int)offset, RExC_precomp, RExC_precomp + offset); \
484 #define ckWARNregdep(loc,m) STMT_START { \
485 const IV offset = loc - RExC_precomp; \
486 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
488 (int)offset, RExC_precomp, RExC_precomp + offset); \
491 #define ckWARN2reg(loc, m, a1) STMT_START { \
492 const IV offset = loc - RExC_precomp; \
493 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
494 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
497 #define vWARN3(loc, m, a1, a2) STMT_START { \
498 const IV offset = loc - RExC_precomp; \
499 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
500 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
503 #define ckWARN3reg(loc, m, a1, a2) STMT_START { \
504 const IV offset = loc - RExC_precomp; \
505 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
506 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
509 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
510 const IV offset = loc - RExC_precomp; \
511 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
512 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
515 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
516 const IV offset = loc - RExC_precomp; \
517 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
518 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
521 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
522 const IV offset = loc - RExC_precomp; \
523 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
524 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
528 /* Allow for side effects in s */
529 #define REGC(c,s) STMT_START { \
530 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
533 /* Macros for recording node offsets. 20001227 mjd@plover.com
534 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
535 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
536 * Element 0 holds the number n.
537 * Position is 1 indexed.
539 #ifndef RE_TRACK_PATTERN_OFFSETS
540 #define Set_Node_Offset_To_R(node,byte)
541 #define Set_Node_Offset(node,byte)
542 #define Set_Cur_Node_Offset
543 #define Set_Node_Length_To_R(node,len)
544 #define Set_Node_Length(node,len)
545 #define Set_Node_Cur_Length(node)
546 #define Node_Offset(n)
547 #define Node_Length(n)
548 #define Set_Node_Offset_Length(node,offset,len)
549 #define ProgLen(ri) ri->u.proglen
550 #define SetProgLen(ri,x) ri->u.proglen = x
552 #define ProgLen(ri) ri->u.offsets[0]
553 #define SetProgLen(ri,x) ri->u.offsets[0] = x
554 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
556 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
557 __LINE__, (int)(node), (int)(byte))); \
559 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
561 RExC_offsets[2*(node)-1] = (byte); \
566 #define Set_Node_Offset(node,byte) \
567 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
568 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
570 #define Set_Node_Length_To_R(node,len) STMT_START { \
572 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
573 __LINE__, (int)(node), (int)(len))); \
575 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
577 RExC_offsets[2*(node)] = (len); \
582 #define Set_Node_Length(node,len) \
583 Set_Node_Length_To_R((node)-RExC_emit_start, len)
584 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
585 #define Set_Node_Cur_Length(node) \
586 Set_Node_Length(node, RExC_parse - parse_start)
588 /* Get offsets and lengths */
589 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
590 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
592 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
593 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
594 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
598 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
599 #define EXPERIMENTAL_INPLACESCAN
600 #endif /*RE_TRACK_PATTERN_OFFSETS*/
602 #define DEBUG_STUDYDATA(str,data,depth) \
603 DEBUG_OPTIMISE_MORE_r(if(data){ \
604 PerlIO_printf(Perl_debug_log, \
605 "%*s" str "Pos:%"IVdf"/%"IVdf \
606 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
607 (int)(depth)*2, "", \
608 (IV)((data)->pos_min), \
609 (IV)((data)->pos_delta), \
610 (UV)((data)->flags), \
611 (IV)((data)->whilem_c), \
612 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
613 is_inf ? "INF " : "" \
615 if ((data)->last_found) \
616 PerlIO_printf(Perl_debug_log, \
617 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
618 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
619 SvPVX_const((data)->last_found), \
620 (IV)((data)->last_end), \
621 (IV)((data)->last_start_min), \
622 (IV)((data)->last_start_max), \
623 ((data)->longest && \
624 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
625 SvPVX_const((data)->longest_fixed), \
626 (IV)((data)->offset_fixed), \
627 ((data)->longest && \
628 (data)->longest==&((data)->longest_float)) ? "*" : "", \
629 SvPVX_const((data)->longest_float), \
630 (IV)((data)->offset_float_min), \
631 (IV)((data)->offset_float_max) \
633 PerlIO_printf(Perl_debug_log,"\n"); \
636 static void clear_re(pTHX_ void *r);
638 /* Mark that we cannot extend a found fixed substring at this point.
639 Update the longest found anchored substring and the longest found
640 floating substrings if needed. */
643 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
645 const STRLEN l = CHR_SVLEN(data->last_found);
646 const STRLEN old_l = CHR_SVLEN(*data->longest);
647 GET_RE_DEBUG_FLAGS_DECL;
649 PERL_ARGS_ASSERT_SCAN_COMMIT;
651 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
652 SvSetMagicSV(*data->longest, data->last_found);
653 if (*data->longest == data->longest_fixed) {
654 data->offset_fixed = l ? data->last_start_min : data->pos_min;
655 if (data->flags & SF_BEFORE_EOL)
657 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
659 data->flags &= ~SF_FIX_BEFORE_EOL;
660 data->minlen_fixed=minlenp;
661 data->lookbehind_fixed=0;
663 else { /* *data->longest == data->longest_float */
664 data->offset_float_min = l ? data->last_start_min : data->pos_min;
665 data->offset_float_max = (l
666 ? data->last_start_max
667 : data->pos_min + data->pos_delta);
668 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
669 data->offset_float_max = I32_MAX;
670 if (data->flags & SF_BEFORE_EOL)
672 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
674 data->flags &= ~SF_FL_BEFORE_EOL;
675 data->minlen_float=minlenp;
676 data->lookbehind_float=0;
679 SvCUR_set(data->last_found, 0);
681 SV * const sv = data->last_found;
682 if (SvUTF8(sv) && SvMAGICAL(sv)) {
683 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
689 data->flags &= ~SF_BEFORE_EOL;
690 DEBUG_STUDYDATA("commit: ",data,0);
693 /* Can match anything (initialization) */
695 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
697 PERL_ARGS_ASSERT_CL_ANYTHING;
699 ANYOF_CLASS_ZERO(cl);
700 ANYOF_BITMAP_SETALL(cl);
701 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
703 cl->flags |= ANYOF_LOCALE;
706 /* Can match anything (initialization) */
708 S_cl_is_anything(const struct regnode_charclass_class *cl)
712 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
714 for (value = 0; value <= ANYOF_MAX; value += 2)
715 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
717 if (!(cl->flags & ANYOF_UNICODE_ALL))
719 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
724 /* Can match anything (initialization) */
726 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
728 PERL_ARGS_ASSERT_CL_INIT;
730 Zero(cl, 1, struct regnode_charclass_class);
732 cl_anything(pRExC_state, cl);
736 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
738 PERL_ARGS_ASSERT_CL_INIT_ZERO;
740 Zero(cl, 1, struct regnode_charclass_class);
742 cl_anything(pRExC_state, cl);
744 cl->flags |= ANYOF_LOCALE;
747 /* 'And' a given class with another one. Can create false positives */
748 /* We assume that cl is not inverted */
750 S_cl_and(struct regnode_charclass_class *cl,
751 const struct regnode_charclass_class *and_with)
753 PERL_ARGS_ASSERT_CL_AND;
755 assert(and_with->type == ANYOF);
756 if (!(and_with->flags & ANYOF_CLASS)
757 && !(cl->flags & ANYOF_CLASS)
758 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
759 && !(and_with->flags & ANYOF_FOLD)
760 && !(cl->flags & ANYOF_FOLD)) {
763 if (and_with->flags & ANYOF_INVERT)
764 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
765 cl->bitmap[i] &= ~and_with->bitmap[i];
767 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
768 cl->bitmap[i] &= and_with->bitmap[i];
769 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
770 if (!(and_with->flags & ANYOF_EOS))
771 cl->flags &= ~ANYOF_EOS;
773 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
774 !(and_with->flags & ANYOF_INVERT)) {
775 cl->flags &= ~ANYOF_UNICODE_ALL;
776 cl->flags |= ANYOF_UNICODE;
777 ARG_SET(cl, ARG(and_with));
779 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
780 !(and_with->flags & ANYOF_INVERT))
781 cl->flags &= ~ANYOF_UNICODE_ALL;
782 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
783 !(and_with->flags & ANYOF_INVERT))
784 cl->flags &= ~ANYOF_UNICODE;
787 /* 'OR' a given class with another one. Can create false positives */
788 /* We assume that cl is not inverted */
790 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
792 PERL_ARGS_ASSERT_CL_OR;
794 if (or_with->flags & ANYOF_INVERT) {
796 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
797 * <= (B1 | !B2) | (CL1 | !CL2)
798 * which is wasteful if CL2 is small, but we ignore CL2:
799 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
800 * XXXX Can we handle case-fold? Unclear:
801 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
802 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
804 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
805 && !(or_with->flags & ANYOF_FOLD)
806 && !(cl->flags & ANYOF_FOLD) ) {
809 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
810 cl->bitmap[i] |= ~or_with->bitmap[i];
811 } /* XXXX: logic is complicated otherwise */
813 cl_anything(pRExC_state, cl);
816 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
817 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
818 && (!(or_with->flags & ANYOF_FOLD)
819 || (cl->flags & ANYOF_FOLD)) ) {
822 /* OR char bitmap and class bitmap separately */
823 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
824 cl->bitmap[i] |= or_with->bitmap[i];
825 if (or_with->flags & ANYOF_CLASS) {
826 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
827 cl->classflags[i] |= or_with->classflags[i];
828 cl->flags |= ANYOF_CLASS;
831 else { /* XXXX: logic is complicated, leave it along for a moment. */
832 cl_anything(pRExC_state, cl);
835 if (or_with->flags & ANYOF_EOS)
836 cl->flags |= ANYOF_EOS;
838 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
839 ARG(cl) != ARG(or_with)) {
840 cl->flags |= ANYOF_UNICODE_ALL;
841 cl->flags &= ~ANYOF_UNICODE;
843 if (or_with->flags & ANYOF_UNICODE_ALL) {
844 cl->flags |= ANYOF_UNICODE_ALL;
845 cl->flags &= ~ANYOF_UNICODE;
849 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
850 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
851 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
852 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
857 dump_trie(trie,widecharmap,revcharmap)
858 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
859 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
861 These routines dump out a trie in a somewhat readable format.
862 The _interim_ variants are used for debugging the interim
863 tables that are used to generate the final compressed
864 representation which is what dump_trie expects.
866 Part of the reason for their existance is to provide a form
867 of documentation as to how the different representations function.
872 Dumps the final compressed table form of the trie to Perl_debug_log.
873 Used for debugging make_trie().
877 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
878 AV *revcharmap, U32 depth)
881 SV *sv=sv_newmortal();
882 int colwidth= widecharmap ? 6 : 4;
883 GET_RE_DEBUG_FLAGS_DECL;
885 PERL_ARGS_ASSERT_DUMP_TRIE;
887 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
888 (int)depth * 2 + 2,"",
889 "Match","Base","Ofs" );
891 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
892 SV ** const tmp = av_fetch( revcharmap, state, 0);
894 PerlIO_printf( Perl_debug_log, "%*s",
896 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
897 PL_colors[0], PL_colors[1],
898 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
899 PERL_PV_ESCAPE_FIRSTCHAR
904 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
905 (int)depth * 2 + 2,"");
907 for( state = 0 ; state < trie->uniquecharcount ; state++ )
908 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
909 PerlIO_printf( Perl_debug_log, "\n");
911 for( state = 1 ; state < trie->statecount ; state++ ) {
912 const U32 base = trie->states[ state ].trans.base;
914 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
916 if ( trie->states[ state ].wordnum ) {
917 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
919 PerlIO_printf( Perl_debug_log, "%6s", "" );
922 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
927 while( ( base + ofs < trie->uniquecharcount ) ||
928 ( base + ofs - trie->uniquecharcount < trie->lasttrans
929 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
932 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
934 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
935 if ( ( base + ofs >= trie->uniquecharcount ) &&
936 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
937 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
939 PerlIO_printf( Perl_debug_log, "%*"UVXf,
941 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
943 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
947 PerlIO_printf( Perl_debug_log, "]");
950 PerlIO_printf( Perl_debug_log, "\n" );
954 Dumps a fully constructed but uncompressed trie in list form.
955 List tries normally only are used for construction when the number of
956 possible chars (trie->uniquecharcount) is very high.
957 Used for debugging make_trie().
960 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
961 HV *widecharmap, AV *revcharmap, U32 next_alloc,
965 SV *sv=sv_newmortal();
966 int colwidth= widecharmap ? 6 : 4;
967 GET_RE_DEBUG_FLAGS_DECL;
969 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
971 /* print out the table precompression. */
972 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
973 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
974 "------:-----+-----------------\n" );
976 for( state=1 ; state < next_alloc ; state ++ ) {
979 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
980 (int)depth * 2 + 2,"", (UV)state );
981 if ( ! trie->states[ state ].wordnum ) {
982 PerlIO_printf( Perl_debug_log, "%5s| ","");
984 PerlIO_printf( Perl_debug_log, "W%4x| ",
985 trie->states[ state ].wordnum
988 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
989 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
991 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
993 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
994 PL_colors[0], PL_colors[1],
995 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
996 PERL_PV_ESCAPE_FIRSTCHAR
998 TRIE_LIST_ITEM(state,charid).forid,
999 (UV)TRIE_LIST_ITEM(state,charid).newstate
1002 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1003 (int)((depth * 2) + 14), "");
1006 PerlIO_printf( Perl_debug_log, "\n");
1011 Dumps a fully constructed but uncompressed trie in table form.
1012 This is the normal DFA style state transition table, with a few
1013 twists to facilitate compression later.
1014 Used for debugging make_trie().
1017 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1018 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1023 SV *sv=sv_newmortal();
1024 int colwidth= widecharmap ? 6 : 4;
1025 GET_RE_DEBUG_FLAGS_DECL;
1027 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1030 print out the table precompression so that we can do a visual check
1031 that they are identical.
1034 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1036 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1037 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1039 PerlIO_printf( Perl_debug_log, "%*s",
1041 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1042 PL_colors[0], PL_colors[1],
1043 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1044 PERL_PV_ESCAPE_FIRSTCHAR
1050 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1052 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1053 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1056 PerlIO_printf( Perl_debug_log, "\n" );
1058 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1060 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1061 (int)depth * 2 + 2,"",
1062 (UV)TRIE_NODENUM( state ) );
1064 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1065 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1067 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1069 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1071 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1072 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1074 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1075 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1082 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1083 startbranch: the first branch in the whole branch sequence
1084 first : start branch of sequence of branch-exact nodes.
1085 May be the same as startbranch
1086 last : Thing following the last branch.
1087 May be the same as tail.
1088 tail : item following the branch sequence
1089 count : words in the sequence
1090 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1091 depth : indent depth
1093 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1095 A trie is an N'ary tree where the branches are determined by digital
1096 decomposition of the key. IE, at the root node you look up the 1st character and
1097 follow that branch repeat until you find the end of the branches. Nodes can be
1098 marked as "accepting" meaning they represent a complete word. Eg:
1102 would convert into the following structure. Numbers represent states, letters
1103 following numbers represent valid transitions on the letter from that state, if
1104 the number is in square brackets it represents an accepting state, otherwise it
1105 will be in parenthesis.
1107 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1111 (1) +-i->(6)-+-s->[7]
1113 +-s->(3)-+-h->(4)-+-e->[5]
1115 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1117 This shows that when matching against the string 'hers' we will begin at state 1
1118 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1119 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1120 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1121 single traverse. We store a mapping from accepting to state to which word was
1122 matched, and then when we have multiple possibilities we try to complete the
1123 rest of the regex in the order in which they occured in the alternation.
1125 The only prior NFA like behaviour that would be changed by the TRIE support is
1126 the silent ignoring of duplicate alternations which are of the form:
1128 / (DUPE|DUPE) X? (?{ ... }) Y /x
1130 Thus EVAL blocks follwing a trie may be called a different number of times with
1131 and without the optimisation. With the optimisations dupes will be silently
1132 ignored. This inconsistant behaviour of EVAL type nodes is well established as
1133 the following demonstrates:
1135 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1137 which prints out 'word' three times, but
1139 'words'=~/(word|word|word)(?{ print $1 })S/
1141 which doesnt print it out at all. This is due to other optimisations kicking in.
1143 Example of what happens on a structural level:
1145 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1147 1: CURLYM[1] {1,32767}(18)
1158 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1159 and should turn into:
1161 1: CURLYM[1] {1,32767}(18)
1163 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1171 Cases where tail != last would be like /(?foo|bar)baz/:
1181 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1182 and would end up looking like:
1185 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1192 d = uvuni_to_utf8_flags(d, uv, 0);
1194 is the recommended Unicode-aware way of saying
1199 #define TRIE_STORE_REVCHAR \
1202 SV *zlopp = newSV(2); \
1203 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1204 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
1205 SvCUR_set(zlopp, kapow - flrbbbbb); \
1208 av_push(revcharmap, zlopp); \
1210 char ooooff = (char)uvc; \
1211 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1215 #define TRIE_READ_CHAR STMT_START { \
1219 if ( foldlen > 0 ) { \
1220 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1225 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1226 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1227 foldlen -= UNISKIP( uvc ); \
1228 scan = foldbuf + UNISKIP( uvc ); \
1231 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1241 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1242 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1243 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1244 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1246 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1247 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1248 TRIE_LIST_CUR( state )++; \
1251 #define TRIE_LIST_NEW(state) STMT_START { \
1252 Newxz( trie->states[ state ].trans.list, \
1253 4, reg_trie_trans_le ); \
1254 TRIE_LIST_CUR( state ) = 1; \
1255 TRIE_LIST_LEN( state ) = 4; \
1258 #define TRIE_HANDLE_WORD(state) STMT_START { \
1259 U16 dupe= trie->states[ state ].wordnum; \
1260 regnode * const noper_next = regnext( noper ); \
1262 if (trie->wordlen) \
1263 trie->wordlen[ curword ] = wordlen; \
1265 /* store the word for dumping */ \
1267 if (OP(noper) != NOTHING) \
1268 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1270 tmp = newSVpvn_utf8( "", 0, UTF ); \
1271 av_push( trie_words, tmp ); \
1276 if ( noper_next < tail ) { \
1278 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1279 trie->jump[curword] = (U16)(noper_next - convert); \
1281 jumper = noper_next; \
1283 nextbranch= regnext(cur); \
1287 /* So it's a dupe. This means we need to maintain a */\
1288 /* linked-list from the first to the next. */\
1289 /* we only allocate the nextword buffer when there */\
1290 /* a dupe, so first time we have to do the allocation */\
1291 if (!trie->nextword) \
1292 trie->nextword = (U16 *) \
1293 PerlMemShared_calloc( word_count + 1, sizeof(U16)); \
1294 while ( trie->nextword[dupe] ) \
1295 dupe= trie->nextword[dupe]; \
1296 trie->nextword[dupe]= curword; \
1298 /* we haven't inserted this word yet. */ \
1299 trie->states[ state ].wordnum = curword; \
1304 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1305 ( ( base + charid >= ucharcount \
1306 && base + charid < ubound \
1307 && state == trie->trans[ base - ucharcount + charid ].check \
1308 && trie->trans[ base - ucharcount + charid ].next ) \
1309 ? trie->trans[ base - ucharcount + charid ].next \
1310 : ( state==1 ? special : 0 ) \
1314 #define MADE_JUMP_TRIE 2
1315 #define MADE_EXACT_TRIE 4
1318 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1321 /* first pass, loop through and scan words */
1322 reg_trie_data *trie;
1323 HV *widecharmap = NULL;
1324 AV *revcharmap = newAV();
1326 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1331 regnode *jumper = NULL;
1332 regnode *nextbranch = NULL;
1333 regnode *convert = NULL;
1334 /* we just use folder as a flag in utf8 */
1335 const U8 * const folder = ( flags == EXACTF
1337 : ( flags == EXACTFL
1344 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1345 AV *trie_words = NULL;
1346 /* along with revcharmap, this only used during construction but both are
1347 * useful during debugging so we store them in the struct when debugging.
1350 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1351 STRLEN trie_charcount=0;
1353 SV *re_trie_maxbuff;
1354 GET_RE_DEBUG_FLAGS_DECL;
1356 PERL_ARGS_ASSERT_MAKE_TRIE;
1358 PERL_UNUSED_ARG(depth);
1361 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1363 trie->startstate = 1;
1364 trie->wordcount = word_count;
1365 RExC_rxi->data->data[ data_slot ] = (void*)trie;
1366 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1367 if (!(UTF && folder))
1368 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1370 trie_words = newAV();
1373 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1374 if (!SvIOK(re_trie_maxbuff)) {
1375 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1378 PerlIO_printf( Perl_debug_log,
1379 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1380 (int)depth * 2 + 2, "",
1381 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1382 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1386 /* Find the node we are going to overwrite */
1387 if ( first == startbranch && OP( last ) != BRANCH ) {
1388 /* whole branch chain */
1391 /* branch sub-chain */
1392 convert = NEXTOPER( first );
1395 /* -- First loop and Setup --
1397 We first traverse the branches and scan each word to determine if it
1398 contains widechars, and how many unique chars there are, this is
1399 important as we have to build a table with at least as many columns as we
1402 We use an array of integers to represent the character codes 0..255
1403 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1404 native representation of the character value as the key and IV's for the
1407 *TODO* If we keep track of how many times each character is used we can
1408 remap the columns so that the table compression later on is more
1409 efficient in terms of memory by ensuring most common value is in the
1410 middle and the least common are on the outside. IMO this would be better
1411 than a most to least common mapping as theres a decent chance the most
1412 common letter will share a node with the least common, meaning the node
1413 will not be compressable. With a middle is most common approach the worst
1414 case is when we have the least common nodes twice.
1418 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1419 regnode * const noper = NEXTOPER( cur );
1420 const U8 *uc = (U8*)STRING( noper );
1421 const U8 * const e = uc + STR_LEN( noper );
1423 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1424 const U8 *scan = (U8*)NULL;
1425 U32 wordlen = 0; /* required init */
1427 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1429 if (OP(noper) == NOTHING) {
1433 if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1434 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1435 regardless of encoding */
1437 for ( ; uc < e ; uc += len ) {
1438 TRIE_CHARCOUNT(trie)++;
1442 if ( !trie->charmap[ uvc ] ) {
1443 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1445 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1449 /* store the codepoint in the bitmap, and if its ascii
1450 also store its folded equivelent. */
1451 TRIE_BITMAP_SET(trie,uvc);
1453 /* store the folded codepoint */
1454 if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1457 /* store first byte of utf8 representation of
1458 codepoints in the 127 < uvc < 256 range */
1459 if (127 < uvc && uvc < 192) {
1460 TRIE_BITMAP_SET(trie,194);
1461 } else if (191 < uvc ) {
1462 TRIE_BITMAP_SET(trie,195);
1463 /* && uvc < 256 -- we know uvc is < 256 already */
1466 set_bit = 0; /* We've done our bit :-) */
1471 widecharmap = newHV();
1473 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1476 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1478 if ( !SvTRUE( *svpp ) ) {
1479 sv_setiv( *svpp, ++trie->uniquecharcount );
1484 if( cur == first ) {
1487 } else if (chars < trie->minlen) {
1489 } else if (chars > trie->maxlen) {
1493 } /* end first pass */
1494 DEBUG_TRIE_COMPILE_r(
1495 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1496 (int)depth * 2 + 2,"",
1497 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1498 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1499 (int)trie->minlen, (int)trie->maxlen )
1501 trie->wordlen = (U32 *) PerlMemShared_calloc( word_count, sizeof(U32) );
1504 We now know what we are dealing with in terms of unique chars and
1505 string sizes so we can calculate how much memory a naive
1506 representation using a flat table will take. If it's over a reasonable
1507 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1508 conservative but potentially much slower representation using an array
1511 At the end we convert both representations into the same compressed
1512 form that will be used in regexec.c for matching with. The latter
1513 is a form that cannot be used to construct with but has memory
1514 properties similar to the list form and access properties similar
1515 to the table form making it both suitable for fast searches and
1516 small enough that its feasable to store for the duration of a program.
1518 See the comment in the code where the compressed table is produced
1519 inplace from the flat tabe representation for an explanation of how
1520 the compression works.
1525 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1527 Second Pass -- Array Of Lists Representation
1529 Each state will be represented by a list of charid:state records
1530 (reg_trie_trans_le) the first such element holds the CUR and LEN
1531 points of the allocated array. (See defines above).
1533 We build the initial structure using the lists, and then convert
1534 it into the compressed table form which allows faster lookups
1535 (but cant be modified once converted).
1538 STRLEN transcount = 1;
1540 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1541 "%*sCompiling trie using list compiler\n",
1542 (int)depth * 2 + 2, ""));
1544 trie->states = (reg_trie_state *)
1545 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1546 sizeof(reg_trie_state) );
1550 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1552 regnode * const noper = NEXTOPER( cur );
1553 U8 *uc = (U8*)STRING( noper );
1554 const U8 * const e = uc + STR_LEN( noper );
1555 U32 state = 1; /* required init */
1556 U16 charid = 0; /* sanity init */
1557 U8 *scan = (U8*)NULL; /* sanity init */
1558 STRLEN foldlen = 0; /* required init */
1559 U32 wordlen = 0; /* required init */
1560 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1562 if (OP(noper) != NOTHING) {
1563 for ( ; uc < e ; uc += len ) {
1568 charid = trie->charmap[ uvc ];
1570 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1574 charid=(U16)SvIV( *svpp );
1577 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1584 if ( !trie->states[ state ].trans.list ) {
1585 TRIE_LIST_NEW( state );
1587 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1588 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1589 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1594 newstate = next_alloc++;
1595 TRIE_LIST_PUSH( state, charid, newstate );
1600 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1604 TRIE_HANDLE_WORD(state);
1606 } /* end second pass */
1608 /* next alloc is the NEXT state to be allocated */
1609 trie->statecount = next_alloc;
1610 trie->states = (reg_trie_state *)
1611 PerlMemShared_realloc( trie->states,
1613 * sizeof(reg_trie_state) );
1615 /* and now dump it out before we compress it */
1616 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1617 revcharmap, next_alloc,
1621 trie->trans = (reg_trie_trans *)
1622 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1629 for( state=1 ; state < next_alloc ; state ++ ) {
1633 DEBUG_TRIE_COMPILE_MORE_r(
1634 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1638 if (trie->states[state].trans.list) {
1639 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1643 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1644 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1645 if ( forid < minid ) {
1647 } else if ( forid > maxid ) {
1651 if ( transcount < tp + maxid - minid + 1) {
1653 trie->trans = (reg_trie_trans *)
1654 PerlMemShared_realloc( trie->trans,
1656 * sizeof(reg_trie_trans) );
1657 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1659 base = trie->uniquecharcount + tp - minid;
1660 if ( maxid == minid ) {
1662 for ( ; zp < tp ; zp++ ) {
1663 if ( ! trie->trans[ zp ].next ) {
1664 base = trie->uniquecharcount + zp - minid;
1665 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1666 trie->trans[ zp ].check = state;
1672 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1673 trie->trans[ tp ].check = state;
1678 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1679 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1680 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1681 trie->trans[ tid ].check = state;
1683 tp += ( maxid - minid + 1 );
1685 Safefree(trie->states[ state ].trans.list);
1688 DEBUG_TRIE_COMPILE_MORE_r(
1689 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1692 trie->states[ state ].trans.base=base;
1694 trie->lasttrans = tp + 1;
1698 Second Pass -- Flat Table Representation.
1700 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1701 We know that we will need Charcount+1 trans at most to store the data
1702 (one row per char at worst case) So we preallocate both structures
1703 assuming worst case.
1705 We then construct the trie using only the .next slots of the entry
1708 We use the .check field of the first entry of the node temporarily to
1709 make compression both faster and easier by keeping track of how many non
1710 zero fields are in the node.
1712 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1715 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1716 number representing the first entry of the node, and state as a
1717 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1718 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1719 are 2 entrys per node. eg:
1727 The table is internally in the right hand, idx form. However as we also
1728 have to deal with the states array which is indexed by nodenum we have to
1729 use TRIE_NODENUM() to convert.
1732 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1733 "%*sCompiling trie using table compiler\n",
1734 (int)depth * 2 + 2, ""));
1736 trie->trans = (reg_trie_trans *)
1737 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1738 * trie->uniquecharcount + 1,
1739 sizeof(reg_trie_trans) );
1740 trie->states = (reg_trie_state *)
1741 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1742 sizeof(reg_trie_state) );
1743 next_alloc = trie->uniquecharcount + 1;
1746 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1748 regnode * const noper = NEXTOPER( cur );
1749 const U8 *uc = (U8*)STRING( noper );
1750 const U8 * const e = uc + STR_LEN( noper );
1752 U32 state = 1; /* required init */
1754 U16 charid = 0; /* sanity init */
1755 U32 accept_state = 0; /* sanity init */
1756 U8 *scan = (U8*)NULL; /* sanity init */
1758 STRLEN foldlen = 0; /* required init */
1759 U32 wordlen = 0; /* required init */
1760 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1762 if ( OP(noper) != NOTHING ) {
1763 for ( ; uc < e ; uc += len ) {
1768 charid = trie->charmap[ uvc ];
1770 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1771 charid = svpp ? (U16)SvIV(*svpp) : 0;
1775 if ( !trie->trans[ state + charid ].next ) {
1776 trie->trans[ state + charid ].next = next_alloc;
1777 trie->trans[ state ].check++;
1778 next_alloc += trie->uniquecharcount;
1780 state = trie->trans[ state + charid ].next;
1782 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1784 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1787 accept_state = TRIE_NODENUM( state );
1788 TRIE_HANDLE_WORD(accept_state);
1790 } /* end second pass */
1792 /* and now dump it out before we compress it */
1793 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1795 next_alloc, depth+1));
1799 * Inplace compress the table.*
1801 For sparse data sets the table constructed by the trie algorithm will
1802 be mostly 0/FAIL transitions or to put it another way mostly empty.
1803 (Note that leaf nodes will not contain any transitions.)
1805 This algorithm compresses the tables by eliminating most such
1806 transitions, at the cost of a modest bit of extra work during lookup:
1808 - Each states[] entry contains a .base field which indicates the
1809 index in the state[] array wheres its transition data is stored.
1811 - If .base is 0 there are no valid transitions from that node.
1813 - If .base is nonzero then charid is added to it to find an entry in
1816 -If trans[states[state].base+charid].check!=state then the
1817 transition is taken to be a 0/Fail transition. Thus if there are fail
1818 transitions at the front of the node then the .base offset will point
1819 somewhere inside the previous nodes data (or maybe even into a node
1820 even earlier), but the .check field determines if the transition is
1824 The following process inplace converts the table to the compressed
1825 table: We first do not compress the root node 1,and mark its all its
1826 .check pointers as 1 and set its .base pointer as 1 as well. This
1827 allows to do a DFA construction from the compressed table later, and
1828 ensures that any .base pointers we calculate later are greater than
1831 - We set 'pos' to indicate the first entry of the second node.
1833 - We then iterate over the columns of the node, finding the first and
1834 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1835 and set the .check pointers accordingly, and advance pos
1836 appropriately and repreat for the next node. Note that when we copy
1837 the next pointers we have to convert them from the original
1838 NODEIDX form to NODENUM form as the former is not valid post
1841 - If a node has no transitions used we mark its base as 0 and do not
1842 advance the pos pointer.
1844 - If a node only has one transition we use a second pointer into the
1845 structure to fill in allocated fail transitions from other states.
1846 This pointer is independent of the main pointer and scans forward
1847 looking for null transitions that are allocated to a state. When it
1848 finds one it writes the single transition into the "hole". If the
1849 pointer doesnt find one the single transition is appended as normal.
1851 - Once compressed we can Renew/realloc the structures to release the
1854 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1855 specifically Fig 3.47 and the associated pseudocode.
1859 const U32 laststate = TRIE_NODENUM( next_alloc );
1862 trie->statecount = laststate;
1864 for ( state = 1 ; state < laststate ; state++ ) {
1866 const U32 stateidx = TRIE_NODEIDX( state );
1867 const U32 o_used = trie->trans[ stateidx ].check;
1868 U32 used = trie->trans[ stateidx ].check;
1869 trie->trans[ stateidx ].check = 0;
1871 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1872 if ( flag || trie->trans[ stateidx + charid ].next ) {
1873 if ( trie->trans[ stateidx + charid ].next ) {
1875 for ( ; zp < pos ; zp++ ) {
1876 if ( ! trie->trans[ zp ].next ) {
1880 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1881 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1882 trie->trans[ zp ].check = state;
1883 if ( ++zp > pos ) pos = zp;
1890 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1892 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1893 trie->trans[ pos ].check = state;
1898 trie->lasttrans = pos + 1;
1899 trie->states = (reg_trie_state *)
1900 PerlMemShared_realloc( trie->states, laststate
1901 * sizeof(reg_trie_state) );
1902 DEBUG_TRIE_COMPILE_MORE_r(
1903 PerlIO_printf( Perl_debug_log,
1904 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1905 (int)depth * 2 + 2,"",
1906 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1909 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1912 } /* end table compress */
1914 DEBUG_TRIE_COMPILE_MORE_r(
1915 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1916 (int)depth * 2 + 2, "",
1917 (UV)trie->statecount,
1918 (UV)trie->lasttrans)
1920 /* resize the trans array to remove unused space */
1921 trie->trans = (reg_trie_trans *)
1922 PerlMemShared_realloc( trie->trans, trie->lasttrans
1923 * sizeof(reg_trie_trans) );
1925 /* and now dump out the compressed format */
1926 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
1928 { /* Modify the program and insert the new TRIE node*/
1929 U8 nodetype =(U8)(flags & 0xFF);
1933 regnode *optimize = NULL;
1934 #ifdef RE_TRACK_PATTERN_OFFSETS
1937 U32 mjd_nodelen = 0;
1938 #endif /* RE_TRACK_PATTERN_OFFSETS */
1939 #endif /* DEBUGGING */
1941 This means we convert either the first branch or the first Exact,
1942 depending on whether the thing following (in 'last') is a branch
1943 or not and whther first is the startbranch (ie is it a sub part of
1944 the alternation or is it the whole thing.)
1945 Assuming its a sub part we conver the EXACT otherwise we convert
1946 the whole branch sequence, including the first.
1948 /* Find the node we are going to overwrite */
1949 if ( first != startbranch || OP( last ) == BRANCH ) {
1950 /* branch sub-chain */
1951 NEXT_OFF( first ) = (U16)(last - first);
1952 #ifdef RE_TRACK_PATTERN_OFFSETS
1954 mjd_offset= Node_Offset((convert));
1955 mjd_nodelen= Node_Length((convert));
1958 /* whole branch chain */
1960 #ifdef RE_TRACK_PATTERN_OFFSETS
1963 const regnode *nop = NEXTOPER( convert );
1964 mjd_offset= Node_Offset((nop));
1965 mjd_nodelen= Node_Length((nop));
1969 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1970 (int)depth * 2 + 2, "",
1971 (UV)mjd_offset, (UV)mjd_nodelen)
1974 /* But first we check to see if there is a common prefix we can
1975 split out as an EXACT and put in front of the TRIE node. */
1976 trie->startstate= 1;
1977 if ( trie->bitmap && !widecharmap && !trie->jump ) {
1979 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
1983 const U32 base = trie->states[ state ].trans.base;
1985 if ( trie->states[state].wordnum )
1988 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1989 if ( ( base + ofs >= trie->uniquecharcount ) &&
1990 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1991 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1993 if ( ++count > 1 ) {
1994 SV **tmp = av_fetch( revcharmap, ofs, 0);
1995 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
1996 if ( state == 1 ) break;
1998 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2000 PerlIO_printf(Perl_debug_log,
2001 "%*sNew Start State=%"UVuf" Class: [",
2002 (int)depth * 2 + 2, "",
2005 SV ** const tmp = av_fetch( revcharmap, idx, 0);
2006 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2008 TRIE_BITMAP_SET(trie,*ch);
2010 TRIE_BITMAP_SET(trie, folder[ *ch ]);
2012 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2016 TRIE_BITMAP_SET(trie,*ch);
2018 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2019 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2025 SV **tmp = av_fetch( revcharmap, idx, 0);
2027 char *ch = SvPV( *tmp, len );
2029 SV *sv=sv_newmortal();
2030 PerlIO_printf( Perl_debug_log,
2031 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2032 (int)depth * 2 + 2, "",
2034 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2035 PL_colors[0], PL_colors[1],
2036 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2037 PERL_PV_ESCAPE_FIRSTCHAR
2042 OP( convert ) = nodetype;
2043 str=STRING(convert);
2046 STR_LEN(convert) += len;
2052 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2058 regnode *n = convert+NODE_SZ_STR(convert);
2059 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2060 trie->startstate = state;
2061 trie->minlen -= (state - 1);
2062 trie->maxlen -= (state - 1);
2064 /* At least the UNICOS C compiler choked on this
2065 * being argument to DEBUG_r(), so let's just have
2068 #ifdef PERL_EXT_RE_BUILD
2074 regnode *fix = convert;
2075 U32 word = trie->wordcount;
2077 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2078 while( ++fix < n ) {
2079 Set_Node_Offset_Length(fix, 0, 0);
2082 SV ** const tmp = av_fetch( trie_words, word, 0 );
2084 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2085 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2087 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2095 NEXT_OFF(convert) = (U16)(tail - convert);
2096 DEBUG_r(optimize= n);
2102 if ( trie->maxlen ) {
2103 NEXT_OFF( convert ) = (U16)(tail - convert);
2104 ARG_SET( convert, data_slot );
2105 /* Store the offset to the first unabsorbed branch in
2106 jump[0], which is otherwise unused by the jump logic.
2107 We use this when dumping a trie and during optimisation. */
2109 trie->jump[0] = (U16)(nextbranch - convert);
2112 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
2113 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2115 OP( convert ) = TRIEC;
2116 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2117 PerlMemShared_free(trie->bitmap);
2120 OP( convert ) = TRIE;
2122 /* store the type in the flags */
2123 convert->flags = nodetype;
2127 + regarglen[ OP( convert ) ];
2129 /* XXX We really should free up the resource in trie now,
2130 as we won't use them - (which resources?) dmq */
2132 /* needed for dumping*/
2133 DEBUG_r(if (optimize) {
2134 regnode *opt = convert;
2136 while ( ++opt < optimize) {
2137 Set_Node_Offset_Length(opt,0,0);
2140 Try to clean up some of the debris left after the
2143 while( optimize < jumper ) {
2144 mjd_nodelen += Node_Length((optimize));
2145 OP( optimize ) = OPTIMIZED;
2146 Set_Node_Offset_Length(optimize,0,0);
2149 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2151 } /* end node insert */
2152 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2154 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2155 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2157 SvREFCNT_dec(revcharmap);
2161 : trie->startstate>1
2167 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2169 /* The Trie is constructed and compressed now so we can build a fail array now if its needed
2171 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2172 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2175 We find the fail state for each state in the trie, this state is the longest proper
2176 suffix of the current states 'word' that is also a proper prefix of another word in our
2177 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2178 the DFA not to have to restart after its tried and failed a word at a given point, it
2179 simply continues as though it had been matching the other word in the first place.
2181 'abcdgu'=~/abcdefg|cdgu/
2182 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2183 fail, which would bring use to the state representing 'd' in the second word where we would
2184 try 'g' and succeed, prodceding to match 'cdgu'.
2186 /* add a fail transition */
2187 const U32 trie_offset = ARG(source);
2188 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2190 const U32 ucharcount = trie->uniquecharcount;
2191 const U32 numstates = trie->statecount;
2192 const U32 ubound = trie->lasttrans + ucharcount;
2196 U32 base = trie->states[ 1 ].trans.base;
2199 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2200 GET_RE_DEBUG_FLAGS_DECL;
2202 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2204 PERL_UNUSED_ARG(depth);
2208 ARG_SET( stclass, data_slot );
2209 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2210 RExC_rxi->data->data[ data_slot ] = (void*)aho;
2211 aho->trie=trie_offset;
2212 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2213 Copy( trie->states, aho->states, numstates, reg_trie_state );
2214 Newxz( q, numstates, U32);
2215 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2218 /* initialize fail[0..1] to be 1 so that we always have
2219 a valid final fail state */
2220 fail[ 0 ] = fail[ 1 ] = 1;
2222 for ( charid = 0; charid < ucharcount ; charid++ ) {
2223 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2225 q[ q_write ] = newstate;
2226 /* set to point at the root */
2227 fail[ q[ q_write++ ] ]=1;
2230 while ( q_read < q_write) {
2231 const U32 cur = q[ q_read++ % numstates ];
2232 base = trie->states[ cur ].trans.base;
2234 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2235 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2237 U32 fail_state = cur;
2240 fail_state = fail[ fail_state ];
2241 fail_base = aho->states[ fail_state ].trans.base;
2242 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2244 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2245 fail[ ch_state ] = fail_state;
2246 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2248 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2250 q[ q_write++ % numstates] = ch_state;
2254 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2255 when we fail in state 1, this allows us to use the
2256 charclass scan to find a valid start char. This is based on the principle
2257 that theres a good chance the string being searched contains lots of stuff
2258 that cant be a start char.
2260 fail[ 0 ] = fail[ 1 ] = 0;
2261 DEBUG_TRIE_COMPILE_r({
2262 PerlIO_printf(Perl_debug_log,
2263 "%*sStclass Failtable (%"UVuf" states): 0",
2264 (int)(depth * 2), "", (UV)numstates
2266 for( q_read=1; q_read<numstates; q_read++ ) {
2267 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2269 PerlIO_printf(Perl_debug_log, "\n");
2272 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2277 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2278 * These need to be revisited when a newer toolchain becomes available.
2280 #if defined(__sparc64__) && defined(__GNUC__)
2281 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2282 # undef SPARC64_GCC_WORKAROUND
2283 # define SPARC64_GCC_WORKAROUND 1
2287 #define DEBUG_PEEP(str,scan,depth) \
2288 DEBUG_OPTIMISE_r({if (scan){ \
2289 SV * const mysv=sv_newmortal(); \
2290 regnode *Next = regnext(scan); \
2291 regprop(RExC_rx, mysv, scan); \
2292 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2293 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2294 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2301 #define JOIN_EXACT(scan,min,flags) \
2302 if (PL_regkind[OP(scan)] == EXACT) \
2303 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2306 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2307 /* Merge several consecutive EXACTish nodes into one. */
2308 regnode *n = regnext(scan);
2310 regnode *next = scan + NODE_SZ_STR(scan);
2314 regnode *stop = scan;
2315 GET_RE_DEBUG_FLAGS_DECL;
2317 PERL_UNUSED_ARG(depth);
2320 PERL_ARGS_ASSERT_JOIN_EXACT;
2321 #ifndef EXPERIMENTAL_INPLACESCAN
2322 PERL_UNUSED_ARG(flags);
2323 PERL_UNUSED_ARG(val);
2325 DEBUG_PEEP("join",scan,depth);
2327 /* Skip NOTHING, merge EXACT*. */
2329 ( PL_regkind[OP(n)] == NOTHING ||
2330 (stringok && (OP(n) == OP(scan))))
2332 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2334 if (OP(n) == TAIL || n > next)
2336 if (PL_regkind[OP(n)] == NOTHING) {
2337 DEBUG_PEEP("skip:",n,depth);
2338 NEXT_OFF(scan) += NEXT_OFF(n);
2339 next = n + NODE_STEP_REGNODE;
2346 else if (stringok) {
2347 const unsigned int oldl = STR_LEN(scan);
2348 regnode * const nnext = regnext(n);
2350 DEBUG_PEEP("merg",n,depth);
2353 if (oldl + STR_LEN(n) > U8_MAX)
2355 NEXT_OFF(scan) += NEXT_OFF(n);
2356 STR_LEN(scan) += STR_LEN(n);
2357 next = n + NODE_SZ_STR(n);
2358 /* Now we can overwrite *n : */
2359 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2367 #ifdef EXPERIMENTAL_INPLACESCAN
2368 if (flags && !NEXT_OFF(n)) {
2369 DEBUG_PEEP("atch", val, depth);
2370 if (reg_off_by_arg[OP(n)]) {
2371 ARG_SET(n, val - n);
2374 NEXT_OFF(n) = val - n;
2381 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2383 Two problematic code points in Unicode casefolding of EXACT nodes:
2385 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2386 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2392 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2393 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2395 This means that in case-insensitive matching (or "loose matching",
2396 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2397 length of the above casefolded versions) can match a target string
2398 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2399 This would rather mess up the minimum length computation.
2401 What we'll do is to look for the tail four bytes, and then peek
2402 at the preceding two bytes to see whether we need to decrease
2403 the minimum length by four (six minus two).
2405 Thanks to the design of UTF-8, there cannot be false matches:
2406 A sequence of valid UTF-8 bytes cannot be a subsequence of
2407 another valid sequence of UTF-8 bytes.
2410 char * const s0 = STRING(scan), *s, *t;
2411 char * const s1 = s0 + STR_LEN(scan) - 1;
2412 char * const s2 = s1 - 4;
2413 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2414 const char t0[] = "\xaf\x49\xaf\x42";
2416 const char t0[] = "\xcc\x88\xcc\x81";
2418 const char * const t1 = t0 + 3;
2421 s < s2 && (t = ninstr(s, s1, t0, t1));
2424 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2425 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2427 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2428 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2436 n = scan + NODE_SZ_STR(scan);
2438 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2445 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2449 /* REx optimizer. Converts nodes into quickier variants "in place".
2450 Finds fixed substrings. */
2452 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2453 to the position after last scanned or to NULL. */
2455 #define INIT_AND_WITHP \
2456 assert(!and_withp); \
2457 Newx(and_withp,1,struct regnode_charclass_class); \
2458 SAVEFREEPV(and_withp)
2460 /* this is a chain of data about sub patterns we are processing that
2461 need to be handled seperately/specially in study_chunk. Its so
2462 we can simulate recursion without losing state. */
2464 typedef struct scan_frame {
2465 regnode *last; /* last node to process in this frame */
2466 regnode *next; /* next node to process when last is reached */
2467 struct scan_frame *prev; /*previous frame*/
2468 I32 stop; /* what stopparen do we use */
2472 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2474 #define CASE_SYNST_FNC(nAmE) \
2476 if (flags & SCF_DO_STCLASS_AND) { \
2477 for (value = 0; value < 256; value++) \
2478 if (!is_ ## nAmE ## _cp(value)) \
2479 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2482 for (value = 0; value < 256; value++) \
2483 if (is_ ## nAmE ## _cp(value)) \
2484 ANYOF_BITMAP_SET(data->start_class, value); \
2488 if (flags & SCF_DO_STCLASS_AND) { \
2489 for (value = 0; value < 256; value++) \
2490 if (is_ ## nAmE ## _cp(value)) \
2491 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2494 for (value = 0; value < 256; value++) \
2495 if (!is_ ## nAmE ## _cp(value)) \
2496 ANYOF_BITMAP_SET(data->start_class, value); \
2503 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2504 I32 *minlenp, I32 *deltap,
2509 struct regnode_charclass_class *and_withp,
2510 U32 flags, U32 depth)
2511 /* scanp: Start here (read-write). */
2512 /* deltap: Write maxlen-minlen here. */
2513 /* last: Stop before this one. */
2514 /* data: string data about the pattern */
2515 /* stopparen: treat close N as END */
2516 /* recursed: which subroutines have we recursed into */
2517 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2520 I32 min = 0, pars = 0, code;
2521 regnode *scan = *scanp, *next;
2523 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2524 int is_inf_internal = 0; /* The studied chunk is infinite */
2525 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2526 scan_data_t data_fake;
2527 SV *re_trie_maxbuff = NULL;
2528 regnode *first_non_open = scan;
2529 I32 stopmin = I32_MAX;
2530 scan_frame *frame = NULL;
2531 GET_RE_DEBUG_FLAGS_DECL;
2533 PERL_ARGS_ASSERT_STUDY_CHUNK;
2536 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2540 while (first_non_open && OP(first_non_open) == OPEN)
2541 first_non_open=regnext(first_non_open);
2546 while ( scan && OP(scan) != END && scan < last ){
2547 /* Peephole optimizer: */
2548 DEBUG_STUDYDATA("Peep:", data,depth);
2549 DEBUG_PEEP("Peep",scan,depth);
2550 JOIN_EXACT(scan,&min,0);
2552 /* Follow the next-chain of the current node and optimize
2553 away all the NOTHINGs from it. */
2554 if (OP(scan) != CURLYX) {
2555 const int max = (reg_off_by_arg[OP(scan)]
2557 /* I32 may be smaller than U16 on CRAYs! */
2558 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2559 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2563 /* Skip NOTHING and LONGJMP. */
2564 while ((n = regnext(n))
2565 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2566 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2567 && off + noff < max)
2569 if (reg_off_by_arg[OP(scan)])
2572 NEXT_OFF(scan) = off;
2577 /* The principal pseudo-switch. Cannot be a switch, since we
2578 look into several different things. */
2579 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2580 || OP(scan) == IFTHEN) {
2581 next = regnext(scan);
2583 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2585 if (OP(next) == code || code == IFTHEN) {
2586 /* NOTE - There is similar code to this block below for handling
2587 TRIE nodes on a re-study. If you change stuff here check there
2589 I32 max1 = 0, min1 = I32_MAX, num = 0;
2590 struct regnode_charclass_class accum;
2591 regnode * const startbranch=scan;
2593 if (flags & SCF_DO_SUBSTR)
2594 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2595 if (flags & SCF_DO_STCLASS)
2596 cl_init_zero(pRExC_state, &accum);
2598 while (OP(scan) == code) {
2599 I32 deltanext, minnext, f = 0, fake;
2600 struct regnode_charclass_class this_class;
2603 data_fake.flags = 0;
2605 data_fake.whilem_c = data->whilem_c;
2606 data_fake.last_closep = data->last_closep;
2609 data_fake.last_closep = &fake;
2611 data_fake.pos_delta = delta;
2612 next = regnext(scan);
2613 scan = NEXTOPER(scan);
2615 scan = NEXTOPER(scan);
2616 if (flags & SCF_DO_STCLASS) {
2617 cl_init(pRExC_state, &this_class);
2618 data_fake.start_class = &this_class;
2619 f = SCF_DO_STCLASS_AND;
2621 if (flags & SCF_WHILEM_VISITED_POS)
2622 f |= SCF_WHILEM_VISITED_POS;
2624 /* we suppose the run is continuous, last=next...*/
2625 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2627 stopparen, recursed, NULL, f,depth+1);
2630 if (max1 < minnext + deltanext)
2631 max1 = minnext + deltanext;
2632 if (deltanext == I32_MAX)
2633 is_inf = is_inf_internal = 1;
2635 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2637 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2638 if ( stopmin > minnext)
2639 stopmin = min + min1;
2640 flags &= ~SCF_DO_SUBSTR;
2642 data->flags |= SCF_SEEN_ACCEPT;
2645 if (data_fake.flags & SF_HAS_EVAL)
2646 data->flags |= SF_HAS_EVAL;
2647 data->whilem_c = data_fake.whilem_c;
2649 if (flags & SCF_DO_STCLASS)
2650 cl_or(pRExC_state, &accum, &this_class);
2652 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2654 if (flags & SCF_DO_SUBSTR) {
2655 data->pos_min += min1;
2656 data->pos_delta += max1 - min1;
2657 if (max1 != min1 || is_inf)
2658 data->longest = &(data->longest_float);
2661 delta += max1 - min1;
2662 if (flags & SCF_DO_STCLASS_OR) {
2663 cl_or(pRExC_state, data->start_class, &accum);
2665 cl_and(data->start_class, and_withp);
2666 flags &= ~SCF_DO_STCLASS;
2669 else if (flags & SCF_DO_STCLASS_AND) {
2671 cl_and(data->start_class, &accum);
2672 flags &= ~SCF_DO_STCLASS;
2675 /* Switch to OR mode: cache the old value of
2676 * data->start_class */
2678 StructCopy(data->start_class, and_withp,
2679 struct regnode_charclass_class);
2680 flags &= ~SCF_DO_STCLASS_AND;
2681 StructCopy(&accum, data->start_class,
2682 struct regnode_charclass_class);
2683 flags |= SCF_DO_STCLASS_OR;
2684 data->start_class->flags |= ANYOF_EOS;
2688 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2691 Assuming this was/is a branch we are dealing with: 'scan' now
2692 points at the item that follows the branch sequence, whatever
2693 it is. We now start at the beginning of the sequence and look
2700 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2702 If we can find such a subseqence we need to turn the first
2703 element into a trie and then add the subsequent branch exact
2704 strings to the trie.
2708 1. patterns where the whole set of branch can be converted.
2710 2. patterns where only a subset can be converted.
2712 In case 1 we can replace the whole set with a single regop
2713 for the trie. In case 2 we need to keep the start and end
2716 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2717 becomes BRANCH TRIE; BRANCH X;
2719 There is an additional case, that being where there is a
2720 common prefix, which gets split out into an EXACT like node
2721 preceding the TRIE node.
2723 If x(1..n)==tail then we can do a simple trie, if not we make
2724 a "jump" trie, such that when we match the appropriate word
2725 we "jump" to the appopriate tail node. Essentailly we turn
2726 a nested if into a case structure of sorts.
2731 if (!re_trie_maxbuff) {
2732 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2733 if (!SvIOK(re_trie_maxbuff))
2734 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2736 if ( SvIV(re_trie_maxbuff)>=0 ) {
2738 regnode *first = (regnode *)NULL;
2739 regnode *last = (regnode *)NULL;
2740 regnode *tail = scan;
2745 SV * const mysv = sv_newmortal(); /* for dumping */
2747 /* var tail is used because there may be a TAIL
2748 regop in the way. Ie, the exacts will point to the
2749 thing following the TAIL, but the last branch will
2750 point at the TAIL. So we advance tail. If we
2751 have nested (?:) we may have to move through several
2755 while ( OP( tail ) == TAIL ) {
2756 /* this is the TAIL generated by (?:) */
2757 tail = regnext( tail );
2762 regprop(RExC_rx, mysv, tail );
2763 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2764 (int)depth * 2 + 2, "",
2765 "Looking for TRIE'able sequences. Tail node is: ",
2766 SvPV_nolen_const( mysv )
2772 step through the branches, cur represents each
2773 branch, noper is the first thing to be matched
2774 as part of that branch and noper_next is the
2775 regnext() of that node. if noper is an EXACT
2776 and noper_next is the same as scan (our current
2777 position in the regex) then the EXACT branch is
2778 a possible optimization target. Once we have
2779 two or more consequetive such branches we can
2780 create a trie of the EXACT's contents and stich
2781 it in place. If the sequence represents all of
2782 the branches we eliminate the whole thing and
2783 replace it with a single TRIE. If it is a
2784 subsequence then we need to stitch it in. This
2785 means the first branch has to remain, and needs
2786 to be repointed at the item on the branch chain
2787 following the last branch optimized. This could
2788 be either a BRANCH, in which case the
2789 subsequence is internal, or it could be the
2790 item following the branch sequence in which
2791 case the subsequence is at the end.
2795 /* dont use tail as the end marker for this traverse */
2796 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2797 regnode * const noper = NEXTOPER( cur );
2798 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
2799 regnode * const noper_next = regnext( noper );
2803 regprop(RExC_rx, mysv, cur);
2804 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2805 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2807 regprop(RExC_rx, mysv, noper);
2808 PerlIO_printf( Perl_debug_log, " -> %s",
2809 SvPV_nolen_const(mysv));
2812 regprop(RExC_rx, mysv, noper_next );
2813 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2814 SvPV_nolen_const(mysv));
2816 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2817 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2819 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2820 : PL_regkind[ OP( noper ) ] == EXACT )
2821 || OP(noper) == NOTHING )
2823 && noper_next == tail
2828 if ( !first || optype == NOTHING ) {
2829 if (!first) first = cur;
2830 optype = OP( noper );
2836 Currently we do not believe that the trie logic can
2837 handle case insensitive matching properly when the
2838 pattern is not unicode (thus forcing unicode semantics).
2840 If/when this is fixed the following define can be swapped
2841 in below to fully enable trie logic.
2843 #define TRIE_TYPE_IS_SAFE 1
2846 #define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
2848 if ( last && TRIE_TYPE_IS_SAFE ) {
2849 make_trie( pRExC_state,
2850 startbranch, first, cur, tail, count,
2853 if ( PL_regkind[ OP( noper ) ] == EXACT
2855 && noper_next == tail
2860 optype = OP( noper );
2870 regprop(RExC_rx, mysv, cur);
2871 PerlIO_printf( Perl_debug_log,
2872 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2873 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2877 if ( last && TRIE_TYPE_IS_SAFE ) {
2878 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
2879 #ifdef TRIE_STUDY_OPT
2880 if ( ((made == MADE_EXACT_TRIE &&
2881 startbranch == first)
2882 || ( first_non_open == first )) &&
2884 flags |= SCF_TRIE_RESTUDY;
2885 if ( startbranch == first
2888 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2898 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2899 scan = NEXTOPER(NEXTOPER(scan));
2900 } else /* single branch is optimized. */
2901 scan = NEXTOPER(scan);
2903 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2904 scan_frame *newframe = NULL;
2909 if (OP(scan) != SUSPEND) {
2910 /* set the pointer */
2911 if (OP(scan) == GOSUB) {
2913 RExC_recurse[ARG2L(scan)] = scan;
2914 start = RExC_open_parens[paren-1];
2915 end = RExC_close_parens[paren-1];
2918 start = RExC_rxi->program + 1;
2922 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2923 SAVEFREEPV(recursed);
2925 if (!PAREN_TEST(recursed,paren+1)) {
2926 PAREN_SET(recursed,paren+1);
2927 Newx(newframe,1,scan_frame);
2929 if (flags & SCF_DO_SUBSTR) {
2930 SCAN_COMMIT(pRExC_state,data,minlenp);
2931 data->longest = &(data->longest_float);
2933 is_inf = is_inf_internal = 1;
2934 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2935 cl_anything(pRExC_state, data->start_class);
2936 flags &= ~SCF_DO_STCLASS;
2939 Newx(newframe,1,scan_frame);
2942 end = regnext(scan);
2947 SAVEFREEPV(newframe);
2948 newframe->next = regnext(scan);
2949 newframe->last = last;
2950 newframe->stop = stopparen;
2951 newframe->prev = frame;
2961 else if (OP(scan) == EXACT) {
2962 I32 l = STR_LEN(scan);
2965 const U8 * const s = (U8*)STRING(scan);
2966 l = utf8_length(s, s + l);
2967 uc = utf8_to_uvchr(s, NULL);
2969 uc = *((U8*)STRING(scan));
2972 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2973 /* The code below prefers earlier match for fixed
2974 offset, later match for variable offset. */
2975 if (data->last_end == -1) { /* Update the start info. */
2976 data->last_start_min = data->pos_min;
2977 data->last_start_max = is_inf
2978 ? I32_MAX : data->pos_min + data->pos_delta;
2980 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2982 SvUTF8_on(data->last_found);
2984 SV * const sv = data->last_found;
2985 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2986 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2987 if (mg && mg->mg_len >= 0)
2988 mg->mg_len += utf8_length((U8*)STRING(scan),
2989 (U8*)STRING(scan)+STR_LEN(scan));
2991 data->last_end = data->pos_min + l;
2992 data->pos_min += l; /* As in the first entry. */
2993 data->flags &= ~SF_BEFORE_EOL;
2995 if (flags & SCF_DO_STCLASS_AND) {
2996 /* Check whether it is compatible with what we know already! */
3000 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3001 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3002 && (!(data->start_class->flags & ANYOF_FOLD)
3003 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3006 ANYOF_CLASS_ZERO(data->start_class);
3007 ANYOF_BITMAP_ZERO(data->start_class);
3009 ANYOF_BITMAP_SET(data->start_class, uc);
3010 data->start_class->flags &= ~ANYOF_EOS;
3012 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3014 else if (flags & SCF_DO_STCLASS_OR) {
3015 /* false positive possible if the class is case-folded */
3017 ANYOF_BITMAP_SET(data->start_class, uc);
3019 data->start_class->flags |= ANYOF_UNICODE_ALL;
3020 data->start_class->flags &= ~ANYOF_EOS;
3021 cl_and(data->start_class, and_withp);
3023 flags &= ~SCF_DO_STCLASS;
3025 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3026 I32 l = STR_LEN(scan);
3027 UV uc = *((U8*)STRING(scan));
3029 /* Search for fixed substrings supports EXACT only. */
3030 if (flags & SCF_DO_SUBSTR) {
3032 SCAN_COMMIT(pRExC_state, data, minlenp);
3035 const U8 * const s = (U8 *)STRING(scan);
3036 l = utf8_length(s, s + l);
3037 uc = utf8_to_uvchr(s, NULL);
3040 if (flags & SCF_DO_SUBSTR)
3042 if (flags & SCF_DO_STCLASS_AND) {
3043 /* Check whether it is compatible with what we know already! */
3047 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3048 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3049 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3051 ANYOF_CLASS_ZERO(data->start_class);
3052 ANYOF_BITMAP_ZERO(data->start_class);
3054 ANYOF_BITMAP_SET(data->start_class, uc);
3055 data->start_class->flags &= ~ANYOF_EOS;
3056 data->start_class->flags |= ANYOF_FOLD;
3057 if (OP(scan) == EXACTFL)
3058 data->start_class->flags |= ANYOF_LOCALE;
3061 else if (flags & SCF_DO_STCLASS_OR) {
3062 if (data->start_class->flags & ANYOF_FOLD) {
3063 /* false positive possible if the class is case-folded.
3064 Assume that the locale settings are the same... */
3066 ANYOF_BITMAP_SET(data->start_class, uc);
3067 data->start_class->flags &= ~ANYOF_EOS;
3069 cl_and(data->start_class, and_withp);
3071 flags &= ~SCF_DO_STCLASS;
3073 else if (strchr((const char*)PL_varies,OP(scan))) {
3074 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3075 I32 f = flags, pos_before = 0;
3076 regnode * const oscan = scan;
3077 struct regnode_charclass_class this_class;
3078 struct regnode_charclass_class *oclass = NULL;
3079 I32 next_is_eval = 0;
3081 switch (PL_regkind[OP(scan)]) {
3082 case WHILEM: /* End of (?:...)* . */
3083 scan = NEXTOPER(scan);
3086 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3087 next = NEXTOPER(scan);
3088 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3090 maxcount = REG_INFTY;
3091 next = regnext(scan);
3092 scan = NEXTOPER(scan);
3096 if (flags & SCF_DO_SUBSTR)
3101 if (flags & SCF_DO_STCLASS) {
3103 maxcount = REG_INFTY;
3104 next = regnext(scan);
3105 scan = NEXTOPER(scan);
3108 is_inf = is_inf_internal = 1;
3109 scan = regnext(scan);
3110 if (flags & SCF_DO_SUBSTR) {
3111 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3112 data->longest = &(data->longest_float);
3114 goto optimize_curly_tail;
3116 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3117 && (scan->flags == stopparen))
3122 mincount = ARG1(scan);
3123 maxcount = ARG2(scan);
3125 next = regnext(scan);
3126 if (OP(scan) == CURLYX) {
3127 I32 lp = (data ? *(data->last_closep) : 0);
3128 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3130 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3131 next_is_eval = (OP(scan) == EVAL);
3133 if (flags & SCF_DO_SUBSTR) {
3134 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3135 pos_before = data->pos_min;
3139 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3141 data->flags |= SF_IS_INF;
3143 if (flags & SCF_DO_STCLASS) {
3144 cl_init(pRExC_state, &this_class);
3145 oclass = data->start_class;
3146 data->start_class = &this_class;
3147 f |= SCF_DO_STCLASS_AND;
3148 f &= ~SCF_DO_STCLASS_OR;
3150 /* These are the cases when once a subexpression
3151 fails at a particular position, it cannot succeed
3152 even after backtracking at the enclosing scope.
3154 XXXX what if minimal match and we are at the
3155 initial run of {n,m}? */
3156 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
3157 f &= ~SCF_WHILEM_VISITED_POS;
3159 /* This will finish on WHILEM, setting scan, or on NULL: */
3160 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3161 last, data, stopparen, recursed, NULL,
3163 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3165 if (flags & SCF_DO_STCLASS)
3166 data->start_class = oclass;
3167 if (mincount == 0 || minnext == 0) {
3168 if (flags & SCF_DO_STCLASS_OR) {
3169 cl_or(pRExC_state, data->start_class, &this_class);
3171 else if (flags & SCF_DO_STCLASS_AND) {
3172 /* Switch to OR mode: cache the old value of
3173 * data->start_class */
3175 StructCopy(data->start_class, and_withp,
3176 struct regnode_charclass_class);
3177 flags &= ~SCF_DO_STCLASS_AND;
3178 StructCopy(&this_class, data->start_class,
3179 struct regnode_charclass_class);
3180 flags |= SCF_DO_STCLASS_OR;
3181 data->start_class->flags |= ANYOF_EOS;
3183 } else { /* Non-zero len */
3184 if (flags & SCF_DO_STCLASS_OR) {
3185 cl_or(pRExC_state, data->start_class, &this_class);
3186 cl_and(data->start_class, and_withp);
3188 else if (flags & SCF_DO_STCLASS_AND)
3189 cl_and(data->start_class, &this_class);
3190 flags &= ~SCF_DO_STCLASS;
3192 if (!scan) /* It was not CURLYX, but CURLY. */
3194 if ( /* ? quantifier ok, except for (?{ ... }) */
3195 (next_is_eval || !(mincount == 0 && maxcount == 1))
3196 && (minnext == 0) && (deltanext == 0)
3197 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3198 && maxcount <= REG_INFTY/3) /* Complement check for big count */
3200 ckWARNreg(RExC_parse,
3201 "Quantifier unexpected on zero-length expression");
3204 min += minnext * mincount;
3205 is_inf_internal |= ((maxcount == REG_INFTY
3206 && (minnext + deltanext) > 0)
3207 || deltanext == I32_MAX);
3208 is_inf |= is_inf_internal;
3209 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3211 /* Try powerful optimization CURLYX => CURLYN. */
3212 if ( OP(oscan) == CURLYX && data
3213 && data->flags & SF_IN_PAR
3214 && !(data->flags & SF_HAS_EVAL)
3215 && !deltanext && minnext == 1 ) {
3216 /* Try to optimize to CURLYN. */
3217 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3218 regnode * const nxt1 = nxt;
3225 if (!strchr((const char*)PL_simple,OP(nxt))
3226 && !(PL_regkind[OP(nxt)] == EXACT
3227 && STR_LEN(nxt) == 1))
3233 if (OP(nxt) != CLOSE)
3235 if (RExC_open_parens) {
3236 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3237 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3239 /* Now we know that nxt2 is the only contents: */
3240 oscan->flags = (U8)ARG(nxt);
3242 OP(nxt1) = NOTHING; /* was OPEN. */
3245 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3246 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
3247 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
3248 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3249 OP(nxt + 1) = OPTIMIZED; /* was count. */
3250 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
3255 /* Try optimization CURLYX => CURLYM. */
3256 if ( OP(oscan) == CURLYX && data
3257 && !(data->flags & SF_HAS_PAR)
3258 && !(data->flags & SF_HAS_EVAL)
3259 && !deltanext /* atom is fixed width */
3260 && minnext != 0 /* CURLYM can't handle zero width */
3262 /* XXXX How to optimize if data == 0? */
3263 /* Optimize to a simpler form. */
3264 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3268 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3269 && (OP(nxt2) != WHILEM))
3271 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3272 /* Need to optimize away parenths. */
3273 if (data->flags & SF_IN_PAR) {
3274 /* Set the parenth number. */
3275 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3277 if (OP(nxt) != CLOSE)
3278 FAIL("Panic opt close");
3279 oscan->flags = (U8)ARG(nxt);
3280 if (RExC_open_parens) {
3281 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3282 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3284 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3285 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3288 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3289 OP(nxt + 1) = OPTIMIZED; /* was count. */
3290 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3291 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
3294 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3295 regnode *nnxt = regnext(nxt1);
3298 if (reg_off_by_arg[OP(nxt1)])
3299 ARG_SET(nxt1, nxt2 - nxt1);
3300 else if (nxt2 - nxt1 < U16_MAX)
3301 NEXT_OFF(nxt1) = nxt2 - nxt1;
3303 OP(nxt) = NOTHING; /* Cannot beautify */
3308 /* Optimize again: */
3309 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3310 NULL, stopparen, recursed, NULL, 0,depth+1);
3315 else if ((OP(oscan) == CURLYX)
3316 && (flags & SCF_WHILEM_VISITED_POS)
3317 /* See the comment on a similar expression above.
3318 However, this time it not a subexpression
3319 we care about, but the expression itself. */
3320 && (maxcount == REG_INFTY)
3321 && data && ++data->whilem_c < 16) {
3322 /* This stays as CURLYX, we can put the count/of pair. */
3323 /* Find WHILEM (as in regexec.c) */
3324 regnode *nxt = oscan + NEXT_OFF(oscan);
3326 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3328 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3329 | (RExC_whilem_seen << 4)); /* On WHILEM */
3331 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3333 if (flags & SCF_DO_SUBSTR) {
3334 SV *last_str = NULL;
3335 int counted = mincount != 0;
3337 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3338 #if defined(SPARC64_GCC_WORKAROUND)
3341 const char *s = NULL;
3344 if (pos_before >= data->last_start_min)
3347 b = data->last_start_min;
3350 s = SvPV_const(data->last_found, l);
3351 old = b - data->last_start_min;
3354 I32 b = pos_before >= data->last_start_min
3355 ? pos_before : data->last_start_min;
3357 const char * const s = SvPV_const(data->last_found, l);
3358 I32 old = b - data->last_start_min;
3362 old = utf8_hop((U8*)s, old) - (U8*)s;
3365 /* Get the added string: */
3366 last_str = newSVpvn_utf8(s + old, l, UTF);
3367 if (deltanext == 0 && pos_before == b) {
3368 /* What was added is a constant string */
3370 SvGROW(last_str, (mincount * l) + 1);
3371 repeatcpy(SvPVX(last_str) + l,
3372 SvPVX_const(last_str), l, mincount - 1);
3373 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3374 /* Add additional parts. */
3375 SvCUR_set(data->last_found,
3376 SvCUR(data->last_found) - l);
3377 sv_catsv(data->last_found, last_str);
3379 SV * sv = data->last_found;
3381 SvUTF8(sv) && SvMAGICAL(sv) ?
3382 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3383 if (mg && mg->mg_len >= 0)
3384 mg->mg_len += CHR_SVLEN(last_str) - l;
3386 data->last_end += l * (mincount - 1);
3389 /* start offset must point into the last copy */
3390 data->last_start_min += minnext * (mincount - 1);
3391 data->last_start_max += is_inf ? I32_MAX
3392 : (maxcount - 1) * (minnext + data->pos_delta);
3395 /* It is counted once already... */
3396 data->pos_min += minnext * (mincount - counted);
3397 data->pos_delta += - counted * deltanext +
3398 (minnext + deltanext) * maxcount - minnext * mincount;
3399 if (mincount != maxcount) {
3400 /* Cannot extend fixed substrings found inside
3402 SCAN_COMMIT(pRExC_state,data,minlenp);
3403 if (mincount && last_str) {
3404 SV * const sv = data->last_found;
3405 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3406 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3410 sv_setsv(sv, last_str);
3411 data->last_end = data->pos_min;
3412 data->last_start_min =
3413 data->pos_min - CHR_SVLEN(last_str);
3414 data->last_start_max = is_inf
3416 : data->pos_min + data->pos_delta
3417 - CHR_SVLEN(last_str);
3419 data->longest = &(data->longest_float);
3421 SvREFCNT_dec(last_str);
3423 if (data && (fl & SF_HAS_EVAL))
3424 data->flags |= SF_HAS_EVAL;
3425 optimize_curly_tail:
3426 if (OP(oscan) != CURLYX) {
3427 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3429 NEXT_OFF(oscan) += NEXT_OFF(next);
3432 default: /* REF and CLUMP only? */
3433 if (flags & SCF_DO_SUBSTR) {
3434 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3435 data->longest = &(data->longest_float);
3437 is_inf = is_inf_internal = 1;
3438 if (flags & SCF_DO_STCLASS_OR)
3439 cl_anything(pRExC_state, data->start_class);
3440 flags &= ~SCF_DO_STCLASS;
3444 else if (OP(scan) == LNBREAK) {
3445 if (flags & SCF_DO_STCLASS) {
3447 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3448 if (flags & SCF_DO_STCLASS_AND) {
3449 for (value = 0; value < 256; value++)
3450 if (!is_VERTWS_cp(value))
3451 ANYOF_BITMAP_CLEAR(data->start_class, value);
3454 for (value = 0; value < 256; value++)
3455 if (is_VERTWS_cp(value))
3456 ANYOF_BITMAP_SET(data->start_class, value);
3458 if (flags & SCF_DO_STCLASS_OR)
3459 cl_and(data->start_class, and_withp);
3460 flags &= ~SCF_DO_STCLASS;
3464 if (flags & SCF_DO_SUBSTR) {
3465 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3467 data->pos_delta += 1;
3468 data->longest = &(data->longest_float);
3472 else if (OP(scan) == FOLDCHAR) {
3473 int d = ARG(scan)==0xDF ? 1 : 2;
3474 flags &= ~SCF_DO_STCLASS;
3477 if (flags & SCF_DO_SUBSTR) {
3478 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3480 data->pos_delta += d;
3481 data->longest = &(data->longest_float);
3484 else if (strchr((const char*)PL_simple,OP(scan))) {
3487 if (flags & SCF_DO_SUBSTR) {
3488 SCAN_COMMIT(pRExC_state,data,minlenp);
3492 if (flags & SCF_DO_STCLASS) {
3493 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3495 /* Some of the logic below assumes that switching
3496 locale on will only add false positives. */
3497 switch (PL_regkind[OP(scan)]) {
3501 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3502 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3503 cl_anything(pRExC_state, data->start_class);
3506 if (OP(scan) == SANY)
3508 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3509 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3510 || (data->start_class->flags & ANYOF_CLASS));
3511 cl_anything(pRExC_state, data->start_class);
3513 if (flags & SCF_DO_STCLASS_AND || !value)
3514 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3517 if (flags & SCF_DO_STCLASS_AND)
3518 cl_and(data->start_class,
3519 (struct regnode_charclass_class*)scan);
3521 cl_or(pRExC_state, data->start_class,
3522 (struct regnode_charclass_class*)scan);
3525 if (flags & SCF_DO_STCLASS_AND) {
3526 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3527 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3528 for (value = 0; value < 256; value++)
3529 if (!isALNUM(value))
3530 ANYOF_BITMAP_CLEAR(data->start_class, value);
3534 if (data->start_class->flags & ANYOF_LOCALE)
3535 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3537 for (value = 0; value < 256; value++)
3539 ANYOF_BITMAP_SET(data->start_class, value);
3544 if (flags & SCF_DO_STCLASS_AND) {
3545 if (data->start_class->flags & ANYOF_LOCALE)
3546 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3549 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3550 data->start_class->flags |= ANYOF_LOCALE;
3554 if (flags & SCF_DO_STCLASS_AND) {
3555 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3556 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3557 for (value = 0; value < 256; value++)
3559 ANYOF_BITMAP_CLEAR(data->start_class, value);
3563 if (data->start_class->flags & ANYOF_LOCALE)
3564 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3566 for (value = 0; value < 256; value++)
3567 if (!isALNUM(value))
3568 ANYOF_BITMAP_SET(data->start_class, value);
3573 if (flags & SCF_DO_STCLASS_AND) {
3574 if (data->start_class->flags & ANYOF_LOCALE)
3575 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3578 data->start_class->flags |= ANYOF_LOCALE;
3579 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3583 if (flags & SCF_DO_STCLASS_AND) {
3584 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3585 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3586 for (value = 0; value < 256; value++)
3587 if (!isSPACE(value))
3588 ANYOF_BITMAP_CLEAR(data->start_class, value);
3592 if (data->start_class->flags & ANYOF_LOCALE)
3593 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3595 for (value = 0; value < 256; value++)
3597 ANYOF_BITMAP_SET(data->start_class, value);
3602 if (flags & SCF_DO_STCLASS_AND) {
3603 if (data->start_class->flags & ANYOF_LOCALE)
3604 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3607 data->start_class->flags |= ANYOF_LOCALE;
3608 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3612 if (flags & SCF_DO_STCLASS_AND) {
3613 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3614 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3615 for (value = 0; value < 256; value++)
3617 ANYOF_BITMAP_CLEAR(data->start_class, value);
3621 if (data->start_class->flags & ANYOF_LOCALE)
3622 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3624 for (value = 0; value < 256; value++)
3625 if (!isSPACE(value))
3626 ANYOF_BITMAP_SET(data->start_class, value);
3631 if (flags & SCF_DO_STCLASS_AND) {
3632 if (data->start_class->flags & ANYOF_LOCALE) {
3633 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3634 for (value = 0; value < 256; value++)
3635 if (!isSPACE(value))
3636 ANYOF_BITMAP_CLEAR(data->start_class, value);
3640 data->start_class->flags |= ANYOF_LOCALE;
3641 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3645 if (flags & SCF_DO_STCLASS_AND) {
3646 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3647 for (value = 0; value < 256; value++)
3648 if (!isDIGIT(value))
3649 ANYOF_BITMAP_CLEAR(data->start_class, value);
3652 if (data->start_class->flags & ANYOF_LOCALE)
3653 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3655 for (value = 0; value < 256; value++)
3657 ANYOF_BITMAP_SET(data->start_class, value);
3662 if (flags & SCF_DO_STCLASS_AND) {
3663 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3664 for (value = 0; value < 256; value++)
3666 ANYOF_BITMAP_CLEAR(data->start_class, value);
3669 if (data->start_class->flags & ANYOF_LOCALE)
3670 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3672 for (value = 0; value < 256; value++)
3673 if (!isDIGIT(value))
3674 ANYOF_BITMAP_SET(data->start_class, value);
3678 CASE_SYNST_FNC(VERTWS);
3679 CASE_SYNST_FNC(HORIZWS);
3682 if (flags & SCF_DO_STCLASS_OR)
3683 cl_and(data->start_class, and_withp);
3684 flags &= ~SCF_DO_STCLASS;
3687 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3688 data->flags |= (OP(scan) == MEOL
3692 else if ( PL_regkind[OP(scan)] == BRANCHJ
3693 /* Lookbehind, or need to calculate parens/evals/stclass: */
3694 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3695 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3696 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3697 || OP(scan) == UNLESSM )
3699 /* Negative Lookahead/lookbehind
3700 In this case we can't do fixed string optimisation.
3703 I32 deltanext, minnext, fake = 0;
3705 struct regnode_charclass_class intrnl;
3708 data_fake.flags = 0;
3710 data_fake.whilem_c = data->whilem_c;
3711 data_fake.last_closep = data->last_closep;
3714 data_fake.last_closep = &fake;
3715 data_fake.pos_delta = delta;
3716 if ( flags & SCF_DO_STCLASS && !scan->flags
3717 && OP(scan) == IFMATCH ) { /* Lookahead */
3718 cl_init(pRExC_state, &intrnl);
3719 data_fake.start_class = &intrnl;
3720 f |= SCF_DO_STCLASS_AND;
3722 if (flags & SCF_WHILEM_VISITED_POS)
3723 f |= SCF_WHILEM_VISITED_POS;
3724 next = regnext(scan);
3725 nscan = NEXTOPER(NEXTOPER(scan));
3726 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3727 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3730 FAIL("Variable length lookbehind not implemented");
3732 else if (minnext > (I32)U8_MAX) {
3733 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3735 scan->flags = (U8)minnext;
3738 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3740 if (data_fake.flags & SF_HAS_EVAL)
3741 data->flags |= SF_HAS_EVAL;
3742 data->whilem_c = data_fake.whilem_c;
3744 if (f & SCF_DO_STCLASS_AND) {
3745 if (flags & SCF_DO_STCLASS_OR) {
3746 /* OR before, AND after: ideally we would recurse with
3747 * data_fake to get the AND applied by study of the
3748 * remainder of the pattern, and then derecurse;
3749 * *** HACK *** for now just treat as "no information".
3750 * See [perl #56690].
3752 cl_init(pRExC_state, data->start_class);
3754 /* AND before and after: combine and continue */
3755 const int was = (data->start_class->flags & ANYOF_EOS);
3757 cl_and(data->start_class, &intrnl);
3759 data->start_class->flags |= ANYOF_EOS;
3763 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3765 /* Positive Lookahead/lookbehind
3766 In this case we can do fixed string optimisation,
3767 but we must be careful about it. Note in the case of
3768 lookbehind the positions will be offset by the minimum
3769 length of the pattern, something we won't know about
3770 until after the recurse.
3772 I32 deltanext, fake = 0;
3774 struct regnode_charclass_class intrnl;
3776 /* We use SAVEFREEPV so that when the full compile
3777 is finished perl will clean up the allocated
3778 minlens when its all done. This was we don't
3779 have to worry about freeing them when we know
3780 they wont be used, which would be a pain.
3783 Newx( minnextp, 1, I32 );
3784 SAVEFREEPV(minnextp);
3787 StructCopy(data, &data_fake, scan_data_t);
3788 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3791 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
3792 data_fake.last_found=newSVsv(data->last_found);
3796 data_fake.last_closep = &fake;
3797 data_fake.flags = 0;
3798 data_fake.pos_delta = delta;
3800 data_fake.flags |= SF_IS_INF;
3801 if ( flags & SCF_DO_STCLASS && !scan->flags
3802 && OP(scan) == IFMATCH ) { /* Lookahead */
3803 cl_init(pRExC_state, &intrnl);
3804 data_fake.start_class = &intrnl;
3805 f |= SCF_DO_STCLASS_AND;
3807 if (flags & SCF_WHILEM_VISITED_POS)
3808 f |= SCF_WHILEM_VISITED_POS;
3809 next = regnext(scan);
3810 nscan = NEXTOPER(NEXTOPER(scan));
3812 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3813 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3816 FAIL("Variable length lookbehind not implemented");
3818 else if (*minnextp > (I32)U8_MAX) {
3819 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3821 scan->flags = (U8)*minnextp;
3826 if (f & SCF_DO_STCLASS_AND) {
3827 const int was = (data->start_class->flags & ANYOF_EOS);
3829 cl_and(data->start_class, &intrnl);
3831 data->start_class->flags |= ANYOF_EOS;
3834 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3836 if (data_fake.flags & SF_HAS_EVAL)
3837 data->flags |= SF_HAS_EVAL;
3838 data->whilem_c = data_fake.whilem_c;
3839 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3840 if (RExC_rx->minlen<*minnextp)
3841 RExC_rx->minlen=*minnextp;
3842 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
3843 SvREFCNT_dec(data_fake.last_found);
3845 if ( data_fake.minlen_fixed != minlenp )
3847 data->offset_fixed= data_fake.offset_fixed;
3848 data->minlen_fixed= data_fake.minlen_fixed;
3849 data->lookbehind_fixed+= scan->flags;
3851 if ( data_fake.minlen_float != minlenp )
3853 data->minlen_float= data_fake.minlen_float;
3854 data->offset_float_min=data_fake.offset_float_min;
3855 data->offset_float_max=data_fake.offset_float_max;
3856 data->lookbehind_float+= scan->flags;
3865 else if (OP(scan) == OPEN) {
3866 if (stopparen != (I32)ARG(scan))
3869 else if (OP(scan) == CLOSE) {
3870 if (stopparen == (I32)ARG(scan)) {
3873 if ((I32)ARG(scan) == is_par) {
3874 next = regnext(scan);
3876 if ( next && (OP(next) != WHILEM) && next < last)
3877 is_par = 0; /* Disable optimization */
3880 *(data->last_closep) = ARG(scan);
3882 else if (OP(scan) == EVAL) {
3884 data->flags |= SF_HAS_EVAL;
3886 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3887 if (flags & SCF_DO_SUBSTR) {
3888 SCAN_COMMIT(pRExC_state,data,minlenp);
3889 flags &= ~SCF_DO_SUBSTR;
3891 if (data && OP(scan)==ACCEPT) {
3892 data->flags |= SCF_SEEN_ACCEPT;
3897 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3899 if (flags & SCF_DO_SUBSTR) {
3900 SCAN_COMMIT(pRExC_state,data,minlenp);
3901 data->longest = &(data->longest_float);
3903 is_inf = is_inf_internal = 1;
3904 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3905 cl_anything(pRExC_state, data->start_class);
3906 flags &= ~SCF_DO_STCLASS;
3908 else if (OP(scan) == GPOS) {
3909 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
3910 !(delta || is_inf || (data && data->pos_delta)))
3912 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
3913 RExC_rx->extflags |= RXf_ANCH_GPOS;
3914 if (RExC_rx->gofs < (U32)min)
3915 RExC_rx->gofs = min;
3917 RExC_rx->extflags |= RXf_GPOS_FLOAT;
3921 #ifdef TRIE_STUDY_OPT
3922 #ifdef FULL_TRIE_STUDY
3923 else if (PL_regkind[OP(scan)] == TRIE) {
3924 /* NOTE - There is similar code to this block above for handling
3925 BRANCH nodes on the initial study. If you change stuff here
3927 regnode *trie_node= scan;
3928 regnode *tail= regnext(scan);
3929 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
3930 I32 max1 = 0, min1 = I32_MAX;
3931 struct regnode_charclass_class accum;
3933 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
3934 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
3935 if (flags & SCF_DO_STCLASS)
3936 cl_init_zero(pRExC_state, &accum);
3942 const regnode *nextbranch= NULL;
3945 for ( word=1 ; word <= trie->wordcount ; word++)
3947 I32 deltanext=0, minnext=0, f = 0, fake;
3948 struct regnode_charclass_class this_class;
3950 data_fake.flags = 0;
3952 data_fake.whilem_c = data->whilem_c;
3953 data_fake.last_closep = data->last_closep;
3956 data_fake.last_closep = &fake;
3957 data_fake.pos_delta = delta;
3958 if (flags & SCF_DO_STCLASS) {
3959 cl_init(pRExC_state, &this_class);
3960 data_fake.start_class = &this_class;
3961 f = SCF_DO_STCLASS_AND;
3963 if (flags & SCF_WHILEM_VISITED_POS)
3964 f |= SCF_WHILEM_VISITED_POS;
3966 if (trie->jump[word]) {
3968 nextbranch = trie_node + trie->jump[0];
3969 scan= trie_node + trie->jump[word];
3970 /* We go from the jump point to the branch that follows
3971 it. Note this means we need the vestigal unused branches
3972 even though they arent otherwise used.
3974 minnext = study_chunk(pRExC_state, &scan, minlenp,
3975 &deltanext, (regnode *)nextbranch, &data_fake,
3976 stopparen, recursed, NULL, f,depth+1);
3978 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3979 nextbranch= regnext((regnode*)nextbranch);
3981 if (min1 > (I32)(minnext + trie->minlen))
3982 min1 = minnext + trie->minlen;
3983 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3984 max1 = minnext + deltanext + trie->maxlen;
3985 if (deltanext == I32_MAX)
3986 is_inf = is_inf_internal = 1;
3988 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3990 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3991 if ( stopmin > min + min1)
3992 stopmin = min + min1;
3993 flags &= ~SCF_DO_SUBSTR;
3995 data->flags |= SCF_SEEN_ACCEPT;
3998 if (data_fake.flags & SF_HAS_EVAL)
3999 data->flags |= SF_HAS_EVAL;
4000 data->whilem_c = data_fake.whilem_c;
4002 if (flags & SCF_DO_STCLASS)
4003 cl_or(pRExC_state, &accum, &this_class);
4006 if (flags & SCF_DO_SUBSTR) {
4007 data->pos_min += min1;
4008 data->pos_delta += max1 - min1;
4009 if (max1 != min1 || is_inf)
4010 data->longest = &(data->longest_float);
4013 delta += max1 - min1;
4014 if (flags & SCF_DO_STCLASS_OR) {
4015 cl_or(pRExC_state, data->start_class, &accum);
4017 cl_and(data->start_class, and_withp);
4018 flags &= ~SCF_DO_STCLASS;
4021 else if (flags & SCF_DO_STCLASS_AND) {
4023 cl_and(data->start_class, &accum);
4024 flags &= ~SCF_DO_STCLASS;
4027 /* Switch to OR mode: cache the old value of
4028 * data->start_class */
4030 StructCopy(data->start_class, and_withp,
4031 struct regnode_charclass_class);
4032 flags &= ~SCF_DO_STCLASS_AND;
4033 StructCopy(&accum, data->start_class,
4034 struct regnode_charclass_class);
4035 flags |= SCF_DO_STCLASS_OR;
4036 data->start_class->flags |= ANYOF_EOS;
4043 else if (PL_regkind[OP(scan)] == TRIE) {
4044 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4047 min += trie->minlen;
4048 delta += (trie->maxlen - trie->minlen);
4049 flags &= ~SCF_DO_STCLASS; /* xxx */
4050 if (flags & SCF_DO_SUBSTR) {
4051 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4052 data->pos_min += trie->minlen;
4053 data->pos_delta += (trie->maxlen - trie->minlen);
4054 if (trie->maxlen != trie->minlen)
4055 data->longest = &(data->longest_float);
4057 if (trie->jump) /* no more substrings -- for now /grr*/
4058 flags &= ~SCF_DO_SUBSTR;
4060 #endif /* old or new */
4061 #endif /* TRIE_STUDY_OPT */
4063 /* Else: zero-length, ignore. */
4064 scan = regnext(scan);
4069 stopparen = frame->stop;
4070 frame = frame->prev;
4071 goto fake_study_recurse;
4076 DEBUG_STUDYDATA("pre-fin:",data,depth);
4079 *deltap = is_inf_internal ? I32_MAX : delta;
4080 if (flags & SCF_DO_SUBSTR && is_inf)
4081 data->pos_delta = I32_MAX - data->pos_min;
4082 if (is_par > (I32)U8_MAX)
4084 if (is_par && pars==1 && data) {
4085 data->flags |= SF_IN_PAR;
4086 data->flags &= ~SF_HAS_PAR;
4088 else if (pars && data) {
4089 data->flags |= SF_HAS_PAR;
4090 data->flags &= ~SF_IN_PAR;
4092 if (flags & SCF_DO_STCLASS_OR)
4093 cl_and(data->start_class, and_withp);
4094 if (flags & SCF_TRIE_RESTUDY)
4095 data->flags |= SCF_TRIE_RESTUDY;
4097 DEBUG_STUDYDATA("post-fin:",data,depth);
4099 return min < stopmin ? min : stopmin;
4103 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4105 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4107 PERL_ARGS_ASSERT_ADD_DATA;
4109 Renewc(RExC_rxi->data,
4110 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4111 char, struct reg_data);
4113 Renew(RExC_rxi->data->what, count + n, U8);
4115 Newx(RExC_rxi->data->what, n, U8);
4116 RExC_rxi->data->count = count + n;
4117 Copy(s, RExC_rxi->data->what + count, n, U8);
4121 /*XXX: todo make this not included in a non debugging perl */
4122 #ifndef PERL_IN_XSUB_RE
4124 Perl_reginitcolors(pTHX)
4127 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4129 char *t = savepv(s);
4133 t = strchr(t, '\t');
4139 PL_colors[i] = t = (char *)"";
4144 PL_colors[i++] = (char *)"";
4151 #ifdef TRIE_STUDY_OPT
4152 #define CHECK_RESTUDY_GOTO \
4154 (data.flags & SCF_TRIE_RESTUDY) \
4158 #define CHECK_RESTUDY_GOTO
4162 - pregcomp - compile a regular expression into internal code
4164 * We can't allocate space until we know how big the compiled form will be,
4165 * but we can't compile it (and thus know how big it is) until we've got a
4166 * place to put the code. So we cheat: we compile it twice, once with code
4167 * generation turned off and size counting turned on, and once "for real".
4168 * This also means that we don't allocate space until we are sure that the
4169 * thing really will compile successfully, and we never have to move the
4170 * code and thus invalidate pointers into it. (Note that it has to be in
4171 * one piece because free() must be able to free it all.) [NB: not true in perl]
4173 * Beware that the optimization-preparation code in here knows about some
4174 * of the structure of the compiled regexp. [I'll say.]
4179 #ifndef PERL_IN_XSUB_RE
4180 #define RE_ENGINE_PTR &PL_core_reg_engine
4182 extern const struct regexp_engine my_reg_engine;
4183 #define RE_ENGINE_PTR &my_reg_engine
4186 #ifndef PERL_IN_XSUB_RE
4188 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4191 HV * const table = GvHV(PL_hintgv);
4193 PERL_ARGS_ASSERT_PREGCOMP;
4195 /* Dispatch a request to compile a regexp to correct
4198 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
4199 GET_RE_DEBUG_FLAGS_DECL;
4200 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
4201 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4203 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4206 return CALLREGCOMP_ENG(eng, pattern, flags);
4209 return Perl_re_compile(aTHX_ pattern, flags);
4214 Perl_re_compile(pTHX_ SV * const pattern, U32 pm_flags)
4219 register regexp_internal *ri;
4221 char *exp = SvPV(pattern, plen);
4222 char* xend = exp + plen;
4229 RExC_state_t RExC_state;
4230 RExC_state_t * const pRExC_state = &RExC_state;
4231 #ifdef TRIE_STUDY_OPT
4233 RExC_state_t copyRExC_state;
4235 GET_RE_DEBUG_FLAGS_DECL;
4237 PERL_ARGS_ASSERT_RE_COMPILE;
4239 DEBUG_r(if (!PL_colorset) reginitcolors());
4241 RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
4244 SV *dsv= sv_newmortal();
4245 RE_PV_QUOTED_DECL(s, RExC_utf8,
4246 dsv, exp, plen, 60);
4247 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4248 PL_colors[4],PL_colors[5],s);
4253 RExC_flags = pm_flags;
4257 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4258 RExC_seen_evals = 0;
4261 /* First pass: determine size, legality. */
4269 RExC_emit = &PL_regdummy;
4270 RExC_whilem_seen = 0;
4271 RExC_charnames = NULL;
4272 RExC_open_parens = NULL;
4273 RExC_close_parens = NULL;
4275 RExC_paren_names = NULL;
4277 RExC_paren_name_list = NULL;
4279 RExC_recurse = NULL;
4280 RExC_recurse_count = 0;
4282 #if 0 /* REGC() is (currently) a NOP at the first pass.
4283 * Clever compilers notice this and complain. --jhi */
4284 REGC((U8)REG_MAGIC, (char*)RExC_emit);
4286 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4287 if (reg(pRExC_state, 0, &flags,1) == NULL) {
4288 RExC_precomp = NULL;
4291 if (RExC_utf8 && !RExC_orig_utf8) {
4292 /* It's possible to write a regexp in ascii that represents Unicode
4293 codepoints outside of the byte range, such as via \x{100}. If we
4294 detect such a sequence we have to convert the entire pattern to utf8
4295 and then recompile, as our sizing calculation will have been based
4296 on 1 byte == 1 character, but we will need to use utf8 to encode
4297 at least some part of the pattern, and therefore must convert the whole
4299 XXX: somehow figure out how to make this less expensive...
4302 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4303 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4304 exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)exp, &len);
4306 RExC_orig_utf8 = RExC_utf8;
4308 goto redo_first_pass;
4311 PerlIO_printf(Perl_debug_log,
4312 "Required size %"IVdf" nodes\n"
4313 "Starting second pass (creation)\n",
4316 RExC_lastparse=NULL;
4318 /* Small enough for pointer-storage convention?
4319 If extralen==0, this means that we will not need long jumps. */
4320 if (RExC_size >= 0x10000L && RExC_extralen)
4321 RExC_size += RExC_extralen;
4324 if (RExC_whilem_seen > 15)
4325 RExC_whilem_seen = 15;
4327 /* Allocate space and zero-initialize. Note, the two step process
4328 of zeroing when in debug mode, thus anything assigned has to
4329 happen after that */
4330 rx = (REGEXP*) newSV_type(SVt_REGEXP);
4331 r = (struct regexp*)SvANY(rx);
4332 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4333 char, regexp_internal);
4334 if ( r == NULL || ri == NULL )
4335 FAIL("Regexp out of space");
4337 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
4338 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
4340 /* bulk initialize base fields with 0. */
4341 Zero(ri, sizeof(regexp_internal), char);
4344 /* non-zero initialization begins here */
4346 r->engine= RE_ENGINE_PTR;
4347 r->extflags = pm_flags;
4349 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
4350 bool has_minus = ((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD);
4351 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
4352 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
4353 >> RXf_PMf_STD_PMMOD_SHIFT);
4354 const char *fptr = STD_PAT_MODS; /*"msix"*/
4356 const STRLEN wraplen = plen + has_minus + has_p + has_runon
4357 + (sizeof(STD_PAT_MODS) - 1)
4358 + (sizeof("(?:)") - 1);
4360 p = sv_grow(MUTABLE_SV(rx), wraplen + 1);
4361 SvCUR_set(rx, wraplen);
4363 SvFLAGS(rx) |= SvUTF8(pattern);
4366 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
4368 char *r = p + (sizeof(STD_PAT_MODS) - 1) + has_minus - 1;
4369 char *colon = r + 1;
4372 while((ch = *fptr++)) {
4386 Copy(RExC_precomp, p, plen, char);
4387 assert ((RX_WRAPPED(rx) - p) < 16);
4388 r->pre_prefix = p - RX_WRAPPED(rx);
4397 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4399 if (RExC_seen & REG_SEEN_RECURSE) {
4400 Newxz(RExC_open_parens, RExC_npar,regnode *);
4401 SAVEFREEPV(RExC_open_parens);
4402 Newxz(RExC_close_parens,RExC_npar,regnode *);
4403 SAVEFREEPV(RExC_close_parens);
4406 /* Useful during FAIL. */
4407 #ifdef RE_TRACK_PATTERN_OFFSETS
4408 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4409 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
4410 "%s %"UVuf" bytes for offset annotations.\n",
4411 ri->u.offsets ? "Got" : "Couldn't get",
4412 (UV)((2*RExC_size+1) * sizeof(U32))));
4414 SetProgLen(ri,RExC_size);
4419 /* Second pass: emit code. */
4420 RExC_flags = pm_flags; /* don't let top level (?i) bleed */
4425 RExC_emit_start = ri->program;
4426 RExC_emit = ri->program;
4427 RExC_emit_bound = ri->program + RExC_size + 1;
4429 /* Store the count of eval-groups for security checks: */
4430 RExC_rx->seen_evals = RExC_seen_evals;
4431 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
4432 if (reg(pRExC_state, 0, &flags,1) == NULL) {
4436 /* XXXX To minimize changes to RE engine we always allocate
4437 3-units-long substrs field. */
4438 Newx(r->substrs, 1, struct reg_substr_data);
4439 if (RExC_recurse_count) {
4440 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4441 SAVEFREEPV(RExC_recurse);
4445 r->minlen = minlen = sawplus = sawopen = 0;
4446 Zero(r->substrs, 1, struct reg_substr_data);
4448 #ifdef TRIE_STUDY_OPT
4450 StructCopy(&zero_scan_data, &data, scan_data_t);
4451 copyRExC_state = RExC_state;
4454 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
4456 RExC_state = copyRExC_state;
4457 if (seen & REG_TOP_LEVEL_BRANCHES)
4458 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4460 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
4461 if (data.last_found) {
4462 SvREFCNT_dec(data.longest_fixed);
4463 SvREFCNT_dec(data.longest_float);
4464 SvREFCNT_dec(data.last_found);
4466 StructCopy(&zero_scan_data, &data, scan_data_t);
4469 StructCopy(&zero_scan_data, &data, scan_data_t);
4472 /* Dig out information for optimizations. */
4473 r->extflags = RExC_flags; /* was pm_op */
4474 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
4477 SvUTF8_on(rx); /* Unicode in it? */
4478 ri->regstclass = NULL;
4479 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
4480 r->intflags |= PREGf_NAUGHTY;
4481 scan = ri->program + 1; /* First BRANCH. */
4483 /* testing for BRANCH here tells us whether there is "must appear"
4484 data in the pattern. If there is then we can use it for optimisations */
4485 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
4487 STRLEN longest_float_length, longest_fixed_length;
4488 struct regnode_charclass_class ch_class; /* pointed to by data */
4490 I32 last_close = 0; /* pointed to by data */
4491 regnode *first= scan;
4492 regnode *first_next= regnext(first);
4495 * Skip introductions and multiplicators >= 1
4496 * so that we can extract the 'meat' of the pattern that must
4497 * match in the large if() sequence following.
4498 * NOTE that EXACT is NOT covered here, as it is normally
4499 * picked up by the optimiser separately.
4501 * This is unfortunate as the optimiser isnt handling lookahead
4502 * properly currently.
4505 while ((OP(first) == OPEN && (sawopen = 1)) ||
4506 /* An OR of *one* alternative - should not happen now. */
4507 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
4508 /* for now we can't handle lookbehind IFMATCH*/
4509 (OP(first) == IFMATCH && !first->flags) ||
4510 (OP(first) == PLUS) ||
4511 (OP(first) == MINMOD) ||
4512 /* An {n,m} with n>0 */
4513 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
4514 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
4517 * the only op that could be a regnode is PLUS, all the rest
4518 * will be regnode_1 or regnode_2.
4521 if (OP(first) == PLUS)
4524 first += regarglen[OP(first)];
4526 first = NEXTOPER(first);
4527 first_next= regnext(first);
4530 /* Starting-point info. */
4532 DEBUG_PEEP("first:",first,0);
4533 /* Ignore EXACT as we deal with it later. */
4534 if (PL_regkind[OP(first)] == EXACT) {
4535 if (OP(first) == EXACT)
4536 NOOP; /* Empty, get anchored substr later. */
4537 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
4538 ri->regstclass = first;
4541 else if (PL_regkind[OP(first)] == TRIE &&
4542 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
4545 /* this can happen only on restudy */
4546 if ( OP(first) == TRIE ) {
4547 struct regnode_1 *trieop = (struct regnode_1 *)
4548 PerlMemShared_calloc(1, sizeof(struct regnode_1));
4549 StructCopy(first,trieop,struct regnode_1);
4550 trie_op=(regnode *)trieop;
4552 struct regnode_charclass *trieop = (struct regnode_charclass *)
4553 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
4554 StructCopy(first,trieop,struct regnode_charclass);
4555 trie_op=(regnode *)trieop;
4558 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4559 ri->regstclass = trie_op;
4562 else if (strchr((const char*)PL_simple,OP(first)))
4563 ri->regstclass = first;
4564 else if (PL_regkind[OP(first)] == BOUND ||
4565 PL_regkind[OP(first)] == NBOUND)
4566 ri->regstclass = first;
4567 else if (PL_regkind[OP(first)] == BOL) {
4568 r->extflags |= (OP(first) == MBOL
4570 : (OP(first) == SBOL
4573 first = NEXTOPER(first);
4576 else if (OP(first) == GPOS) {
4577 r->extflags |= RXf_ANCH_GPOS;
4578 first = NEXTOPER(first);
4581 else if ((!sawopen || !RExC_sawback) &&
4582 (OP(first) == STAR &&
4583 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
4584 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
4586 /* turn .* into ^.* with an implied $*=1 */
4588 (OP(NEXTOPER(first)) == REG_ANY)
4591 r->extflags |= type;
4592 r->intflags |= PREGf_IMPLICIT;
4593 first = NEXTOPER(first);
4596 if (sawplus && (!sawopen || !RExC_sawback)
4597 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
4598 /* x+ must match at the 1st pos of run of x's */
4599 r->intflags |= PREGf_SKIP;
4601 /* Scan is after the zeroth branch, first is atomic matcher. */
4602 #ifdef TRIE_STUDY_OPT
4605 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4606 (IV)(first - scan + 1))
4610 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4611 (IV)(first - scan + 1))
4617 * If there's something expensive in the r.e., find the
4618 * longest literal string that must appear and make it the
4619 * regmust. Resolve ties in favor of later strings, since
4620 * the regstart check works with the beginning of the r.e.
4621 * and avoiding duplication strengthens checking. Not a
4622 * strong reason, but sufficient in the absence of others.
4623 * [Now we resolve ties in favor of the earlier string if
4624 * it happens that c_offset_min has been invalidated, since the
4625 * earlier string may buy us something the later one won't.]
4628 data.longest_fixed = newSVpvs("");
4629 data.longest_float = newSVpvs("");
4630 data.last_found = newSVpvs("");
4631 data.longest = &(data.longest_fixed);
4633 if (!ri->regstclass) {
4634 cl_init(pRExC_state, &ch_class);
4635 data.start_class = &ch_class;
4636 stclass_flag = SCF_DO_STCLASS_AND;
4637 } else /* XXXX Check for BOUND? */
4639 data.last_closep = &last_close;
4641 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
4642 &data, -1, NULL, NULL,
4643 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
4649 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
4650 && data.last_start_min == 0 && data.last_end > 0
4651 && !RExC_seen_zerolen
4652 && !(RExC_seen & REG_SEEN_VERBARG)
4653 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4654 r->extflags |= RXf_CHECK_ALL;
4655 scan_commit(pRExC_state, &data,&minlen,0);
4656 SvREFCNT_dec(data.last_found);
4658 /* Note that code very similar to this but for anchored string
4659 follows immediately below, changes may need to be made to both.
4662 longest_float_length = CHR_SVLEN(data.longest_float);
4663 if (longest_float_length
4664 || (data.flags & SF_FL_BEFORE_EOL
4665 && (!(data.flags & SF_FL_BEFORE_MEOL)
4666 || (RExC_flags & RXf_PMf_MULTILINE))))
4670 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
4671 && data.offset_fixed == data.offset_float_min
4672 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4673 goto remove_float; /* As in (a)+. */
4675 /* copy the information about the longest float from the reg_scan_data
4676 over to the program. */
4677 if (SvUTF8(data.longest_float)) {
4678 r->float_utf8 = data.longest_float;
4679 r->float_substr = NULL;
4681 r->float_substr = data.longest_float;
4682 r->float_utf8 = NULL;
4684 /* float_end_shift is how many chars that must be matched that
4685 follow this item. We calculate it ahead of time as once the
4686 lookbehind offset is added in we lose the ability to correctly
4688 ml = data.minlen_float ? *(data.minlen_float)
4689 : (I32)longest_float_length;
4690 r->float_end_shift = ml - data.offset_float_min
4691 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4692 + data.lookbehind_float;
4693 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
4694 r->float_max_offset = data.offset_float_max;
4695 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
4696 r->float_max_offset -= data.lookbehind_float;
4698 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4699 && (!(data.flags & SF_FL_BEFORE_MEOL)
4700 || (RExC_flags & RXf_PMf_MULTILINE)));
4701 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
4705 r->float_substr = r->float_utf8 = NULL;
4706 SvREFCNT_dec(data.longest_float);
4707 longest_float_length = 0;
4710 /* Note that code very similar to this but for floating string
4711 is immediately above, changes may need to be made to both.
4714 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
4715 if (longest_fixed_length
4716 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4717 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4718 || (RExC_flags & RXf_PMf_MULTILINE))))
4722 /* copy the information about the longest fixed
4723 from the reg_scan_data over to the program. */
4724 if (SvUTF8(data.longest_fixed)) {
4725 r->anchored_utf8 = data.longest_fixed;
4726 r->anchored_substr = NULL;
4728 r->anchored_substr = data.longest_fixed;
4729 r->anchored_utf8 = NULL;
4731 /* fixed_end_shift is how many chars that must be matched that
4732 follow this item. We calculate it ahead of time as once the
4733 lookbehind offset is added in we lose the ability to correctly
4735 ml = data.minlen_fixed ? *(data.minlen_fixed)
4736 : (I32)longest_fixed_length;
4737 r->anchored_end_shift = ml - data.offset_fixed
4738 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4739 + data.lookbehind_fixed;
4740 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4742 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4743 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4744 || (RExC_flags & RXf_PMf_MULTILINE)));
4745 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
4748 r->anchored_substr = r->anchored_utf8 = NULL;
4749 SvREFCNT_dec(data.longest_fixed);
4750 longest_fixed_length = 0;
4753 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4754 ri->regstclass = NULL;
4755 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4757 && !(data.start_class->flags & ANYOF_EOS)
4758 && !cl_is_anything(data.start_class))
4760 const U32 n = add_data(pRExC_state, 1, "f");
4762 Newx(RExC_rxi->data->data[n], 1,
4763 struct regnode_charclass_class);
4764 StructCopy(data.start_class,
4765 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4766 struct regnode_charclass_class);
4767 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
4768 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
4769 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
4770 regprop(r, sv, (regnode*)data.start_class);
4771 PerlIO_printf(Perl_debug_log,
4772 "synthetic stclass \"%s\".\n",
4773 SvPVX_const(sv));});
4776 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
4777 if (longest_fixed_length > longest_float_length) {
4778 r->check_end_shift = r->anchored_end_shift;
4779 r->check_substr = r->anchored_substr;
4780 r->check_utf8 = r->anchored_utf8;
4781 r->check_offset_min = r->check_offset_max = r->anchored_offset;
4782 if (r->extflags & RXf_ANCH_SINGLE)
4783 r->extflags |= RXf_NOSCAN;
4786 r->check_end_shift = r->float_end_shift;
4787 r->check_substr = r->float_substr;
4788 r->check_utf8 = r->float_utf8;
4789 r->check_offset_min = r->float_min_offset;
4790 r->check_offset_max = r->float_max_offset;
4792 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4793 This should be changed ASAP! */
4794 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
4795 r->extflags |= RXf_USE_INTUIT;
4796 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
4797 r->extflags |= RXf_INTUIT_TAIL;
4799 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4800 if ( (STRLEN)minlen < longest_float_length )
4801 minlen= longest_float_length;
4802 if ( (STRLEN)minlen < longest_fixed_length )
4803 minlen= longest_fixed_length;
4807 /* Several toplevels. Best we can is to set minlen. */
4809 struct regnode_charclass_class ch_class;
4812 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
4814 scan = ri->program + 1;
4815 cl_init(pRExC_state, &ch_class);
4816 data.start_class = &ch_class;
4817 data.last_closep = &last_close;
4820 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
4821 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
4825 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
4826 = r->float_substr = r->float_utf8 = NULL;
4827 if (!(data.start_class->flags & ANYOF_EOS)
4828 && !cl_is_anything(data.start_class))
4830 const U32 n = add_data(pRExC_state, 1, "f");
4832 Newx(RExC_rxi->data->data[n], 1,
4833 struct regnode_charclass_class);
4834 StructCopy(data.start_class,
4835 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4836 struct regnode_charclass_class);
4837 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
4838 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
4839 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
4840 regprop(r, sv, (regnode*)data.start_class);
4841 PerlIO_printf(Perl_debug_log,
4842 "synthetic stclass \"%s\".\n",
4843 SvPVX_const(sv));});
4847 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4848 the "real" pattern. */
4850 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
4851 (IV)minlen, (IV)r->minlen);
4853 r->minlenret = minlen;
4854 if (r->minlen < minlen)
4857 if (RExC_seen & REG_SEEN_GPOS)
4858 r->extflags |= RXf_GPOS_SEEN;
4859 if (RExC_seen & REG_SEEN_LOOKBEHIND)
4860 r->extflags |= RXf_LOOKBEHIND_SEEN;
4861 if (RExC_seen & REG_SEEN_EVAL)
4862 r->extflags |= RXf_EVAL_SEEN;
4863 if (RExC_seen & REG_SEEN_CANY)
4864 r->extflags |= RXf_CANY_SEEN;
4865 if (RExC_seen & REG_SEEN_VERBARG)
4866 r->intflags |= PREGf_VERBARG_SEEN;
4867 if (RExC_seen & REG_SEEN_CUTGROUP)
4868 r->intflags |= PREGf_CUTGROUP_SEEN;
4869 if (RExC_paren_names)
4870 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
4872 RXp_PAREN_NAMES(r) = NULL;
4874 #ifdef STUPID_PATTERN_CHECKS
4875 if (RX_PRELEN(rx) == 0)
4876 r->extflags |= RXf_NULL;
4877 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
4878 /* XXX: this should happen BEFORE we compile */
4879 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
4880 else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
4881 r->extflags |= RXf_WHITE;
4882 else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
4883 r->extflags |= RXf_START_ONLY;
4885 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
4886 /* XXX: this should happen BEFORE we compile */
4887 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
4889 regnode *first = ri->program + 1;
4891 U8 nop = OP(NEXTOPER(first));
4893 if (PL_regkind[fop] == NOTHING && nop == END)
4894 r->extflags |= RXf_NULL;
4895 else if (PL_regkind[fop] == BOL && nop == END)
4896 r->extflags |= RXf_START_ONLY;
4897 else if (fop == PLUS && nop ==SPACE && OP(regnext(first))==END)
4898 r->extflags |= RXf_WHITE;
4902 if (RExC_paren_names) {
4903 ri->name_list_idx = add_data( pRExC_state, 1, "p" );
4904 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
4907 ri->name_list_idx = 0;
4909 if (RExC_recurse_count) {
4910 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
4911 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
4912 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
4915 Newxz(r->offs, RExC_npar, regexp_paren_pair);
4916 /* assume we don't need to swap parens around before we match */
4919 PerlIO_printf(Perl_debug_log,"Final program:\n");
4922 #ifdef RE_TRACK_PATTERN_OFFSETS
4923 DEBUG_OFFSETS_r(if (ri->u.offsets) {
4924 const U32 len = ri->u.offsets[0];
4926 GET_RE_DEBUG_FLAGS_DECL;
4927 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
4928 for (i = 1; i <= len; i++) {
4929 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
4930 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
4931 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
4933 PerlIO_printf(Perl_debug_log, "\n");
4939 #undef RE_ENGINE_PTR
4943 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
4946 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
4948 PERL_UNUSED_ARG(value);
4950 if (flags & RXapif_FETCH) {
4951 return reg_named_buff_fetch(rx, key, flags);
4952 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
4953 Perl_croak(aTHX_ "%s", PL_no_modify);
4955 } else if (flags & RXapif_EXISTS) {
4956 return reg_named_buff_exists(rx, key, flags)
4959 } else if (flags & RXapif_REGNAMES) {
4960 return reg_named_buff_all(rx, flags);
4961 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
4962 return reg_named_buff_scalar(rx, flags);
4964 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
4970 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
4973 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
4974 PERL_UNUSED_ARG(lastkey);
4976 if (flags & RXapif_FIRSTKEY)
4977 return reg_named_buff_firstkey(rx, flags);
4978 else if (flags & RXapif_NEXTKEY)
4979 return reg_named_buff_nextkey(rx, flags);
4981 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
4987 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
4990 AV *retarray = NULL;
4992 struct regexp *const rx = (struct regexp *)SvANY(r);
4994 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
4996 if (flags & RXapif_ALL)
4999 if (rx && RXp_PAREN_NAMES(rx)) {
5000 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
5003 SV* sv_dat=HeVAL(he_str);
5004 I32 *nums=(I32*)SvPVX(sv_dat);
5005 for ( i=0; i<SvIVX(sv_dat); i++ ) {
5006 if ((I32)(rx->nparens) >= nums[i]
5007 && rx->offs[nums[i]].start != -1
5008 && rx->offs[nums[i]].end != -1)
5011 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
5015 ret = newSVsv(&PL_sv_undef);
5018 av_push(retarray, ret);
5021 return newRV_noinc(MUTABLE_SV(retarray));
5028 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
5031 struct regexp *const rx = (struct regexp *)SvANY(r);
5033 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5035 if (rx && RXp_PAREN_NAMES(rx)) {
5036 if (flags & RXapif_ALL) {
5037 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
5039 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
5053 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
5055 struct regexp *const rx = (struct regexp *)SvANY(r);
5057 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5059 if ( rx && RXp_PAREN_NAMES(rx) ) {
5060 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
5062 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
5069 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
5071 struct regexp *const rx = (struct regexp *)SvANY(r);
5072 GET_RE_DEBUG_FLAGS_DECL;
5074 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5076 if (rx && RXp_PAREN_NAMES(rx)) {
5077 HV *hv = RXp_PAREN_NAMES(rx);
5079 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5082 SV* sv_dat = HeVAL(temphe);
5083 I32 *nums = (I32*)SvPVX(sv_dat);
5084 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5085 if ((I32)(rx->lastparen) >= nums[i] &&
5086 rx->offs[nums[i]].start != -1 &&
5087 rx->offs[nums[i]].end != -1)
5093 if (parno || flags & RXapif_ALL) {
5094 return newSVhek(HeKEY_hek(temphe));
5102 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
5107 struct regexp *const rx = (struct regexp *)SvANY(r);
5109 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5111 if (rx && RXp_PAREN_NAMES(rx)) {
5112 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5113 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
5114 } else if (flags & RXapif_ONE) {
5115 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
5116 av = MUTABLE_AV(SvRV(ret));
5117 length = av_len(av);
5119 return newSViv(length + 1);
5121 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5125 return &PL_sv_undef;
5129 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
5131 struct regexp *const rx = (struct regexp *)SvANY(r);
5134 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5136 if (rx && RXp_PAREN_NAMES(rx)) {
5137 HV *hv= RXp_PAREN_NAMES(rx);
5139 (void)hv_iterinit(hv);
5140 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5143 SV* sv_dat = HeVAL(temphe);
5144 I32 *nums = (I32*)SvPVX(sv_dat);
5145 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5146 if ((I32)(rx->lastparen) >= nums[i] &&
5147 rx->offs[nums[i]].start != -1 &&
5148 rx->offs[nums[i]].end != -1)
5154 if (parno || flags & RXapif_ALL) {
5155 av_push(av, newSVhek(HeKEY_hek(temphe)));
5160 return newRV_noinc(MUTABLE_SV(av));
5164 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5167 struct regexp *const rx = (struct regexp *)SvANY(r);
5172 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
5175 sv_setsv(sv,&PL_sv_undef);
5179 if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
5181 i = rx->offs[0].start;
5185 if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
5187 s = rx->subbeg + rx->offs[0].end;
5188 i = rx->sublen - rx->offs[0].end;
5191 if ( 0 <= paren && paren <= (I32)rx->nparens &&
5192 (s1 = rx->offs[paren].start) != -1 &&
5193 (t1 = rx->offs[paren].end) != -1)
5197 s = rx->subbeg + s1;
5199 sv_setsv(sv,&PL_sv_undef);
5202 assert(rx->sublen >= (s - rx->subbeg) + i );
5204 const int oldtainted = PL_tainted;
5206 sv_setpvn(sv, s, i);
5207 PL_tainted = oldtainted;
5208 if ( (rx->extflags & RXf_CANY_SEEN)
5209 ? (RXp_MATCH_UTF8(rx)
5210 && (!i || is_utf8_string((U8*)s, i)))
5211 : (RXp_MATCH_UTF8(rx)) )
5218 if (RXp_MATCH_TAINTED(rx)) {
5219 if (SvTYPE(sv) >= SVt_PVMG) {
5220 MAGIC* const mg = SvMAGIC(sv);
5223 SvMAGIC_set(sv, mg->mg_moremagic);
5225 if ((mgt = SvMAGIC(sv))) {
5226 mg->mg_moremagic = mgt;
5227 SvMAGIC_set(sv, mg);
5237 sv_setsv(sv,&PL_sv_undef);
5243 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5244 SV const * const value)
5246 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
5248 PERL_UNUSED_ARG(rx);
5249 PERL_UNUSED_ARG(paren);
5250 PERL_UNUSED_ARG(value);
5253 Perl_croak(aTHX_ "%s", PL_no_modify);
5257 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
5260 struct regexp *const rx = (struct regexp *)SvANY(r);
5264 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
5266 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5268 /* $` / ${^PREMATCH} */
5269 case RX_BUFF_IDX_PREMATCH:
5270 if (rx->offs[0].start != -1) {
5271 i = rx->offs[0].start;
5279 /* $' / ${^POSTMATCH} */
5280 case RX_BUFF_IDX_POSTMATCH:
5281 if (rx->offs[0].end != -1) {
5282 i = rx->sublen - rx->offs[0].end;
5284 s1 = rx->offs[0].end;
5290 /* $& / ${^MATCH}, $1, $2, ... */
5292 if (paren <= (I32)rx->nparens &&
5293 (s1 = rx->offs[paren].start) != -1 &&
5294 (t1 = rx->offs[paren].end) != -1)
5299 if (ckWARN(WARN_UNINITIALIZED))
5300 report_uninit((const SV *)sv);
5305 if (i > 0 && RXp_MATCH_UTF8(rx)) {
5306 const char * const s = rx->subbeg + s1;
5311 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5318 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
5320 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
5321 PERL_UNUSED_ARG(rx);
5325 return newSVpvs("Regexp");
5328 /* Scans the name of a named buffer from the pattern.
5329 * If flags is REG_RSN_RETURN_NULL returns null.
5330 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
5331 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
5332 * to the parsed name as looked up in the RExC_paren_names hash.
5333 * If there is an error throws a vFAIL().. type exception.
5336 #define REG_RSN_RETURN_NULL 0
5337 #define REG_RSN_RETURN_NAME 1
5338 #define REG_RSN_RETURN_DATA 2
5341 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
5343 char *name_start = RExC_parse;
5345 PERL_ARGS_ASSERT_REG_SCAN_NAME;
5347 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
5348 /* skip IDFIRST by using do...while */
5351 RExC_parse += UTF8SKIP(RExC_parse);
5352 } while (isALNUM_utf8((U8*)RExC_parse));
5356 } while (isALNUM(*RExC_parse));
5361 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
5362 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
5363 if ( flags == REG_RSN_RETURN_NAME)
5365 else if (flags==REG_RSN_RETURN_DATA) {
5368 if ( ! sv_name ) /* should not happen*/
5369 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
5370 if (RExC_paren_names)
5371 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
5373 sv_dat = HeVAL(he_str);
5375 vFAIL("Reference to nonexistent named group");
5379 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
5386 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
5387 int rem=(int)(RExC_end - RExC_parse); \
5396 if (RExC_lastparse!=RExC_parse) \
5397 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
5400 iscut ? "..." : "<" \
5403 PerlIO_printf(Perl_debug_log,"%16s",""); \
5406 num = RExC_size + 1; \
5408 num=REG_NODE_NUM(RExC_emit); \
5409 if (RExC_lastnum!=num) \
5410 PerlIO_printf(Perl_debug_log,"|%4d",num); \
5412 PerlIO_printf(Perl_debug_log,"|%4s",""); \
5413 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
5414 (int)((depth*2)), "", \
5418 RExC_lastparse=RExC_parse; \
5423 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
5424 DEBUG_PARSE_MSG((funcname)); \
5425 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
5427 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
5428 DEBUG_PARSE_MSG((funcname)); \
5429 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
5432 - reg - regular expression, i.e. main body or parenthesized thing
5434 * Caller must absorb opening parenthesis.
5436 * Combining parenthesis handling with the base level of regular expression
5437 * is a trifle forced, but the need to tie the tails of the branches to what
5438 * follows makes it hard to avoid.
5440 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
5442 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
5444 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
5448 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
5449 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
5452 register regnode *ret; /* Will be the head of the group. */
5453 register regnode *br;
5454 register regnode *lastbr;
5455 register regnode *ender = NULL;
5456 register I32 parno = 0;
5458 U32 oregflags = RExC_flags;
5459 bool have_branch = 0;
5461 I32 freeze_paren = 0;
5462 I32 after_freeze = 0;
5464 /* for (?g), (?gc), and (?o) warnings; warning
5465 about (?c) will warn about (?g) -- japhy */
5467 #define WASTED_O 0x01
5468 #define WASTED_G 0x02
5469 #define WASTED_C 0x04
5470 #define WASTED_GC (0x02|0x04)
5471 I32 wastedflags = 0x00;
5473 char * parse_start = RExC_parse; /* MJD */
5474 char * const oregcomp_parse = RExC_parse;
5476 GET_RE_DEBUG_FLAGS_DECL;
5478 PERL_ARGS_ASSERT_REG;
5479 DEBUG_PARSE("reg ");
5481 *flagp = 0; /* Tentatively. */
5484 /* Make an OPEN node, if parenthesized. */
5486 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
5487 char *start_verb = RExC_parse;
5488 STRLEN verb_len = 0;
5489 char *start_arg = NULL;
5490 unsigned char op = 0;
5492 int internal_argval = 0; /* internal_argval is only useful if !argok */
5493 while ( *RExC_parse && *RExC_parse != ')' ) {
5494 if ( *RExC_parse == ':' ) {
5495 start_arg = RExC_parse + 1;
5501 verb_len = RExC_parse - start_verb;
5504 while ( *RExC_parse && *RExC_parse != ')' )
5506 if ( *RExC_parse != ')' )
5507 vFAIL("Unterminated verb pattern argument");
5508 if ( RExC_parse == start_arg )
5511 if ( *RExC_parse != ')' )
5512 vFAIL("Unterminated verb pattern");
5515 switch ( *start_verb ) {
5516 case 'A': /* (*ACCEPT) */
5517 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
5519 internal_argval = RExC_nestroot;
5522 case 'C': /* (*COMMIT) */
5523 if ( memEQs(start_verb,verb_len,"COMMIT") )
5526 case 'F': /* (*FAIL) */
5527 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
5532 case ':': /* (*:NAME) */
5533 case 'M': /* (*MARK:NAME) */
5534 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
5539 case 'P': /* (*PRUNE) */
5540 if ( memEQs(start_verb,verb_len,"PRUNE") )
5543 case 'S': /* (*SKIP) */
5544 if ( memEQs(start_verb,verb_len,"SKIP") )
5547 case 'T': /* (*THEN) */
5548 /* [19:06] <TimToady> :: is then */
5549 if ( memEQs(start_verb,verb_len,"THEN") ) {
5551 RExC_seen |= REG_SEEN_CUTGROUP;
5557 vFAIL3("Unknown verb pattern '%.*s'",
5558 verb_len, start_verb);
5561 if ( start_arg && internal_argval ) {
5562 vFAIL3("Verb pattern '%.*s' may not have an argument",
5563 verb_len, start_verb);
5564 } else if ( argok < 0 && !start_arg ) {
5565 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
5566 verb_len, start_verb);
5568 ret = reganode(pRExC_state, op, internal_argval);
5569 if ( ! internal_argval && ! SIZE_ONLY ) {
5571 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
5572 ARG(ret) = add_data( pRExC_state, 1, "S" );
5573 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
5580 if (!internal_argval)
5581 RExC_seen |= REG_SEEN_VERBARG;
5582 } else if ( start_arg ) {
5583 vFAIL3("Verb pattern '%.*s' may not have an argument",
5584 verb_len, start_verb);
5586 ret = reg_node(pRExC_state, op);
5588 nextchar(pRExC_state);
5591 if (*RExC_parse == '?') { /* (?...) */
5592 bool is_logical = 0;
5593 const char * const seqstart = RExC_parse;
5596 paren = *RExC_parse++;
5597 ret = NULL; /* For look-ahead/behind. */
5600 case 'P': /* (?P...) variants for those used to PCRE/Python */
5601 paren = *RExC_parse++;
5602 if ( paren == '<') /* (?P<...>) named capture */
5604 else if (paren == '>') { /* (?P>name) named recursion */
5605 goto named_recursion;
5607 else if (paren == '=') { /* (?P=...) named backref */
5608 /* this pretty much dupes the code for \k<NAME> in regatom(), if
5609 you change this make sure you change that */
5610 char* name_start = RExC_parse;
5612 SV *sv_dat = reg_scan_name(pRExC_state,
5613 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5614 if (RExC_parse == name_start || *RExC_parse != ')')
5615 vFAIL2("Sequence %.3s... not terminated",parse_start);
5618 num = add_data( pRExC_state, 1, "S" );
5619 RExC_rxi->data->data[num]=(void*)sv_dat;
5620 SvREFCNT_inc_simple_void(sv_dat);
5623 ret = reganode(pRExC_state,
5624 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
5628 Set_Node_Offset(ret, parse_start+1);
5629 Set_Node_Cur_Length(ret); /* MJD */
5631 nextchar(pRExC_state);
5635 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5637 case '<': /* (?<...) */
5638 if (*RExC_parse == '!')
5640 else if (*RExC_parse != '=')
5646 case '\'': /* (?'...') */
5647 name_start= RExC_parse;
5648 svname = reg_scan_name(pRExC_state,
5649 SIZE_ONLY ? /* reverse test from the others */
5650 REG_RSN_RETURN_NAME :
5651 REG_RSN_RETURN_NULL);
5652 if (RExC_parse == name_start) {
5654 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5657 if (*RExC_parse != paren)
5658 vFAIL2("Sequence (?%c... not terminated",
5659 paren=='>' ? '<' : paren);
5663 if (!svname) /* shouldnt happen */
5665 "panic: reg_scan_name returned NULL");
5666 if (!RExC_paren_names) {
5667 RExC_paren_names= newHV();
5668 sv_2mortal(MUTABLE_SV(RExC_paren_names));
5670 RExC_paren_name_list= newAV();
5671 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
5674 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
5676 sv_dat = HeVAL(he_str);
5678 /* croak baby croak */
5680 "panic: paren_name hash element allocation failed");
5681 } else if ( SvPOK(sv_dat) ) {
5682 /* (?|...) can mean we have dupes so scan to check
5683 its already been stored. Maybe a flag indicating
5684 we are inside such a construct would be useful,
5685 but the arrays are likely to be quite small, so
5686 for now we punt -- dmq */
5687 IV count = SvIV(sv_dat);
5688 I32 *pv = (I32*)SvPVX(sv_dat);
5690 for ( i = 0 ; i < count ; i++ ) {
5691 if ( pv[i] == RExC_npar ) {
5697 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
5698 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
5699 pv[count] = RExC_npar;
5700 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
5703 (void)SvUPGRADE(sv_dat,SVt_PVNV);
5704 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
5706 SvIV_set(sv_dat, 1);
5709 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
5710 SvREFCNT_dec(svname);
5713 /*sv_dump(sv_dat);*/
5715 nextchar(pRExC_state);
5717 goto capturing_parens;
5719 RExC_seen |= REG_SEEN_LOOKBEHIND;
5721 case '=': /* (?=...) */
5722 RExC_seen_zerolen++;
5724 case '!': /* (?!...) */
5725 RExC_seen_zerolen++;
5726 if (*RExC_parse == ')') {
5727 ret=reg_node(pRExC_state, OPFAIL);
5728 nextchar(pRExC_state);
5732 case '|': /* (?|...) */
5733 /* branch reset, behave like a (?:...) except that
5734 buffers in alternations share the same numbers */
5736 after_freeze = freeze_paren = RExC_npar;
5738 case ':': /* (?:...) */
5739 case '>': /* (?>...) */
5741 case '$': /* (?$...) */
5742 case '@': /* (?@...) */
5743 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
5745 case '#': /* (?#...) */
5746 while (*RExC_parse && *RExC_parse != ')')
5748 if (*RExC_parse != ')')
5749 FAIL("Sequence (?#... not terminated");
5750 nextchar(pRExC_state);
5753 case '0' : /* (?0) */
5754 case 'R' : /* (?R) */
5755 if (*RExC_parse != ')')
5756 FAIL("Sequence (?R) not terminated");
5757 ret = reg_node(pRExC_state, GOSTART);
5758 *flagp |= POSTPONED;
5759 nextchar(pRExC_state);
5762 { /* named and numeric backreferences */
5764 case '&': /* (?&NAME) */
5765 parse_start = RExC_parse - 1;
5768 SV *sv_dat = reg_scan_name(pRExC_state,
5769 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5770 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5772 goto gen_recurse_regop;
5775 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5777 vFAIL("Illegal pattern");
5779 goto parse_recursion;
5781 case '-': /* (?-1) */
5782 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5783 RExC_parse--; /* rewind to let it be handled later */
5787 case '1': case '2': case '3': case '4': /* (?1) */
5788 case '5': case '6': case '7': case '8': case '9':
5791 num = atoi(RExC_parse);
5792 parse_start = RExC_parse - 1; /* MJD */
5793 if (*RExC_parse == '-')
5795 while (isDIGIT(*RExC_parse))
5797 if (*RExC_parse!=')')
5798 vFAIL("Expecting close bracket");
5801 if ( paren == '-' ) {
5803 Diagram of capture buffer numbering.
5804 Top line is the normal capture buffer numbers
5805 Botton line is the negative indexing as from
5809 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
5813 num = RExC_npar + num;
5816 vFAIL("Reference to nonexistent group");
5818 } else if ( paren == '+' ) {
5819 num = RExC_npar + num - 1;
5822 ret = reganode(pRExC_state, GOSUB, num);
5824 if (num > (I32)RExC_rx->nparens) {
5826 vFAIL("Reference to nonexistent group");
5828 ARG2L_SET( ret, RExC_recurse_count++);
5830 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5831 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
5835 RExC_seen |= REG_SEEN_RECURSE;
5836 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
5837 Set_Node_Offset(ret, parse_start); /* MJD */
5839 *flagp |= POSTPONED;
5840 nextchar(pRExC_state);
5842 } /* named and numeric backreferences */
5845 case '?': /* (??...) */
5847 if (*RExC_parse != '{') {
5849 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5852 *flagp |= POSTPONED;
5853 paren = *RExC_parse++;
5855 case '{': /* (?{...}) */
5860 char *s = RExC_parse;
5862 RExC_seen_zerolen++;
5863 RExC_seen |= REG_SEEN_EVAL;
5864 while (count && (c = *RExC_parse)) {
5875 if (*RExC_parse != ')') {
5877 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
5881 OP_4tree *sop, *rop;
5882 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
5885 Perl_save_re_context(aTHX);
5886 rop = sv_compile_2op(sv, &sop, "re", &pad);
5887 sop->op_private |= OPpREFCOUNTED;
5888 /* re_dup will OpREFCNT_inc */
5889 OpREFCNT_set(sop, 1);
5892 n = add_data(pRExC_state, 3, "nop");
5893 RExC_rxi->data->data[n] = (void*)rop;
5894 RExC_rxi->data->data[n+1] = (void*)sop;
5895 RExC_rxi->data->data[n+2] = (void*)pad;
5898 else { /* First pass */
5899 if (PL_reginterp_cnt < ++RExC_seen_evals
5901 /* No compiled RE interpolated, has runtime
5902 components ===> unsafe. */
5903 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5904 if (PL_tainting && PL_tainted)
5905 FAIL("Eval-group in insecure regular expression");
5906 #if PERL_VERSION > 8
5907 if (IN_PERL_COMPILETIME)
5912 nextchar(pRExC_state);
5914 ret = reg_node(pRExC_state, LOGICAL);
5917 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
5918 /* deal with the length of this later - MJD */
5921 ret = reganode(pRExC_state, EVAL, n);
5922 Set_Node_Length(ret, RExC_parse - parse_start + 1);
5923 Set_Node_Offset(ret, parse_start);
5926 case '(': /* (?(?{...})...) and (?(?=...)...) */
5929 if (RExC_parse[0] == '?') { /* (?(?...)) */
5930 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
5931 || RExC_parse[1] == '<'
5932 || RExC_parse[1] == '{') { /* Lookahead or eval. */
5935 ret = reg_node(pRExC_state, LOGICAL);
5938 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
5942 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
5943 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
5945 char ch = RExC_parse[0] == '<' ? '>' : '\'';
5946 char *name_start= RExC_parse++;
5948 SV *sv_dat=reg_scan_name(pRExC_state,
5949 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5950 if (RExC_parse == name_start || *RExC_parse != ch)
5951 vFAIL2("Sequence (?(%c... not terminated",
5952 (ch == '>' ? '<' : ch));
5955 num = add_data( pRExC_state, 1, "S" );
5956 RExC_rxi->data->data[num]=(void*)sv_dat;
5957 SvREFCNT_inc_simple_void(sv_dat);
5959 ret = reganode(pRExC_state,NGROUPP,num);
5960 goto insert_if_check_paren;
5962 else if (RExC_parse[0] == 'D' &&
5963 RExC_parse[1] == 'E' &&
5964 RExC_parse[2] == 'F' &&
5965 RExC_parse[3] == 'I' &&
5966 RExC_parse[4] == 'N' &&
5967 RExC_parse[5] == 'E')
5969 ret = reganode(pRExC_state,DEFINEP,0);
5972 goto insert_if_check_paren;
5974 else if (RExC_parse[0] == 'R') {
5977 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5978 parno = atoi(RExC_parse++);
5979 while (isDIGIT(*RExC_parse))
5981 } else if (RExC_parse[0] == '&') {
5984 sv_dat = reg_scan_name(pRExC_state,
5985 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5986 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5988 ret = reganode(pRExC_state,INSUBP,parno);
5989 goto insert_if_check_paren;
5991 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5994 parno = atoi(RExC_parse++);
5996 while (isDIGIT(*RExC_parse))
5998 ret = reganode(pRExC_state, GROUPP, parno);
6000 insert_if_check_paren:
6001 if ((c = *nextchar(pRExC_state)) != ')')
6002 vFAIL("Switch condition not recognized");
6004 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
6005 br = regbranch(pRExC_state, &flags, 1,depth+1);
6007 br = reganode(pRExC_state, LONGJMP, 0);
6009 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
6010 c = *nextchar(pRExC_state);
6015 vFAIL("(?(DEFINE)....) does not allow branches");
6016 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
6017 regbranch(pRExC_state, &flags, 1,depth+1);
6018 REGTAIL(pRExC_state, ret, lastbr);
6021 c = *nextchar(pRExC_state);
6026 vFAIL("Switch (?(condition)... contains too many branches");
6027 ender = reg_node(pRExC_state, TAIL);
6028 REGTAIL(pRExC_state, br, ender);
6030 REGTAIL(pRExC_state, lastbr, ender);
6031 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
6034 REGTAIL(pRExC_state, ret, ender);
6035 RExC_size++; /* XXX WHY do we need this?!!
6036 For large programs it seems to be required
6037 but I can't figure out why. -- dmq*/
6041 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
6045 RExC_parse--; /* for vFAIL to print correctly */
6046 vFAIL("Sequence (? incomplete");
6050 parse_flags: /* (?i) */
6052 U32 posflags = 0, negflags = 0;
6053 U32 *flagsp = &posflags;
6055 while (*RExC_parse) {
6056 /* && strchr("iogcmsx", *RExC_parse) */
6057 /* (?g), (?gc) and (?o) are useless here
6058 and must be globally applied -- japhy */
6059 switch (*RExC_parse) {
6060 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
6061 case ONCE_PAT_MOD: /* 'o' */
6062 case GLOBAL_PAT_MOD: /* 'g' */
6063 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6064 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
6065 if (! (wastedflags & wflagbit) ) {
6066 wastedflags |= wflagbit;
6069 "Useless (%s%c) - %suse /%c modifier",
6070 flagsp == &negflags ? "?-" : "?",
6072 flagsp == &negflags ? "don't " : "",
6079 case CONTINUE_PAT_MOD: /* 'c' */
6080 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6081 if (! (wastedflags & WASTED_C) ) {
6082 wastedflags |= WASTED_GC;
6085 "Useless (%sc) - %suse /gc modifier",
6086 flagsp == &negflags ? "?-" : "?",
6087 flagsp == &negflags ? "don't " : ""
6092 case KEEPCOPY_PAT_MOD: /* 'p' */
6093 if (flagsp == &negflags) {
6095 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
6097 *flagsp |= RXf_PMf_KEEPCOPY;
6101 if (flagsp == &negflags) {
6103 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6107 wastedflags = 0; /* reset so (?g-c) warns twice */
6113 RExC_flags |= posflags;
6114 RExC_flags &= ~negflags;
6116 oregflags |= posflags;
6117 oregflags &= ~negflags;
6119 nextchar(pRExC_state);
6130 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6135 }} /* one for the default block, one for the switch */
6142 ret = reganode(pRExC_state, OPEN, parno);
6145 RExC_nestroot = parno;
6146 if (RExC_seen & REG_SEEN_RECURSE
6147 && !RExC_open_parens[parno-1])
6149 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6150 "Setting open paren #%"IVdf" to %d\n",
6151 (IV)parno, REG_NODE_NUM(ret)));
6152 RExC_open_parens[parno-1]= ret;
6155 Set_Node_Length(ret, 1); /* MJD */
6156 Set_Node_Offset(ret, RExC_parse); /* MJD */
6164 /* Pick up the branches, linking them together. */
6165 parse_start = RExC_parse; /* MJD */
6166 br = regbranch(pRExC_state, &flags, 1,depth+1);
6169 if (RExC_npar > after_freeze)
6170 after_freeze = RExC_npar;
6171 RExC_npar = freeze_paren;
6174 /* branch_len = (paren != 0); */
6178 if (*RExC_parse == '|') {
6179 if (!SIZE_ONLY && RExC_extralen) {
6180 reginsert(pRExC_state, BRANCHJ, br, depth+1);
6183 reginsert(pRExC_state, BRANCH, br, depth+1);
6184 Set_Node_Length(br, paren != 0);
6185 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
6189 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
6191 else if (paren == ':') {
6192 *flagp |= flags&SIMPLE;
6194 if (is_open) { /* Starts with OPEN. */
6195 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
6197 else if (paren != '?') /* Not Conditional */
6199 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6201 while (*RExC_parse == '|') {
6202 if (!SIZE_ONLY && RExC_extralen) {
6203 ender = reganode(pRExC_state, LONGJMP,0);
6204 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
6207 RExC_extralen += 2; /* Account for LONGJMP. */
6208 nextchar(pRExC_state);
6210 if (RExC_npar > after_freeze)
6211 after_freeze = RExC_npar;
6212 RExC_npar = freeze_paren;
6214 br = regbranch(pRExC_state, &flags, 0, depth+1);
6218 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
6220 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6223 if (have_branch || paren != ':') {
6224 /* Make a closing node, and hook it on the end. */
6227 ender = reg_node(pRExC_state, TAIL);
6230 ender = reganode(pRExC_state, CLOSE, parno);
6231 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
6232 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6233 "Setting close paren #%"IVdf" to %d\n",
6234 (IV)parno, REG_NODE_NUM(ender)));
6235 RExC_close_parens[parno-1]= ender;
6236 if (RExC_nestroot == parno)
6239 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
6240 Set_Node_Length(ender,1); /* MJD */
6246 *flagp &= ~HASWIDTH;
6249 ender = reg_node(pRExC_state, SUCCEED);
6252 ender = reg_node(pRExC_state, END);
6254 assert(!RExC_opend); /* there can only be one! */
6259 REGTAIL(pRExC_state, lastbr, ender);
6261 if (have_branch && !SIZE_ONLY) {
6263 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6265 /* Hook the tails of the branches to the closing node. */
6266 for (br = ret; br; br = regnext(br)) {
6267 const U8 op = PL_regkind[OP(br)];
6269 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
6271 else if (op == BRANCHJ) {
6272 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
6280 static const char parens[] = "=!<,>";
6282 if (paren && (p = strchr(parens, paren))) {
6283 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
6284 int flag = (p - parens) > 1;
6287 node = SUSPEND, flag = 0;
6288 reginsert(pRExC_state, node,ret, depth+1);
6289 Set_Node_Cur_Length(ret);
6290 Set_Node_Offset(ret, parse_start + 1);
6292 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
6296 /* Check for proper termination. */
6298 RExC_flags = oregflags;
6299 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
6300 RExC_parse = oregcomp_parse;
6301 vFAIL("Unmatched (");
6304 else if (!paren && RExC_parse < RExC_end) {
6305 if (*RExC_parse == ')') {
6307 vFAIL("Unmatched )");
6310 FAIL("Junk on end of regexp"); /* "Can't happen". */
6314 RExC_npar = after_freeze;
6319 - regbranch - one alternative of an | operator
6321 * Implements the concatenation operator.
6324 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
6327 register regnode *ret;
6328 register regnode *chain = NULL;
6329 register regnode *latest;
6330 I32 flags = 0, c = 0;
6331 GET_RE_DEBUG_FLAGS_DECL;
6333 PERL_ARGS_ASSERT_REGBRANCH;
6335 DEBUG_PARSE("brnc");
6340 if (!SIZE_ONLY && RExC_extralen)
6341 ret = reganode(pRExC_state, BRANCHJ,0);
6343 ret = reg_node(pRExC_state, BRANCH);
6344 Set_Node_Length(ret, 1);
6348 if (!first && SIZE_ONLY)
6349 RExC_extralen += 1; /* BRANCHJ */
6351 *flagp = WORST; /* Tentatively. */
6354 nextchar(pRExC_state);
6355 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
6357 latest = regpiece(pRExC_state, &flags,depth+1);
6358 if (latest == NULL) {
6359 if (flags & TRYAGAIN)
6363 else if (ret == NULL)
6365 *flagp |= flags&(HASWIDTH|POSTPONED);
6366 if (chain == NULL) /* First piece. */
6367 *flagp |= flags&SPSTART;
6370 REGTAIL(pRExC_state, chain, latest);
6375 if (chain == NULL) { /* Loop ran zero times. */
6376 chain = reg_node(pRExC_state, NOTHING);
6381 *flagp |= flags&SIMPLE;
6388 - regpiece - something followed by possible [*+?]
6390 * Note that the branching code sequences used for ? and the general cases
6391 * of * and + are somewhat optimized: they use the same NOTHING node as
6392 * both the endmarker for their branch list and the body of the last branch.
6393 * It might seem that this node could be dispensed with entirely, but the
6394 * endmarker role is not redundant.
6397 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6400 register regnode *ret;
6402 register char *next;
6404 const char * const origparse = RExC_parse;
6406 I32 max = REG_INFTY;
6408 const char *maxpos = NULL;
6409 GET_RE_DEBUG_FLAGS_DECL;
6411 PERL_ARGS_ASSERT_REGPIECE;
6413 DEBUG_PARSE("piec");
6415 ret = regatom(pRExC_state, &flags,depth+1);
6417 if (flags & TRYAGAIN)
6424 if (op == '{' && regcurly(RExC_parse)) {
6426 parse_start = RExC_parse; /* MJD */
6427 next = RExC_parse + 1;
6428 while (isDIGIT(*next) || *next == ',') {
6437 if (*next == '}') { /* got one */
6441 min = atoi(RExC_parse);
6445 maxpos = RExC_parse;
6447 if (!max && *maxpos != '0')
6448 max = REG_INFTY; /* meaning "infinity" */
6449 else if (max >= REG_INFTY)
6450 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
6452 nextchar(pRExC_state);
6455 if ((flags&SIMPLE)) {
6456 RExC_naughty += 2 + RExC_naughty / 2;
6457 reginsert(pRExC_state, CURLY, ret, depth+1);
6458 Set_Node_Offset(ret, parse_start+1); /* MJD */
6459 Set_Node_Cur_Length(ret);
6462 regnode * const w = reg_node(pRExC_state, WHILEM);
6465 REGTAIL(pRExC_state, ret, w);
6466 if (!SIZE_ONLY && RExC_extralen) {
6467 reginsert(pRExC_state, LONGJMP,ret, depth+1);
6468 reginsert(pRExC_state, NOTHING,ret, depth+1);
6469 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
6471 reginsert(pRExC_state, CURLYX,ret, depth+1);
6473 Set_Node_Offset(ret, parse_start+1);
6474 Set_Node_Length(ret,
6475 op == '{' ? (RExC_parse - parse_start) : 1);
6477 if (!SIZE_ONLY && RExC_extralen)
6478 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
6479 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
6481 RExC_whilem_seen++, RExC_extralen += 3;
6482 RExC_naughty += 4 + RExC_naughty; /* compound interest */
6491 vFAIL("Can't do {n,m} with n > m");
6493 ARG1_SET(ret, (U16)min);
6494 ARG2_SET(ret, (U16)max);
6506 #if 0 /* Now runtime fix should be reliable. */
6508 /* if this is reinstated, don't forget to put this back into perldiag:
6510 =item Regexp *+ operand could be empty at {#} in regex m/%s/
6512 (F) The part of the regexp subject to either the * or + quantifier
6513 could match an empty string. The {#} shows in the regular
6514 expression about where the problem was discovered.
6518 if (!(flags&HASWIDTH) && op != '?')
6519 vFAIL("Regexp *+ operand could be empty");
6522 parse_start = RExC_parse;
6523 nextchar(pRExC_state);
6525 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
6527 if (op == '*' && (flags&SIMPLE)) {
6528 reginsert(pRExC_state, STAR, ret, depth+1);
6532 else if (op == '*') {
6536 else if (op == '+' && (flags&SIMPLE)) {
6537 reginsert(pRExC_state, PLUS, ret, depth+1);
6541 else if (op == '+') {
6545 else if (op == '?') {
6550 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
6551 ckWARN3reg(RExC_parse,
6552 "%.*s matches null string many times",
6553 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
6557 if (RExC_parse < RExC_end && *RExC_parse == '?') {
6558 nextchar(pRExC_state);
6559 reginsert(pRExC_state, MINMOD, ret, depth+1);
6560 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
6562 #ifndef REG_ALLOW_MINMOD_SUSPEND
6565 if (RExC_parse < RExC_end && *RExC_parse == '+') {
6567 nextchar(pRExC_state);
6568 ender = reg_node(pRExC_state, SUCCEED);
6569 REGTAIL(pRExC_state, ret, ender);
6570 reginsert(pRExC_state, SUSPEND, ret, depth+1);
6572 ender = reg_node(pRExC_state, TAIL);
6573 REGTAIL(pRExC_state, ret, ender);
6577 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
6579 vFAIL("Nested quantifiers");
6586 /* reg_namedseq(pRExC_state,UVp)
6588 This is expected to be called by a parser routine that has
6589 recognized '\N' and needs to handle the rest. RExC_parse is
6590 expected to point at the first char following the N at the time
6593 If valuep is non-null then it is assumed that we are parsing inside
6594 of a charclass definition and the first codepoint in the resolved
6595 string is returned via *valuep and the routine will return NULL.
6596 In this mode if a multichar string is returned from the charnames
6597 handler a warning will be issued, and only the first char in the
6598 sequence will be examined. If the string returned is zero length
6599 then the value of *valuep is undefined and NON-NULL will
6600 be returned to indicate failure. (This will NOT be a valid pointer
6603 If valuep is null then it is assumed that we are parsing normal text
6604 and inserts a new EXACT node into the program containing the resolved
6605 string and returns a pointer to the new node. If the string is
6606 zerolength a NOTHING node is emitted.
6608 On success RExC_parse is set to the char following the endbrace.
6609 Parsing failures will generate a fatal errorvia vFAIL(...)
6611 NOTE: We cache all results from the charnames handler locally in
6612 the RExC_charnames hash (created on first use) to prevent a charnames
6613 handler from playing silly-buggers and returning a short string and
6614 then a long string for a given pattern. Since the regexp program
6615 size is calculated during an initial parse this would result
6616 in a buffer overrun so we cache to prevent the charname result from
6617 changing during the course of the parse.
6621 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
6623 char * name; /* start of the content of the name */
6624 char * endbrace; /* endbrace following the name */
6627 STRLEN len; /* this has various purposes throughout the code */
6628 bool cached = 0; /* if this is true then we shouldn't refcount dev sv_str */
6629 regnode *ret = NULL;
6631 PERL_ARGS_ASSERT_REG_NAMEDSEQ;
6633 if (*RExC_parse != '{' ||
6634 (*RExC_parse == '{' && RExC_parse[1]
6635 && strchr("0123456789", RExC_parse[1])))
6637 GET_RE_DEBUG_FLAGS_DECL;
6639 /* no bare \N in a charclass */
6640 vFAIL("Missing braces on \\N{}");
6642 nextchar(pRExC_state);
6643 ret = reg_node(pRExC_state, REG_ANY);
6644 *flagp |= HASWIDTH|SIMPLE;
6647 Set_Node_Length(ret, 1); /* MJD */
6650 name = RExC_parse+1;
6651 endbrace = strchr(RExC_parse, '}');
6654 vFAIL("Missing right brace on \\N{}");
6656 RExC_parse = endbrace + 1;
6659 /* RExC_parse points at the beginning brace,
6660 endbrace points at the last */
6661 if ( name[0]=='U' && name[1]=='+' ) {
6662 /* its a "Unicode hex" notation {U+89AB} */
6663 I32 fl = PERL_SCAN_ALLOW_UNDERSCORES
6664 | PERL_SCAN_DISALLOW_PREFIX
6665 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
6667 len = (STRLEN)(endbrace - name - 2);
6668 cp = grok_hex(name + 2, &len, &fl, NULL);
6669 if ( len != (STRLEN)(endbrace - name - 2) ) {
6673 if (cp > 0xff) RExC_utf8 = 1;
6678 /* Need to convert to utf8 if either: won't fit into a byte, or the re
6679 * is going to be in utf8 and the representation changes under utf8. */
6680 if (cp > 0xff || (RExC_utf8 && ! UNI_IS_INVARIANT(cp))) {
6681 U8 string[UTF8_MAXBYTES+1];
6684 tmps = uvuni_to_utf8(string, cp);
6685 sv_str = newSVpvn_utf8((char*)string, tmps - string, TRUE);
6686 } else { /* Otherwise, no need for utf8, can skip that step */
6689 sv_str= newSVpvn(&string, 1);
6692 /* fetch the charnames handler for this scope */
6693 HV * const table = GvHV(PL_hintgv);
6695 hv_fetchs(table, "charnames", FALSE) :
6697 SV *cv= cvp ? *cvp : NULL;
6700 /* create an SV with the name as argument */
6701 sv_name = newSVpvn(name, endbrace - name);
6703 if (!table || !(PL_hints & HINT_LOCALIZE_HH)) {
6704 vFAIL2("Constant(\\N{%" SVf "}) unknown: "
6705 "(possibly a missing \"use charnames ...\")",
6708 if (!cvp || !SvOK(*cvp)) { /* when $^H{charnames} = undef; */
6709 vFAIL2("Constant(\\N{%" SVf "}): "
6710 "$^H{charnames} is not defined", SVfARG(sv_name));
6715 if (!RExC_charnames) {
6716 /* make sure our cache is allocated */
6717 RExC_charnames = newHV();
6718 sv_2mortal(MUTABLE_SV(RExC_charnames));
6720 /* see if we have looked this one up before */
6721 he_str = hv_fetch_ent( RExC_charnames, sv_name, 0, 0 );
6723 sv_str = HeVAL(he_str);
6725 } else if (PL_parser && PL_parser->error_count > 0) {
6726 /* Don't attempt to load charnames if we're already in error */
6727 vFAIL("Too many errors, cannot continue parsing");
6739 count= call_sv(cv, G_SCALAR);
6742 if (count == 1) { /* XXXX is this right? dmq */
6744 SvREFCNT_inc_simple_void(sv_str);
6751 if ( !sv_str || !SvOK(sv_str) ) {
6752 vFAIL2("Constant(\\N{%" SVf "}): Call to &{$^H{charnames}} "
6753 "did not return a defined value", SVfARG(sv_name));
6755 if (hv_store_ent( RExC_charnames, sv_name, sv_str, 0))
6760 char *p = SvPV(sv_str, len);
6763 if ( SvUTF8(sv_str) ) {
6764 *valuep = utf8_to_uvchr((U8*)p, &numlen);
6768 We have to turn on utf8 for high bit chars otherwise
6769 we get failures with
6771 "ss" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
6772 "SS" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
6774 This is different from what \x{} would do with the same
6775 codepoint, where the condition is > 0xFF.
6782 /* warn if we havent used the whole string? */
6784 if (numlen<len && SIZE_ONLY) {
6785 ckWARN2reg(RExC_parse,
6786 "Ignoring excess chars from \\N{%" SVf "} in character class",
6790 } else if (SIZE_ONLY) {
6791 ckWARN2reg(RExC_parse,
6792 "Ignoring zero length \\N{%" SVf "} in character class",
6796 SvREFCNT_dec(sv_name);
6798 SvREFCNT_dec(sv_str);
6799 return len ? NULL : (regnode *)&len;
6800 } else if(SvCUR(sv_str)) {
6806 char * parse_start = name-3; /* needed for the offsets */
6808 GET_RE_DEBUG_FLAGS_DECL; /* needed for the offsets */
6810 ret = reg_node(pRExC_state,
6811 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
6814 if ( RExC_utf8 && !SvUTF8(sv_str) ) {
6815 sv_utf8_upgrade(sv_str);
6816 } else if ( !RExC_utf8 && SvUTF8(sv_str) ) {
6820 p = SvPV(sv_str, len);
6822 /* len is the length written, charlen is the size the char read */
6823 for ( len = 0; p < pend; p += charlen ) {
6825 UV uvc = utf8_to_uvchr((U8*)p, &charlen);
6827 STRLEN foldlen,numlen;
6828 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
6829 uvc = toFOLD_uni(uvc, tmpbuf, &foldlen);
6830 /* Emit all the Unicode characters. */
6832 for (foldbuf = tmpbuf;
6836 uvc = utf8_to_uvchr(foldbuf, &numlen);
6838 const STRLEN unilen = reguni(pRExC_state, uvc, s);
6841 /* In EBCDIC the numlen
6842 * and unilen can differ. */
6844 if (numlen >= foldlen)
6848 break; /* "Can't happen." */
6851 const STRLEN unilen = reguni(pRExC_state, uvc, s);
6863 RExC_size += STR_SZ(len);
6866 RExC_emit += STR_SZ(len);
6868 Set_Node_Cur_Length(ret); /* MJD */
6870 nextchar(pRExC_state);
6871 } else { /* zero length */
6872 ret = reg_node(pRExC_state,NOTHING);
6874 SvREFCNT_dec(sv_name);
6876 SvREFCNT_dec(sv_str);
6885 * It returns the code point in utf8 for the value in *encp.
6886 * value: a code value in the source encoding
6887 * encp: a pointer to an Encode object
6889 * If the result from Encode is not a single character,
6890 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
6893 S_reg_recode(pTHX_ const char value, SV **encp)
6896 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
6897 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
6898 const STRLEN newlen = SvCUR(sv);
6899 UV uv = UNICODE_REPLACEMENT;
6901 PERL_ARGS_ASSERT_REG_RECODE;
6905 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
6908 if (!newlen || numlen != newlen) {
6909 uv = UNICODE_REPLACEMENT;
6917 - regatom - the lowest level
6919 Try to identify anything special at the start of the pattern. If there
6920 is, then handle it as required. This may involve generating a single regop,
6921 such as for an assertion; or it may involve recursing, such as to
6922 handle a () structure.
6924 If the string doesn't start with something special then we gobble up
6925 as much literal text as we can.
6927 Once we have been able to handle whatever type of thing started the
6928 sequence, we return.
6930 Note: we have to be careful with escapes, as they can be both literal
6931 and special, and in the case of \10 and friends can either, depending
6932 on context. Specifically there are two seperate switches for handling
6933 escape sequences, with the one for handling literal escapes requiring
6934 a dummy entry for all of the special escapes that are actually handled
6939 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6942 register regnode *ret = NULL;
6944 char *parse_start = RExC_parse;
6945 GET_RE_DEBUG_FLAGS_DECL;
6946 DEBUG_PARSE("atom");
6947 *flagp = WORST; /* Tentatively. */
6949 PERL_ARGS_ASSERT_REGATOM;
6952 switch ((U8)*RExC_parse) {
6954 RExC_seen_zerolen++;
6955 nextchar(pRExC_state);
6956 if (RExC_flags & RXf_PMf_MULTILINE)
6957 ret = reg_node(pRExC_state, MBOL);
6958 else if (RExC_flags & RXf_PMf_SINGLELINE)
6959 ret = reg_node(pRExC_state, SBOL);
6961 ret = reg_node(pRExC_state, BOL);
6962 Set_Node_Length(ret, 1); /* MJD */
6965 nextchar(pRExC_state);
6967 RExC_seen_zerolen++;
6968 if (RExC_flags & RXf_PMf_MULTILINE)
6969 ret = reg_node(pRExC_state, MEOL);
6970 else if (RExC_flags & RXf_PMf_SINGLELINE)
6971 ret = reg_node(pRExC_state, SEOL);
6973 ret = reg_node(pRExC_state, EOL);
6974 Set_Node_Length(ret, 1); /* MJD */
6977 nextchar(pRExC_state);
6978 if (RExC_flags & RXf_PMf_SINGLELINE)
6979 ret = reg_node(pRExC_state, SANY);
6981 ret = reg_node(pRExC_state, REG_ANY);
6982 *flagp |= HASWIDTH|SIMPLE;
6984 Set_Node_Length(ret, 1); /* MJD */
6988 char * const oregcomp_parse = ++RExC_parse;
6989 ret = regclass(pRExC_state,depth+1);
6990 if (*RExC_parse != ']') {
6991 RExC_parse = oregcomp_parse;
6992 vFAIL("Unmatched [");
6994 nextchar(pRExC_state);
6995 *flagp |= HASWIDTH|SIMPLE;
6996 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
7000 nextchar(pRExC_state);
7001 ret = reg(pRExC_state, 1, &flags,depth+1);
7003 if (flags & TRYAGAIN) {
7004 if (RExC_parse == RExC_end) {
7005 /* Make parent create an empty node if needed. */
7013 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
7017 if (flags & TRYAGAIN) {
7021 vFAIL("Internal urp");
7022 /* Supposed to be caught earlier. */
7025 if (!regcurly(RExC_parse)) {
7034 vFAIL("Quantifier follows nothing");
7042 len=0; /* silence a spurious compiler warning */
7043 if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) {
7044 *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */
7045 RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */
7046 ret = reganode(pRExC_state, FOLDCHAR, cp);
7047 Set_Node_Length(ret, 1); /* MJD */
7048 nextchar(pRExC_state); /* kill whitespace under /x */
7056 This switch handles escape sequences that resolve to some kind
7057 of special regop and not to literal text. Escape sequnces that
7058 resolve to literal text are handled below in the switch marked
7061 Every entry in this switch *must* have a corresponding entry
7062 in the literal escape switch. However, the opposite is not
7063 required, as the default for this switch is to jump to the
7064 literal text handling code.
7066 switch ((U8)*++RExC_parse) {
7071 /* Special Escapes */
7073 RExC_seen_zerolen++;
7074 ret = reg_node(pRExC_state, SBOL);
7076 goto finish_meta_pat;
7078 ret = reg_node(pRExC_state, GPOS);
7079 RExC_seen |= REG_SEEN_GPOS;
7081 goto finish_meta_pat;
7083 RExC_seen_zerolen++;
7084 ret = reg_node(pRExC_state, KEEPS);
7086 /* XXX:dmq : disabling in-place substitution seems to
7087 * be necessary here to avoid cases of memory corruption, as
7088 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
7090 RExC_seen |= REG_SEEN_LOOKBEHIND;
7091 goto finish_meta_pat;
7093 ret = reg_node(pRExC_state, SEOL);
7095 RExC_seen_zerolen++; /* Do not optimize RE away */
7096 goto finish_meta_pat;
7098 ret = reg_node(pRExC_state, EOS);
7100 RExC_seen_zerolen++; /* Do not optimize RE away */
7101 goto finish_meta_pat;
7103 ret = reg_node(pRExC_state, CANY);
7104 RExC_seen |= REG_SEEN_CANY;
7105 *flagp |= HASWIDTH|SIMPLE;
7106 goto finish_meta_pat;
7108 ret = reg_node(pRExC_state, CLUMP);
7110 goto finish_meta_pat;
7112 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
7113 *flagp |= HASWIDTH|SIMPLE;
7114 goto finish_meta_pat;
7116 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
7117 *flagp |= HASWIDTH|SIMPLE;
7118 goto finish_meta_pat;
7120 RExC_seen_zerolen++;
7121 RExC_seen |= REG_SEEN_LOOKBEHIND;
7122 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
7124 goto finish_meta_pat;
7126 RExC_seen_zerolen++;
7127 RExC_seen |= REG_SEEN_LOOKBEHIND;
7128 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
7130 goto finish_meta_pat;
7132 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
7133 *flagp |= HASWIDTH|SIMPLE;
7134 goto finish_meta_pat;
7136 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
7137 *flagp |= HASWIDTH|SIMPLE;
7138 goto finish_meta_pat;
7140 ret = reg_node(pRExC_state, DIGIT);
7141 *flagp |= HASWIDTH|SIMPLE;
7142 goto finish_meta_pat;
7144 ret = reg_node(pRExC_state, NDIGIT);
7145 *flagp |= HASWIDTH|SIMPLE;
7146 goto finish_meta_pat;
7148 ret = reg_node(pRExC_state, LNBREAK);
7149 *flagp |= HASWIDTH|SIMPLE;
7150 goto finish_meta_pat;
7152 ret = reg_node(pRExC_state, HORIZWS);
7153 *flagp |= HASWIDTH|SIMPLE;
7154 goto finish_meta_pat;
7156 ret = reg_node(pRExC_state, NHORIZWS);
7157 *flagp |= HASWIDTH|SIMPLE;
7158 goto finish_meta_pat;
7160 ret = reg_node(pRExC_state, VERTWS);
7161 *flagp |= HASWIDTH|SIMPLE;
7162 goto finish_meta_pat;
7164 ret = reg_node(pRExC_state, NVERTWS);
7165 *flagp |= HASWIDTH|SIMPLE;
7167 nextchar(pRExC_state);
7168 Set_Node_Length(ret, 2); /* MJD */
7173 char* const oldregxend = RExC_end;
7175 char* parse_start = RExC_parse - 2;
7178 if (RExC_parse[1] == '{') {
7179 /* a lovely hack--pretend we saw [\pX] instead */
7180 RExC_end = strchr(RExC_parse, '}');
7182 const U8 c = (U8)*RExC_parse;
7184 RExC_end = oldregxend;
7185 vFAIL2("Missing right brace on \\%c{}", c);
7190 RExC_end = RExC_parse + 2;
7191 if (RExC_end > oldregxend)
7192 RExC_end = oldregxend;
7196 ret = regclass(pRExC_state,depth+1);
7198 RExC_end = oldregxend;
7201 Set_Node_Offset(ret, parse_start + 2);
7202 Set_Node_Cur_Length(ret);
7203 nextchar(pRExC_state);
7204 *flagp |= HASWIDTH|SIMPLE;
7208 /* Handle \N and \N{NAME} here and not below because it can be
7209 multicharacter. join_exact() will join them up later on.
7210 Also this makes sure that things like /\N{BLAH}+/ and
7211 \N{BLAH} being multi char Just Happen. dmq*/
7213 ret= reg_namedseq(pRExC_state, NULL, flagp);
7215 case 'k': /* Handle \k<NAME> and \k'NAME' */
7218 char ch= RExC_parse[1];
7219 if (ch != '<' && ch != '\'' && ch != '{') {
7221 vFAIL2("Sequence %.2s... not terminated",parse_start);
7223 /* this pretty much dupes the code for (?P=...) in reg(), if
7224 you change this make sure you change that */
7225 char* name_start = (RExC_parse += 2);
7227 SV *sv_dat = reg_scan_name(pRExC_state,
7228 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7229 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
7230 if (RExC_parse == name_start || *RExC_parse != ch)
7231 vFAIL2("Sequence %.3s... not terminated",parse_start);
7234 num = add_data( pRExC_state, 1, "S" );
7235 RExC_rxi->data->data[num]=(void*)sv_dat;
7236 SvREFCNT_inc_simple_void(sv_dat);
7240 ret = reganode(pRExC_state,
7241 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
7245 /* override incorrect value set in reganode MJD */
7246 Set_Node_Offset(ret, parse_start+1);
7247 Set_Node_Cur_Length(ret); /* MJD */
7248 nextchar(pRExC_state);
7254 case '1': case '2': case '3': case '4':
7255 case '5': case '6': case '7': case '8': case '9':
7258 bool isg = *RExC_parse == 'g';
7263 if (*RExC_parse == '{') {
7267 if (*RExC_parse == '-') {
7271 if (hasbrace && !isDIGIT(*RExC_parse)) {
7272 if (isrel) RExC_parse--;
7274 goto parse_named_seq;
7276 num = atoi(RExC_parse);
7277 if (isg && num == 0)
7278 vFAIL("Reference to invalid group 0");
7280 num = RExC_npar - num;
7282 vFAIL("Reference to nonexistent or unclosed group");
7284 if (!isg && num > 9 && num >= RExC_npar)
7287 char * const parse_start = RExC_parse - 1; /* MJD */
7288 while (isDIGIT(*RExC_parse))
7290 if (parse_start == RExC_parse - 1)
7291 vFAIL("Unterminated \\g... pattern");
7293 if (*RExC_parse != '}')
7294 vFAIL("Unterminated \\g{...} pattern");
7298 if (num > (I32)RExC_rx->nparens)
7299 vFAIL("Reference to nonexistent group");
7302 ret = reganode(pRExC_state,
7303 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
7307 /* override incorrect value set in reganode MJD */
7308 Set_Node_Offset(ret, parse_start+1);
7309 Set_Node_Cur_Length(ret); /* MJD */
7311 nextchar(pRExC_state);
7316 if (RExC_parse >= RExC_end)
7317 FAIL("Trailing \\");
7320 /* Do not generate "unrecognized" warnings here, we fall
7321 back into the quick-grab loop below */
7328 if (RExC_flags & RXf_PMf_EXTENDED) {
7329 if ( reg_skipcomment( pRExC_state ) )
7336 register STRLEN len;
7341 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7343 parse_start = RExC_parse - 1;
7349 ret = reg_node(pRExC_state,
7350 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
7352 for (len = 0, p = RExC_parse - 1;
7353 len < 127 && p < RExC_end;
7356 char * const oldp = p;
7358 if (RExC_flags & RXf_PMf_EXTENDED)
7359 p = regwhite( pRExC_state, p );
7364 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7365 goto normal_default;
7375 /* Literal Escapes Switch
7377 This switch is meant to handle escape sequences that
7378 resolve to a literal character.
7380 Every escape sequence that represents something
7381 else, like an assertion or a char class, is handled
7382 in the switch marked 'Special Escapes' above in this
7383 routine, but also has an entry here as anything that
7384 isn't explicitly mentioned here will be treated as
7385 an unescaped equivalent literal.
7389 /* These are all the special escapes. */
7393 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7394 goto normal_default;
7395 case 'A': /* Start assertion */
7396 case 'b': case 'B': /* Word-boundary assertion*/
7397 case 'C': /* Single char !DANGEROUS! */
7398 case 'd': case 'D': /* digit class */
7399 case 'g': case 'G': /* generic-backref, pos assertion */
7400 case 'h': case 'H': /* HORIZWS */
7401 case 'k': case 'K': /* named backref, keep marker */
7402 case 'N': /* named char sequence */
7403 case 'p': case 'P': /* Unicode property */
7404 case 'R': /* LNBREAK */
7405 case 's': case 'S': /* space class */
7406 case 'v': case 'V': /* VERTWS */
7407 case 'w': case 'W': /* word class */
7408 case 'X': /* eXtended Unicode "combining character sequence" */
7409 case 'z': case 'Z': /* End of line/string assertion */
7413 /* Anything after here is an escape that resolves to a
7414 literal. (Except digits, which may or may not)
7433 ender = ASCII_TO_NATIVE('\033');
7437 ender = ASCII_TO_NATIVE('\007');
7442 char* const e = strchr(p, '}');
7446 vFAIL("Missing right brace on \\x{}");
7449 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7450 | PERL_SCAN_DISALLOW_PREFIX;
7451 STRLEN numlen = e - p - 1;
7452 ender = grok_hex(p + 1, &numlen, &flags, NULL);
7459 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
7461 ender = grok_hex(p, &numlen, &flags, NULL);
7464 if (PL_encoding && ender < 0x100)
7465 goto recode_encoding;
7469 ender = UCHARAT(p++);
7470 ender = toCTRL(ender);
7472 case '0': case '1': case '2': case '3':case '4':
7473 case '5': case '6': case '7': case '8':case '9':
7475 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
7478 ender = grok_oct(p, &numlen, &flags, NULL);
7480 /* An octal above 0xff is interpreted differently
7481 * depending on if the re is in utf8 or not. If it
7482 * is in utf8, the value will be itself, otherwise
7483 * it is interpreted as modulo 0x100. It has been
7484 * decided to discourage the use of octal above the
7485 * single-byte range. For now, warn only when
7486 * it ends up modulo */
7487 if (SIZE_ONLY && ender >= 0x100
7488 && ! UTF && ! PL_encoding) {
7489 ckWARNregdep(p, "Use of octal value above 377 is deprecated");
7497 if (PL_encoding && ender < 0x100)
7498 goto recode_encoding;
7502 SV* enc = PL_encoding;
7503 ender = reg_recode((const char)(U8)ender, &enc);
7504 if (!enc && SIZE_ONLY)
7505 ckWARNreg(p, "Invalid escape in the specified encoding");
7511 FAIL("Trailing \\");
7514 if (!SIZE_ONLY&& isALPHA(*p))
7515 ckWARN2reg(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
7516 goto normal_default;
7521 if (UTF8_IS_START(*p) && UTF) {
7523 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
7524 &numlen, UTF8_ALLOW_DEFAULT);
7531 if ( RExC_flags & RXf_PMf_EXTENDED)
7532 p = regwhite( pRExC_state, p );
7534 /* Prime the casefolded buffer. */
7535 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
7537 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
7542 /* Emit all the Unicode characters. */
7544 for (foldbuf = tmpbuf;
7546 foldlen -= numlen) {
7547 ender = utf8_to_uvchr(foldbuf, &numlen);
7549 const STRLEN unilen = reguni(pRExC_state, ender, s);
7552 /* In EBCDIC the numlen
7553 * and unilen can differ. */
7555 if (numlen >= foldlen)
7559 break; /* "Can't happen." */
7563 const STRLEN unilen = reguni(pRExC_state, ender, s);
7572 REGC((char)ender, s++);
7578 /* Emit all the Unicode characters. */
7580 for (foldbuf = tmpbuf;
7582 foldlen -= numlen) {
7583 ender = utf8_to_uvchr(foldbuf, &numlen);
7585 const STRLEN unilen = reguni(pRExC_state, ender, s);
7588 /* In EBCDIC the numlen
7589 * and unilen can differ. */
7591 if (numlen >= foldlen)
7599 const STRLEN unilen = reguni(pRExC_state, ender, s);
7608 REGC((char)ender, s++);
7612 Set_Node_Cur_Length(ret); /* MJD */
7613 nextchar(pRExC_state);
7615 /* len is STRLEN which is unsigned, need to copy to signed */
7618 vFAIL("Internal disaster");
7622 if (len == 1 && UNI_IS_INVARIANT(ender))
7626 RExC_size += STR_SZ(len);
7629 RExC_emit += STR_SZ(len);
7639 S_regwhite( RExC_state_t *pRExC_state, char *p )
7641 const char *e = RExC_end;
7643 PERL_ARGS_ASSERT_REGWHITE;
7648 else if (*p == '#') {
7657 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
7665 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
7666 Character classes ([:foo:]) can also be negated ([:^foo:]).
7667 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
7668 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
7669 but trigger failures because they are currently unimplemented. */
7671 #define POSIXCC_DONE(c) ((c) == ':')
7672 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
7673 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
7676 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
7679 I32 namedclass = OOB_NAMEDCLASS;
7681 PERL_ARGS_ASSERT_REGPPOSIXCC;
7683 if (value == '[' && RExC_parse + 1 < RExC_end &&
7684 /* I smell either [: or [= or [. -- POSIX has been here, right? */
7685 POSIXCC(UCHARAT(RExC_parse))) {
7686 const char c = UCHARAT(RExC_parse);
7687 char* const s = RExC_parse++;
7689 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
7691 if (RExC_parse == RExC_end)
7692 /* Grandfather lone [:, [=, [. */
7695 const char* const t = RExC_parse++; /* skip over the c */
7698 if (UCHARAT(RExC_parse) == ']') {
7699 const char *posixcc = s + 1;
7700 RExC_parse++; /* skip over the ending ] */
7703 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
7704 const I32 skip = t - posixcc;
7706 /* Initially switch on the length of the name. */
7709 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
7710 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
7713 /* Names all of length 5. */
7714 /* alnum alpha ascii blank cntrl digit graph lower
7715 print punct space upper */
7716 /* Offset 4 gives the best switch position. */
7717 switch (posixcc[4]) {
7719 if (memEQ(posixcc, "alph", 4)) /* alpha */
7720 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
7723 if (memEQ(posixcc, "spac", 4)) /* space */
7724 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
7727 if (memEQ(posixcc, "grap", 4)) /* graph */
7728 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
7731 if (memEQ(posixcc, "asci", 4)) /* ascii */
7732 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
7735 if (memEQ(posixcc, "blan", 4)) /* blank */
7736 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
7739 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
7740 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
7743 if (memEQ(posixcc, "alnu", 4)) /* alnum */
7744 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
7747 if (memEQ(posixcc, "lowe", 4)) /* lower */
7748 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
7749 else if (memEQ(posixcc, "uppe", 4)) /* upper */
7750 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
7753 if (memEQ(posixcc, "digi", 4)) /* digit */
7754 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
7755 else if (memEQ(posixcc, "prin", 4)) /* print */
7756 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
7757 else if (memEQ(posixcc, "punc", 4)) /* punct */
7758 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
7763 if (memEQ(posixcc, "xdigit", 6))
7764 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
7768 if (namedclass == OOB_NAMEDCLASS)
7769 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
7771 assert (posixcc[skip] == ':');
7772 assert (posixcc[skip+1] == ']');
7773 } else if (!SIZE_ONLY) {
7774 /* [[=foo=]] and [[.foo.]] are still future. */
7776 /* adjust RExC_parse so the warning shows after
7778 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
7780 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7783 /* Maternal grandfather:
7784 * "[:" ending in ":" but not in ":]" */
7794 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
7798 PERL_ARGS_ASSERT_CHECKPOSIXCC;
7800 if (POSIXCC(UCHARAT(RExC_parse))) {
7801 const char *s = RExC_parse;
7802 const char c = *s++;
7806 if (*s && c == *s && s[1] == ']') {
7808 "POSIX syntax [%c %c] belongs inside character classes",
7811 /* [[=foo=]] and [[.foo.]] are still future. */
7812 if (POSIXCC_NOTYET(c)) {
7813 /* adjust RExC_parse so the error shows after
7815 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
7817 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7824 #define _C_C_T_(NAME,TEST,WORD) \
7827 ANYOF_CLASS_SET(ret, ANYOF_##NAME); \
7829 for (value = 0; value < 256; value++) \
7831 ANYOF_BITMAP_SET(ret, value); \
7836 case ANYOF_N##NAME: \
7838 ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \
7840 for (value = 0; value < 256; value++) \
7842 ANYOF_BITMAP_SET(ret, value); \
7848 #define _C_C_T_NOLOC_(NAME,TEST,WORD) \
7850 for (value = 0; value < 256; value++) \
7852 ANYOF_BITMAP_SET(ret, value); \
7856 case ANYOF_N##NAME: \
7857 for (value = 0; value < 256; value++) \
7859 ANYOF_BITMAP_SET(ret, value); \
7865 We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
7866 so that it is possible to override the option here without having to
7867 rebuild the entire core. as we are required to do if we change regcomp.h
7868 which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
7870 #if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
7871 #define BROKEN_UNICODE_CHARCLASS_MAPPINGS
7874 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
7875 #define POSIX_CC_UNI_NAME(CCNAME) CCNAME
7877 #define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME
7881 parse a class specification and produce either an ANYOF node that
7882 matches the pattern or if the pattern matches a single char only and
7883 that char is < 256 and we are case insensitive then we produce an
7888 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
7891 register UV nextvalue;
7892 register IV prevvalue = OOB_UNICODE;
7893 register IV range = 0;
7894 UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
7895 register regnode *ret;
7898 char *rangebegin = NULL;
7899 bool need_class = 0;
7902 bool optimize_invert = TRUE;
7903 AV* unicode_alternate = NULL;
7905 UV literal_endpoint = 0;
7907 UV stored = 0; /* number of chars stored in the class */
7909 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7910 case we need to change the emitted regop to an EXACT. */
7911 const char * orig_parse = RExC_parse;
7912 GET_RE_DEBUG_FLAGS_DECL;
7914 PERL_ARGS_ASSERT_REGCLASS;
7916 PERL_UNUSED_ARG(depth);
7919 DEBUG_PARSE("clas");
7921 /* Assume we are going to generate an ANYOF node. */
7922 ret = reganode(pRExC_state, ANYOF, 0);
7925 ANYOF_FLAGS(ret) = 0;
7927 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
7931 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
7935 RExC_size += ANYOF_SKIP;
7936 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
7939 RExC_emit += ANYOF_SKIP;
7941 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
7943 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
7944 ANYOF_BITMAP_ZERO(ret);
7945 listsv = newSVpvs("# comment\n");
7948 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
7950 if (!SIZE_ONLY && POSIXCC(nextvalue))
7951 checkposixcc(pRExC_state);
7953 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
7954 if (UCHARAT(RExC_parse) == ']')
7958 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
7962 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
7965 rangebegin = RExC_parse;
7967 value = utf8n_to_uvchr((U8*)RExC_parse,
7968 RExC_end - RExC_parse,
7969 &numlen, UTF8_ALLOW_DEFAULT);
7970 RExC_parse += numlen;
7973 value = UCHARAT(RExC_parse++);
7975 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
7976 if (value == '[' && POSIXCC(nextvalue))
7977 namedclass = regpposixcc(pRExC_state, value);
7978 else if (value == '\\') {
7980 value = utf8n_to_uvchr((U8*)RExC_parse,
7981 RExC_end - RExC_parse,
7982 &numlen, UTF8_ALLOW_DEFAULT);
7983 RExC_parse += numlen;
7986 value = UCHARAT(RExC_parse++);
7987 /* Some compilers cannot handle switching on 64-bit integer
7988 * values, therefore value cannot be an UV. Yes, this will
7989 * be a problem later if we want switch on Unicode.
7990 * A similar issue a little bit later when switching on
7991 * namedclass. --jhi */
7992 switch ((I32)value) {
7993 case 'w': namedclass = ANYOF_ALNUM; break;
7994 case 'W': namedclass = ANYOF_NALNUM; break;
7995 case 's': namedclass = ANYOF_SPACE; break;
7996 case 'S': namedclass = ANYOF_NSPACE; break;
7997 case 'd': namedclass = ANYOF_DIGIT; break;
7998 case 'D': namedclass = ANYOF_NDIGIT; break;
7999 case 'v': namedclass = ANYOF_VERTWS; break;
8000 case 'V': namedclass = ANYOF_NVERTWS; break;
8001 case 'h': namedclass = ANYOF_HORIZWS; break;
8002 case 'H': namedclass = ANYOF_NHORIZWS; break;
8003 case 'N': /* Handle \N{NAME} in class */
8005 /* We only pay attention to the first char of
8006 multichar strings being returned. I kinda wonder
8007 if this makes sense as it does change the behaviour
8008 from earlier versions, OTOH that behaviour was broken
8010 UV v; /* value is register so we cant & it /grrr */
8011 if (reg_namedseq(pRExC_state, &v, NULL)) {
8021 if (RExC_parse >= RExC_end)
8022 vFAIL2("Empty \\%c{}", (U8)value);
8023 if (*RExC_parse == '{') {
8024 const U8 c = (U8)value;
8025 e = strchr(RExC_parse++, '}');
8027 vFAIL2("Missing right brace on \\%c{}", c);
8028 while (isSPACE(UCHARAT(RExC_parse)))
8030 if (e == RExC_parse)
8031 vFAIL2("Empty \\%c{}", c);
8033 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
8041 if (UCHARAT(RExC_parse) == '^') {
8044 value = value == 'p' ? 'P' : 'p'; /* toggle */
8045 while (isSPACE(UCHARAT(RExC_parse))) {
8050 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
8051 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
8054 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8055 namedclass = ANYOF_MAX; /* no official name, but it's named */
8058 case 'n': value = '\n'; break;
8059 case 'r': value = '\r'; break;
8060 case 't': value = '\t'; break;
8061 case 'f': value = '\f'; break;
8062 case 'b': value = '\b'; break;
8063 case 'e': value = ASCII_TO_NATIVE('\033');break;
8064 case 'a': value = ASCII_TO_NATIVE('\007');break;
8066 if (*RExC_parse == '{') {
8067 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8068 | PERL_SCAN_DISALLOW_PREFIX;
8069 char * const e = strchr(RExC_parse++, '}');
8071 vFAIL("Missing right brace on \\x{}");
8073 numlen = e - RExC_parse;
8074 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
8078 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
8080 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
8081 RExC_parse += numlen;
8083 if (PL_encoding && value < 0x100)
8084 goto recode_encoding;
8087 value = UCHARAT(RExC_parse++);
8088 value = toCTRL(value);
8090 case '0': case '1': case '2': case '3': case '4':
8091 case '5': case '6': case '7': case '8': case '9':
8095 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
8096 RExC_parse += numlen;
8097 if (PL_encoding && value < 0x100)
8098 goto recode_encoding;
8103 SV* enc = PL_encoding;
8104 value = reg_recode((const char)(U8)value, &enc);
8105 if (!enc && SIZE_ONLY)
8106 ckWARNreg(RExC_parse,
8107 "Invalid escape in the specified encoding");
8111 if (!SIZE_ONLY && isALPHA(value))
8112 ckWARN2reg(RExC_parse,
8113 "Unrecognized escape \\%c in character class passed through",
8117 } /* end of \blah */
8123 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
8125 if (!SIZE_ONLY && !need_class)
8126 ANYOF_CLASS_ZERO(ret);
8130 /* a bad range like a-\d, a-[:digit:] ? */
8134 RExC_parse >= rangebegin ?
8135 RExC_parse - rangebegin : 0;
8136 ckWARN4reg(RExC_parse,
8137 "False [] range \"%*.*s\"",
8140 if (prevvalue < 256) {
8141 ANYOF_BITMAP_SET(ret, prevvalue);
8142 ANYOF_BITMAP_SET(ret, '-');
8145 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8146 Perl_sv_catpvf(aTHX_ listsv,
8147 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
8151 range = 0; /* this was not a true range */
8157 const char *what = NULL;
8160 if (namedclass > OOB_NAMEDCLASS)
8161 optimize_invert = FALSE;
8162 /* Possible truncation here but in some 64-bit environments
8163 * the compiler gets heartburn about switch on 64-bit values.
8164 * A similar issue a little earlier when switching on value.
8166 switch ((I32)namedclass) {
8168 case _C_C_T_(ALNUMC, isALNUMC(value), POSIX_CC_UNI_NAME("Alnum"));
8169 case _C_C_T_(ALPHA, isALPHA(value), POSIX_CC_UNI_NAME("Alpha"));
8170 case _C_C_T_(BLANK, isBLANK(value), POSIX_CC_UNI_NAME("Blank"));
8171 case _C_C_T_(CNTRL, isCNTRL(value), POSIX_CC_UNI_NAME("Cntrl"));
8172 case _C_C_T_(GRAPH, isGRAPH(value), POSIX_CC_UNI_NAME("Graph"));
8173 case _C_C_T_(LOWER, isLOWER(value), POSIX_CC_UNI_NAME("Lower"));
8174 case _C_C_T_(PRINT, isPRINT(value), POSIX_CC_UNI_NAME("Print"));
8175 case _C_C_T_(PSXSPC, isPSXSPC(value), POSIX_CC_UNI_NAME("Space"));
8176 case _C_C_T_(PUNCT, isPUNCT(value), POSIX_CC_UNI_NAME("Punct"));
8177 case _C_C_T_(UPPER, isUPPER(value), POSIX_CC_UNI_NAME("Upper"));
8178 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
8179 case _C_C_T_(ALNUM, isALNUM(value), "Word");
8180 case _C_C_T_(SPACE, isSPACE(value), "SpacePerl");
8182 case _C_C_T_(SPACE, isSPACE(value), "PerlSpace");
8183 case _C_C_T_(ALNUM, isALNUM(value), "PerlWord");
8185 case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit");
8186 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
8187 case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
8190 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
8193 for (value = 0; value < 128; value++)
8194 ANYOF_BITMAP_SET(ret, value);
8196 for (value = 0; value < 256; value++) {
8198 ANYOF_BITMAP_SET(ret, value);
8207 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
8210 for (value = 128; value < 256; value++)
8211 ANYOF_BITMAP_SET(ret, value);
8213 for (value = 0; value < 256; value++) {
8214 if (!isASCII(value))
8215 ANYOF_BITMAP_SET(ret, value);
8224 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
8226 /* consecutive digits assumed */
8227 for (value = '0'; value <= '9'; value++)
8228 ANYOF_BITMAP_SET(ret, value);
8231 what = POSIX_CC_UNI_NAME("Digit");
8235 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
8237 /* consecutive digits assumed */
8238 for (value = 0; value < '0'; value++)
8239 ANYOF_BITMAP_SET(ret, value);
8240 for (value = '9' + 1; value < 256; value++)
8241 ANYOF_BITMAP_SET(ret, value);
8244 what = POSIX_CC_UNI_NAME("Digit");
8247 /* this is to handle \p and \P */
8250 vFAIL("Invalid [::] class");
8254 /* Strings such as "+utf8::isWord\n" */
8255 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
8258 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
8261 } /* end of namedclass \blah */
8264 if (prevvalue > (IV)value) /* b-a */ {
8265 const int w = RExC_parse - rangebegin;
8266 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
8267 range = 0; /* not a valid range */
8271 prevvalue = value; /* save the beginning of the range */
8272 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
8273 RExC_parse[1] != ']') {
8276 /* a bad range like \w-, [:word:]- ? */
8277 if (namedclass > OOB_NAMEDCLASS) {
8278 if (ckWARN(WARN_REGEXP)) {
8280 RExC_parse >= rangebegin ?
8281 RExC_parse - rangebegin : 0;
8283 "False [] range \"%*.*s\"",
8287 ANYOF_BITMAP_SET(ret, '-');
8289 range = 1; /* yeah, it's a range! */
8290 continue; /* but do it the next time */
8294 /* now is the next time */
8295 /*stored += (value - prevvalue + 1);*/
8297 if (prevvalue < 256) {
8298 const IV ceilvalue = value < 256 ? value : 255;
8301 /* In EBCDIC [\x89-\x91] should include
8302 * the \x8e but [i-j] should not. */
8303 if (literal_endpoint == 2 &&
8304 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
8305 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
8307 if (isLOWER(prevvalue)) {
8308 for (i = prevvalue; i <= ceilvalue; i++)
8309 if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8311 ANYOF_BITMAP_SET(ret, i);
8314 for (i = prevvalue; i <= ceilvalue; i++)
8315 if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8317 ANYOF_BITMAP_SET(ret, i);
8323 for (i = prevvalue; i <= ceilvalue; i++) {
8324 if (!ANYOF_BITMAP_TEST(ret,i)) {
8326 ANYOF_BITMAP_SET(ret, i);
8330 if (value > 255 || UTF) {
8331 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
8332 const UV natvalue = NATIVE_TO_UNI(value);
8333 stored+=2; /* can't optimize this class */
8334 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8335 if (prevnatvalue < natvalue) { /* what about > ? */
8336 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
8337 prevnatvalue, natvalue);
8339 else if (prevnatvalue == natvalue) {
8340 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
8342 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
8344 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
8346 #ifdef EBCDIC /* RD t/uni/fold ff and 6b */
8347 if (RExC_precomp[0] == ':' &&
8348 RExC_precomp[1] == '[' &&
8349 (f == 0xDF || f == 0x92)) {
8350 f = NATIVE_TO_UNI(f);
8353 /* If folding and foldable and a single
8354 * character, insert also the folded version
8355 * to the charclass. */
8357 #ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
8358 if ((RExC_precomp[0] == ':' &&
8359 RExC_precomp[1] == '[' &&
8361 (value == 0xFB05 || value == 0xFB06))) ?
8362 foldlen == ((STRLEN)UNISKIP(f) - 1) :
8363 foldlen == (STRLEN)UNISKIP(f) )
8365 if (foldlen == (STRLEN)UNISKIP(f))
8367 Perl_sv_catpvf(aTHX_ listsv,
8370 /* Any multicharacter foldings
8371 * require the following transform:
8372 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
8373 * where E folds into "pq" and F folds
8374 * into "rst", all other characters
8375 * fold to single characters. We save
8376 * away these multicharacter foldings,
8377 * to be later saved as part of the
8378 * additional "s" data. */
8381 if (!unicode_alternate)
8382 unicode_alternate = newAV();
8383 sv = newSVpvn_utf8((char*)foldbuf, foldlen,
8385 av_push(unicode_alternate, sv);
8389 /* If folding and the value is one of the Greek
8390 * sigmas insert a few more sigmas to make the
8391 * folding rules of the sigmas to work right.
8392 * Note that not all the possible combinations
8393 * are handled here: some of them are handled
8394 * by the standard folding rules, and some of
8395 * them (literal or EXACTF cases) are handled
8396 * during runtime in regexec.c:S_find_byclass(). */
8397 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
8398 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8399 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
8400 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8401 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8403 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
8404 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8405 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8410 literal_endpoint = 0;
8414 range = 0; /* this range (if it was one) is done now */
8418 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
8420 RExC_size += ANYOF_CLASS_ADD_SKIP;
8422 RExC_emit += ANYOF_CLASS_ADD_SKIP;
8428 /****** !SIZE_ONLY AFTER HERE *********/
8430 if( stored == 1 && (value < 128 || (value < 256 && !UTF))
8431 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
8433 /* optimize single char class to an EXACT node
8434 but *only* when its not a UTF/high char */
8435 const char * cur_parse= RExC_parse;
8436 RExC_emit = (regnode *)orig_emit;
8437 RExC_parse = (char *)orig_parse;
8438 ret = reg_node(pRExC_state,
8439 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
8440 RExC_parse = (char *)cur_parse;
8441 *STRING(ret)= (char)value;
8443 RExC_emit += STR_SZ(1);
8444 SvREFCNT_dec(listsv);
8447 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
8448 if ( /* If the only flag is folding (plus possibly inversion). */
8449 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
8451 for (value = 0; value < 256; ++value) {
8452 if (ANYOF_BITMAP_TEST(ret, value)) {
8453 UV fold = PL_fold[value];
8456 ANYOF_BITMAP_SET(ret, fold);
8459 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
8462 /* optimize inverted simple patterns (e.g. [^a-z]) */
8463 if (optimize_invert &&
8464 /* If the only flag is inversion. */
8465 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
8466 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
8467 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
8468 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
8471 AV * const av = newAV();
8473 /* The 0th element stores the character class description
8474 * in its textual form: used later (regexec.c:Perl_regclass_swash())
8475 * to initialize the appropriate swash (which gets stored in
8476 * the 1st element), and also useful for dumping the regnode.
8477 * The 2nd element stores the multicharacter foldings,
8478 * used later (regexec.c:S_reginclass()). */
8479 av_store(av, 0, listsv);
8480 av_store(av, 1, NULL);
8481 av_store(av, 2, MUTABLE_SV(unicode_alternate));
8482 rv = newRV_noinc(MUTABLE_SV(av));
8483 n = add_data(pRExC_state, 1, "s");
8484 RExC_rxi->data->data[n] = (void*)rv;
8492 /* reg_skipcomment()
8494 Absorbs an /x style # comments from the input stream.
8495 Returns true if there is more text remaining in the stream.
8496 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
8497 terminates the pattern without including a newline.
8499 Note its the callers responsibility to ensure that we are
8505 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
8509 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
8511 while (RExC_parse < RExC_end)
8512 if (*RExC_parse++ == '\n') {
8517 /* we ran off the end of the pattern without ending
8518 the comment, so we have to add an \n when wrapping */
8519 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
8527 Advance that parse position, and optionall absorbs
8528 "whitespace" from the inputstream.
8530 Without /x "whitespace" means (?#...) style comments only,
8531 with /x this means (?#...) and # comments and whitespace proper.
8533 Returns the RExC_parse point from BEFORE the scan occurs.
8535 This is the /x friendly way of saying RExC_parse++.
8539 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
8541 char* const retval = RExC_parse++;
8543 PERL_ARGS_ASSERT_NEXTCHAR;
8546 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
8547 RExC_parse[2] == '#') {
8548 while (*RExC_parse != ')') {
8549 if (RExC_parse == RExC_end)
8550 FAIL("Sequence (?#... not terminated");
8556 if (RExC_flags & RXf_PMf_EXTENDED) {
8557 if (isSPACE(*RExC_parse)) {
8561 else if (*RExC_parse == '#') {
8562 if ( reg_skipcomment( pRExC_state ) )
8571 - reg_node - emit a node
8573 STATIC regnode * /* Location. */
8574 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
8577 register regnode *ptr;
8578 regnode * const ret = RExC_emit;
8579 GET_RE_DEBUG_FLAGS_DECL;
8581 PERL_ARGS_ASSERT_REG_NODE;
8584 SIZE_ALIGN(RExC_size);
8588 if (RExC_emit >= RExC_emit_bound)
8589 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8591 NODE_ALIGN_FILL(ret);
8593 FILL_ADVANCE_NODE(ptr, op);
8594 #ifdef RE_TRACK_PATTERN_OFFSETS
8595 if (RExC_offsets) { /* MJD */
8596 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
8597 "reg_node", __LINE__,
8599 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
8600 ? "Overwriting end of array!\n" : "OK",
8601 (UV)(RExC_emit - RExC_emit_start),
8602 (UV)(RExC_parse - RExC_start),
8603 (UV)RExC_offsets[0]));
8604 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
8612 - reganode - emit a node with an argument
8614 STATIC regnode * /* Location. */
8615 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
8618 register regnode *ptr;
8619 regnode * const ret = RExC_emit;
8620 GET_RE_DEBUG_FLAGS_DECL;
8622 PERL_ARGS_ASSERT_REGANODE;
8625 SIZE_ALIGN(RExC_size);
8630 assert(2==regarglen[op]+1);
8632 Anything larger than this has to allocate the extra amount.
8633 If we changed this to be:
8635 RExC_size += (1 + regarglen[op]);
8637 then it wouldn't matter. Its not clear what side effect
8638 might come from that so its not done so far.
8643 if (RExC_emit >= RExC_emit_bound)
8644 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8646 NODE_ALIGN_FILL(ret);
8648 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
8649 #ifdef RE_TRACK_PATTERN_OFFSETS
8650 if (RExC_offsets) { /* MJD */
8651 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
8655 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
8656 "Overwriting end of array!\n" : "OK",
8657 (UV)(RExC_emit - RExC_emit_start),
8658 (UV)(RExC_parse - RExC_start),
8659 (UV)RExC_offsets[0]));
8660 Set_Cur_Node_Offset;
8668 - reguni - emit (if appropriate) a Unicode character
8671 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
8675 PERL_ARGS_ASSERT_REGUNI;
8677 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
8681 - reginsert - insert an operator in front of already-emitted operand
8683 * Means relocating the operand.
8686 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
8689 register regnode *src;
8690 register regnode *dst;
8691 register regnode *place;
8692 const int offset = regarglen[(U8)op];
8693 const int size = NODE_STEP_REGNODE + offset;
8694 GET_RE_DEBUG_FLAGS_DECL;
8696 PERL_ARGS_ASSERT_REGINSERT;
8697 PERL_UNUSED_ARG(depth);
8698 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
8699 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
8708 if (RExC_open_parens) {
8710 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
8711 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
8712 if ( RExC_open_parens[paren] >= opnd ) {
8713 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
8714 RExC_open_parens[paren] += size;
8716 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
8718 if ( RExC_close_parens[paren] >= opnd ) {
8719 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
8720 RExC_close_parens[paren] += size;
8722 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
8727 while (src > opnd) {
8728 StructCopy(--src, --dst, regnode);
8729 #ifdef RE_TRACK_PATTERN_OFFSETS
8730 if (RExC_offsets) { /* MJD 20010112 */
8731 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
8735 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
8736 ? "Overwriting end of array!\n" : "OK",
8737 (UV)(src - RExC_emit_start),
8738 (UV)(dst - RExC_emit_start),
8739 (UV)RExC_offsets[0]));
8740 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
8741 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
8747 place = opnd; /* Op node, where operand used to be. */
8748 #ifdef RE_TRACK_PATTERN_OFFSETS
8749 if (RExC_offsets) { /* MJD */
8750 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
8754 (UV)(place - RExC_emit_start) > RExC_offsets[0]
8755 ? "Overwriting end of array!\n" : "OK",
8756 (UV)(place - RExC_emit_start),
8757 (UV)(RExC_parse - RExC_start),
8758 (UV)RExC_offsets[0]));
8759 Set_Node_Offset(place, RExC_parse);
8760 Set_Node_Length(place, 1);
8763 src = NEXTOPER(place);
8764 FILL_ADVANCE_NODE(place, op);
8765 Zero(src, offset, regnode);
8769 - regtail - set the next-pointer at the end of a node chain of p to val.
8770 - SEE ALSO: regtail_study
8772 /* TODO: All three parms should be const */
8774 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
8777 register regnode *scan;
8778 GET_RE_DEBUG_FLAGS_DECL;
8780 PERL_ARGS_ASSERT_REGTAIL;
8782 PERL_UNUSED_ARG(depth);
8788 /* Find last node. */
8791 regnode * const temp = regnext(scan);
8793 SV * const mysv=sv_newmortal();
8794 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
8795 regprop(RExC_rx, mysv, scan);
8796 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
8797 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
8798 (temp == NULL ? "->" : ""),
8799 (temp == NULL ? PL_reg_name[OP(val)] : "")
8807 if (reg_off_by_arg[OP(scan)]) {
8808 ARG_SET(scan, val - scan);
8811 NEXT_OFF(scan) = val - scan;
8817 - regtail_study - set the next-pointer at the end of a node chain of p to val.
8818 - Look for optimizable sequences at the same time.
8819 - currently only looks for EXACT chains.
8821 This is expermental code. The idea is to use this routine to perform
8822 in place optimizations on branches and groups as they are constructed,
8823 with the long term intention of removing optimization from study_chunk so
8824 that it is purely analytical.
8826 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
8827 to control which is which.
8830 /* TODO: All four parms should be const */
8833 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
8836 register regnode *scan;
8838 #ifdef EXPERIMENTAL_INPLACESCAN
8841 GET_RE_DEBUG_FLAGS_DECL;
8843 PERL_ARGS_ASSERT_REGTAIL_STUDY;
8849 /* Find last node. */
8853 regnode * const temp = regnext(scan);
8854 #ifdef EXPERIMENTAL_INPLACESCAN
8855 if (PL_regkind[OP(scan)] == EXACT)
8856 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
8864 if( exact == PSEUDO )
8866 else if ( exact != OP(scan) )
8875 SV * const mysv=sv_newmortal();
8876 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
8877 regprop(RExC_rx, mysv, scan);
8878 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
8879 SvPV_nolen_const(mysv),
8881 PL_reg_name[exact]);
8888 SV * const mysv_val=sv_newmortal();
8889 DEBUG_PARSE_MSG("");
8890 regprop(RExC_rx, mysv_val, val);
8891 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
8892 SvPV_nolen_const(mysv_val),
8893 (IV)REG_NODE_NUM(val),
8897 if (reg_off_by_arg[OP(scan)]) {
8898 ARG_SET(scan, val - scan);
8901 NEXT_OFF(scan) = val - scan;
8909 - regcurly - a little FSA that accepts {\d+,?\d*}
8912 S_regcurly(register const char *s)
8914 PERL_ARGS_ASSERT_REGCURLY;
8933 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
8937 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
8942 for (bit=0; bit<32; bit++) {
8943 if (flags & (1<<bit)) {
8945 PerlIO_printf(Perl_debug_log, "%s",lead);
8946 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
8951 PerlIO_printf(Perl_debug_log, "\n");
8953 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
8959 Perl_regdump(pTHX_ const regexp *r)
8963 SV * const sv = sv_newmortal();
8964 SV *dsv= sv_newmortal();
8966 GET_RE_DEBUG_FLAGS_DECL;
8968 PERL_ARGS_ASSERT_REGDUMP;
8970 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
8972 /* Header fields of interest. */
8973 if (r->anchored_substr) {
8974 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
8975 RE_SV_DUMPLEN(r->anchored_substr), 30);
8976 PerlIO_printf(Perl_debug_log,
8977 "anchored %s%s at %"IVdf" ",
8978 s, RE_SV_TAIL(r->anchored_substr),
8979 (IV)r->anchored_offset);
8980 } else if (r->anchored_utf8) {
8981 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
8982 RE_SV_DUMPLEN(r->anchored_utf8), 30);
8983 PerlIO_printf(Perl_debug_log,
8984 "anchored utf8 %s%s at %"IVdf" ",
8985 s, RE_SV_TAIL(r->anchored_utf8),
8986 (IV)r->anchored_offset);
8988 if (r->float_substr) {
8989 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
8990 RE_SV_DUMPLEN(r->float_substr), 30);
8991 PerlIO_printf(Perl_debug_log,
8992 "floating %s%s at %"IVdf"..%"UVuf" ",
8993 s, RE_SV_TAIL(r->float_substr),
8994 (IV)r->float_min_offset, (UV)r->float_max_offset);
8995 } else if (r->float_utf8) {
8996 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
8997 RE_SV_DUMPLEN(r->float_utf8), 30);
8998 PerlIO_printf(Perl_debug_log,
8999 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
9000 s, RE_SV_TAIL(r->float_utf8),
9001 (IV)r->float_min_offset, (UV)r->float_max_offset);
9003 if (r->check_substr || r->check_utf8)
9004 PerlIO_printf(Perl_debug_log,
9006 (r->check_substr == r->float_substr
9007 && r->check_utf8 == r->float_utf8
9008 ? "(checking floating" : "(checking anchored"));
9009 if (r->extflags & RXf_NOSCAN)
9010 PerlIO_printf(Perl_debug_log, " noscan");
9011 if (r->extflags & RXf_CHECK_ALL)
9012 PerlIO_printf(Perl_debug_log, " isall");
9013 if (r->check_substr || r->check_utf8)
9014 PerlIO_printf(Perl_debug_log, ") ");
9016 if (ri->regstclass) {
9017 regprop(r, sv, ri->regstclass);
9018 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
9020 if (r->extflags & RXf_ANCH) {
9021 PerlIO_printf(Perl_debug_log, "anchored");
9022 if (r->extflags & RXf_ANCH_BOL)
9023 PerlIO_printf(Perl_debug_log, "(BOL)");
9024 if (r->extflags & RXf_ANCH_MBOL)
9025 PerlIO_printf(Perl_debug_log, "(MBOL)");
9026 if (r->extflags & RXf_ANCH_SBOL)
9027 PerlIO_printf(Perl_debug_log, "(SBOL)");
9028 if (r->extflags & RXf_ANCH_GPOS)
9029 PerlIO_printf(Perl_debug_log, "(GPOS)");
9030 PerlIO_putc(Perl_debug_log, ' ');
9032 if (r->extflags & RXf_GPOS_SEEN)
9033 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
9034 if (r->intflags & PREGf_SKIP)
9035 PerlIO_printf(Perl_debug_log, "plus ");
9036 if (r->intflags & PREGf_IMPLICIT)
9037 PerlIO_printf(Perl_debug_log, "implicit ");
9038 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
9039 if (r->extflags & RXf_EVAL_SEEN)
9040 PerlIO_printf(Perl_debug_log, "with eval ");
9041 PerlIO_printf(Perl_debug_log, "\n");
9042 DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
9044 PERL_ARGS_ASSERT_REGDUMP;
9045 PERL_UNUSED_CONTEXT;
9047 #endif /* DEBUGGING */
9051 - regprop - printable representation of opcode
9053 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
9056 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
9057 if (flags & ANYOF_INVERT) \
9058 /*make sure the invert info is in each */ \
9059 sv_catpvs(sv, "^"); \
9065 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
9070 RXi_GET_DECL(prog,progi);
9071 GET_RE_DEBUG_FLAGS_DECL;
9073 PERL_ARGS_ASSERT_REGPROP;
9077 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
9078 /* It would be nice to FAIL() here, but this may be called from
9079 regexec.c, and it would be hard to supply pRExC_state. */
9080 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
9081 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
9083 k = PL_regkind[OP(o)];
9087 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
9088 * is a crude hack but it may be the best for now since
9089 * we have no flag "this EXACTish node was UTF-8"
9091 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
9092 PERL_PV_ESCAPE_UNI_DETECT |
9093 PERL_PV_PRETTY_ELLIPSES |
9094 PERL_PV_PRETTY_LTGT |
9095 PERL_PV_PRETTY_NOCLEAR
9097 } else if (k == TRIE) {
9098 /* print the details of the trie in dumpuntil instead, as
9099 * progi->data isn't available here */
9100 const char op = OP(o);
9101 const U32 n = ARG(o);
9102 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
9103 (reg_ac_data *)progi->data->data[n] :
9105 const reg_trie_data * const trie
9106 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
9108 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
9109 DEBUG_TRIE_COMPILE_r(
9110 Perl_sv_catpvf(aTHX_ sv,
9111 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
9112 (UV)trie->startstate,
9113 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
9114 (UV)trie->wordcount,
9117 (UV)TRIE_CHARCOUNT(trie),
9118 (UV)trie->uniquecharcount
9121 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
9123 int rangestart = -1;
9124 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
9126 for (i = 0; i <= 256; i++) {
9127 if (i < 256 && BITMAP_TEST(bitmap,i)) {
9128 if (rangestart == -1)
9130 } else if (rangestart != -1) {
9131 if (i <= rangestart + 3)
9132 for (; rangestart < i; rangestart++)
9133 put_byte(sv, rangestart);
9135 put_byte(sv, rangestart);
9137 put_byte(sv, i - 1);
9145 } else if (k == CURLY) {
9146 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
9147 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
9148 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
9150 else if (k == WHILEM && o->flags) /* Ordinal/of */
9151 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9152 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
9153 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9154 if ( RXp_PAREN_NAMES(prog) ) {
9155 if ( k != REF || OP(o) < NREF) {
9156 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
9157 SV **name= av_fetch(list, ARG(o), 0 );
9159 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9162 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
9163 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
9164 I32 *nums=(I32*)SvPVX(sv_dat);
9165 SV **name= av_fetch(list, nums[0], 0 );
9168 for ( n=0; n<SvIVX(sv_dat); n++ ) {
9169 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
9170 (n ? "," : ""), (IV)nums[n]);
9172 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9176 } else if (k == GOSUB)
9177 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
9178 else if (k == VERB) {
9180 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
9181 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
9182 } else if (k == LOGICAL)
9183 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
9184 else if (k == FOLDCHAR)
9185 Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
9186 else if (k == ANYOF) {
9187 int i, rangestart = -1;
9188 const U8 flags = ANYOF_FLAGS(o);
9191 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
9192 static const char * const anyofs[] = {
9225 if (flags & ANYOF_LOCALE)
9226 sv_catpvs(sv, "{loc}");
9227 if (flags & ANYOF_FOLD)
9228 sv_catpvs(sv, "{i}");
9229 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
9230 if (flags & ANYOF_INVERT)
9233 /* output what the standard cp 0-255 bitmap matches */
9234 for (i = 0; i <= 256; i++) {
9235 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
9236 if (rangestart == -1)
9238 } else if (rangestart != -1) {
9239 if (i <= rangestart + 3)
9240 for (; rangestart < i; rangestart++)
9241 put_byte(sv, rangestart);
9243 put_byte(sv, rangestart);
9245 put_byte(sv, i - 1);
9252 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9253 /* output any special charclass tests (used mostly under use locale) */
9254 if (o->flags & ANYOF_CLASS)
9255 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
9256 if (ANYOF_CLASS_TEST(o,i)) {
9257 sv_catpv(sv, anyofs[i]);
9261 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9263 /* output information about the unicode matching */
9264 if (flags & ANYOF_UNICODE)
9265 sv_catpvs(sv, "{unicode}");
9266 else if (flags & ANYOF_UNICODE_ALL)
9267 sv_catpvs(sv, "{unicode_all}");
9271 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
9275 U8 s[UTF8_MAXBYTES_CASE+1];
9277 for (i = 0; i <= 256; i++) { /* just the first 256 */
9278 uvchr_to_utf8(s, i);
9280 if (i < 256 && swash_fetch(sw, s, TRUE)) {
9281 if (rangestart == -1)
9283 } else if (rangestart != -1) {
9284 if (i <= rangestart + 3)
9285 for (; rangestart < i; rangestart++) {
9286 const U8 * const e = uvchr_to_utf8(s,rangestart);
9288 for(p = s; p < e; p++)
9292 const U8 *e = uvchr_to_utf8(s,rangestart);
9294 for (p = s; p < e; p++)
9297 e = uvchr_to_utf8(s, i-1);
9298 for (p = s; p < e; p++)
9305 sv_catpvs(sv, "..."); /* et cetera */
9309 char *s = savesvpv(lv);
9310 char * const origs = s;
9312 while (*s && *s != '\n')
9316 const char * const t = ++s;
9334 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
9336 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
9337 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
9339 PERL_UNUSED_CONTEXT;
9340 PERL_UNUSED_ARG(sv);
9342 PERL_UNUSED_ARG(prog);
9343 #endif /* DEBUGGING */
9347 Perl_re_intuit_string(pTHX_ REGEXP * const r)
9348 { /* Assume that RE_INTUIT is set */
9350 struct regexp *const prog = (struct regexp *)SvANY(r);
9351 GET_RE_DEBUG_FLAGS_DECL;
9353 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
9354 PERL_UNUSED_CONTEXT;
9358 const char * const s = SvPV_nolen_const(prog->check_substr
9359 ? prog->check_substr : prog->check_utf8);
9361 if (!PL_colorset) reginitcolors();
9362 PerlIO_printf(Perl_debug_log,
9363 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
9365 prog->check_substr ? "" : "utf8 ",
9366 PL_colors[5],PL_colors[0],
9369 (strlen(s) > 60 ? "..." : ""));
9372 return prog->check_substr ? prog->check_substr : prog->check_utf8;
9378 handles refcounting and freeing the perl core regexp structure. When
9379 it is necessary to actually free the structure the first thing it
9380 does is call the 'free' method of the regexp_engine associated to to
9381 the regexp, allowing the handling of the void *pprivate; member
9382 first. (This routine is not overridable by extensions, which is why
9383 the extensions free is called first.)
9385 See regdupe and regdupe_internal if you change anything here.
9387 #ifndef PERL_IN_XSUB_RE
9389 Perl_pregfree(pTHX_ REGEXP *r)
9395 Perl_pregfree2(pTHX_ REGEXP *rx)
9398 struct regexp *const r = (struct regexp *)SvANY(rx);
9399 GET_RE_DEBUG_FLAGS_DECL;
9401 PERL_ARGS_ASSERT_PREGFREE2;
9404 ReREFCNT_dec(r->mother_re);
9406 CALLREGFREE_PVT(rx); /* free the private data */
9407 SvREFCNT_dec(RXp_PAREN_NAMES(r));
9410 SvREFCNT_dec(r->anchored_substr);
9411 SvREFCNT_dec(r->anchored_utf8);
9412 SvREFCNT_dec(r->float_substr);
9413 SvREFCNT_dec(r->float_utf8);
9414 Safefree(r->substrs);
9416 RX_MATCH_COPY_FREE(rx);
9417 #ifdef PERL_OLD_COPY_ON_WRITE
9418 SvREFCNT_dec(r->saved_copy);
9425 This is a hacky workaround to the structural issue of match results
9426 being stored in the regexp structure which is in turn stored in
9427 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
9428 could be PL_curpm in multiple contexts, and could require multiple
9429 result sets being associated with the pattern simultaneously, such
9430 as when doing a recursive match with (??{$qr})
9432 The solution is to make a lightweight copy of the regexp structure
9433 when a qr// is returned from the code executed by (??{$qr}) this
9434 lightweight copy doesnt actually own any of its data except for
9435 the starp/end and the actual regexp structure itself.
9441 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
9444 struct regexp *const r = (struct regexp *)SvANY(rx);
9445 register const I32 npar = r->nparens+1;
9447 PERL_ARGS_ASSERT_REG_TEMP_COPY;
9450 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
9451 ret = (struct regexp *)SvANY(ret_x);
9453 (void)ReREFCNT_inc(rx);
9454 /* We can take advantage of the existing "copied buffer" mechanism in SVs
9455 by pointing directly at the buffer, but flagging that the allocated
9456 space in the copy is zero. As we've just done a struct copy, it's now
9457 a case of zero-ing that, rather than copying the current length. */
9458 SvPV_set(ret_x, RX_WRAPPED(rx));
9459 SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
9460 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
9461 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
9462 SvLEN_set(ret_x, 0);
9463 SvSTASH_set(ret_x, NULL);
9464 Newx(ret->offs, npar, regexp_paren_pair);
9465 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
9467 Newx(ret->substrs, 1, struct reg_substr_data);
9468 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9470 SvREFCNT_inc_void(ret->anchored_substr);
9471 SvREFCNT_inc_void(ret->anchored_utf8);
9472 SvREFCNT_inc_void(ret->float_substr);
9473 SvREFCNT_inc_void(ret->float_utf8);
9475 /* check_substr and check_utf8, if non-NULL, point to either their
9476 anchored or float namesakes, and don't hold a second reference. */
9478 RX_MATCH_COPIED_off(ret_x);
9479 #ifdef PERL_OLD_COPY_ON_WRITE
9480 ret->saved_copy = NULL;
9482 ret->mother_re = rx;
9488 /* regfree_internal()
9490 Free the private data in a regexp. This is overloadable by
9491 extensions. Perl takes care of the regexp structure in pregfree(),
9492 this covers the *pprivate pointer which technically perldoesnt
9493 know about, however of course we have to handle the
9494 regexp_internal structure when no extension is in use.
9496 Note this is called before freeing anything in the regexp
9501 Perl_regfree_internal(pTHX_ REGEXP * const rx)
9504 struct regexp *const r = (struct regexp *)SvANY(rx);
9506 GET_RE_DEBUG_FLAGS_DECL;
9508 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
9514 SV *dsv= sv_newmortal();
9515 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
9516 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
9517 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
9518 PL_colors[4],PL_colors[5],s);
9521 #ifdef RE_TRACK_PATTERN_OFFSETS
9523 Safefree(ri->u.offsets); /* 20010421 MJD */
9526 int n = ri->data->count;
9527 PAD* new_comppad = NULL;
9532 /* If you add a ->what type here, update the comment in regcomp.h */
9533 switch (ri->data->what[n]) {
9537 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
9540 Safefree(ri->data->data[n]);
9543 new_comppad = MUTABLE_AV(ri->data->data[n]);
9546 if (new_comppad == NULL)
9547 Perl_croak(aTHX_ "panic: pregfree comppad");
9548 PAD_SAVE_LOCAL(old_comppad,
9549 /* Watch out for global destruction's random ordering. */
9550 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
9553 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
9556 op_free((OP_4tree*)ri->data->data[n]);
9558 PAD_RESTORE_LOCAL(old_comppad);
9559 SvREFCNT_dec(MUTABLE_SV(new_comppad));
9565 { /* Aho Corasick add-on structure for a trie node.
9566 Used in stclass optimization only */
9568 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
9570 refcount = --aho->refcount;
9573 PerlMemShared_free(aho->states);
9574 PerlMemShared_free(aho->fail);
9575 /* do this last!!!! */
9576 PerlMemShared_free(ri->data->data[n]);
9577 PerlMemShared_free(ri->regstclass);
9583 /* trie structure. */
9585 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
9587 refcount = --trie->refcount;
9590 PerlMemShared_free(trie->charmap);
9591 PerlMemShared_free(trie->states);
9592 PerlMemShared_free(trie->trans);
9594 PerlMemShared_free(trie->bitmap);
9596 PerlMemShared_free(trie->wordlen);
9598 PerlMemShared_free(trie->jump);
9600 PerlMemShared_free(trie->nextword);
9601 /* do this last!!!! */
9602 PerlMemShared_free(ri->data->data[n]);
9607 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
9610 Safefree(ri->data->what);
9617 #define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t))
9618 #define av_dup_inc(s,t) MUTABLE_AV(SvREFCNT_inc(sv_dup((const SV *)s,t)))
9619 #define hv_dup_inc(s,t) MUTABLE_HV(SvREFCNT_inc(sv_dup((const SV *)s,t)))
9620 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
9623 re_dup - duplicate a regexp.
9625 This routine is expected to clone a given regexp structure. It is only
9626 compiled under USE_ITHREADS.
9628 After all of the core data stored in struct regexp is duplicated
9629 the regexp_engine.dupe method is used to copy any private data
9630 stored in the *pprivate pointer. This allows extensions to handle
9631 any duplication it needs to do.
9633 See pregfree() and regfree_internal() if you change anything here.
9635 #if defined(USE_ITHREADS)
9636 #ifndef PERL_IN_XSUB_RE
9638 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
9642 const struct regexp *r = (const struct regexp *)SvANY(sstr);
9643 struct regexp *ret = (struct regexp *)SvANY(dstr);
9645 PERL_ARGS_ASSERT_RE_DUP_GUTS;
9647 npar = r->nparens+1;
9648 Newx(ret->offs, npar, regexp_paren_pair);
9649 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
9651 /* no need to copy these */
9652 Newx(ret->swap, npar, regexp_paren_pair);
9656 /* Do it this way to avoid reading from *r after the StructCopy().
9657 That way, if any of the sv_dup_inc()s dislodge *r from the L1
9658 cache, it doesn't matter. */
9659 const bool anchored = r->check_substr
9660 ? r->check_substr == r->anchored_substr
9661 : r->check_utf8 == r->anchored_utf8;
9662 Newx(ret->substrs, 1, struct reg_substr_data);
9663 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9665 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
9666 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
9667 ret->float_substr = sv_dup_inc(ret->float_substr, param);
9668 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
9670 /* check_substr and check_utf8, if non-NULL, point to either their
9671 anchored or float namesakes, and don't hold a second reference. */
9673 if (ret->check_substr) {
9675 assert(r->check_utf8 == r->anchored_utf8);
9676 ret->check_substr = ret->anchored_substr;
9677 ret->check_utf8 = ret->anchored_utf8;
9679 assert(r->check_substr == r->float_substr);
9680 assert(r->check_utf8 == r->float_utf8);
9681 ret->check_substr = ret->float_substr;
9682 ret->check_utf8 = ret->float_utf8;
9684 } else if (ret->check_utf8) {
9686 ret->check_utf8 = ret->anchored_utf8;
9688 ret->check_utf8 = ret->float_utf8;
9693 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
9696 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
9698 if (RX_MATCH_COPIED(dstr))
9699 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
9702 #ifdef PERL_OLD_COPY_ON_WRITE
9703 ret->saved_copy = NULL;
9706 if (ret->mother_re) {
9707 if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
9708 /* Our storage points directly to our mother regexp, but that's
9709 1: a buffer in a different thread
9710 2: something we no longer hold a reference on
9711 so we need to copy it locally. */
9712 /* Note we need to sue SvCUR() on our mother_re, because it, in
9713 turn, may well be pointing to its own mother_re. */
9714 SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
9715 SvCUR(ret->mother_re)+1));
9716 SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
9718 ret->mother_re = NULL;
9722 #endif /* PERL_IN_XSUB_RE */
9727 This is the internal complement to regdupe() which is used to copy
9728 the structure pointed to by the *pprivate pointer in the regexp.
9729 This is the core version of the extension overridable cloning hook.
9730 The regexp structure being duplicated will be copied by perl prior
9731 to this and will be provided as the regexp *r argument, however
9732 with the /old/ structures pprivate pointer value. Thus this routine
9733 may override any copying normally done by perl.
9735 It returns a pointer to the new regexp_internal structure.
9739 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
9742 struct regexp *const r = (struct regexp *)SvANY(rx);
9743 regexp_internal *reti;
9747 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
9749 npar = r->nparens+1;
9752 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
9753 Copy(ri->program, reti->program, len+1, regnode);
9756 reti->regstclass = NULL;
9760 const int count = ri->data->count;
9763 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
9764 char, struct reg_data);
9765 Newx(d->what, count, U8);
9768 for (i = 0; i < count; i++) {
9769 d->what[i] = ri->data->what[i];
9770 switch (d->what[i]) {
9771 /* legal options are one of: sSfpontTu
9772 see also regcomp.h and pregfree() */
9775 case 'p': /* actually an AV, but the dup function is identical. */
9776 case 'u': /* actually an HV, but the dup function is identical. */
9777 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
9780 /* This is cheating. */
9781 Newx(d->data[i], 1, struct regnode_charclass_class);
9782 StructCopy(ri->data->data[i], d->data[i],
9783 struct regnode_charclass_class);
9784 reti->regstclass = (regnode*)d->data[i];
9787 /* Compiled op trees are readonly and in shared memory,
9788 and can thus be shared without duplication. */
9790 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
9794 /* Trie stclasses are readonly and can thus be shared
9795 * without duplication. We free the stclass in pregfree
9796 * when the corresponding reg_ac_data struct is freed.
9798 reti->regstclass= ri->regstclass;
9802 ((reg_trie_data*)ri->data->data[i])->refcount++;
9806 d->data[i] = ri->data->data[i];
9809 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
9818 reti->name_list_idx = ri->name_list_idx;
9820 #ifdef RE_TRACK_PATTERN_OFFSETS
9821 if (ri->u.offsets) {
9822 Newx(reti->u.offsets, 2*len+1, U32);
9823 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
9826 SetProgLen(reti,len);
9832 #endif /* USE_ITHREADS */
9834 #ifndef PERL_IN_XSUB_RE
9837 - regnext - dig the "next" pointer out of a node
9840 Perl_regnext(pTHX_ register regnode *p)
9843 register I32 offset;
9848 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
9857 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
9860 STRLEN l1 = strlen(pat1);
9861 STRLEN l2 = strlen(pat2);
9864 const char *message;
9866 PERL_ARGS_ASSERT_RE_CROAK2;
9872 Copy(pat1, buf, l1 , char);
9873 Copy(pat2, buf + l1, l2 , char);
9874 buf[l1 + l2] = '\n';
9875 buf[l1 + l2 + 1] = '\0';
9877 /* ANSI variant takes additional second argument */
9878 va_start(args, pat2);
9882 msv = vmess(buf, &args);
9884 message = SvPV_const(msv,l1);
9887 Copy(message, buf, l1 , char);
9888 buf[l1-1] = '\0'; /* Overwrite \n */
9889 Perl_croak(aTHX_ "%s", buf);
9892 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
9894 #ifndef PERL_IN_XSUB_RE
9896 Perl_save_re_context(pTHX)
9900 struct re_save_state *state;
9902 SAVEVPTR(PL_curcop);
9903 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
9905 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
9906 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
9907 SSPUSHINT(SAVEt_RE_STATE);
9909 Copy(&PL_reg_state, state, 1, struct re_save_state);
9911 PL_reg_start_tmp = 0;
9912 PL_reg_start_tmpl = 0;
9913 PL_reg_oldsaved = NULL;
9914 PL_reg_oldsavedlen = 0;
9916 PL_reg_leftiter = 0;
9917 PL_reg_poscache = NULL;
9918 PL_reg_poscache_size = 0;
9919 #ifdef PERL_OLD_COPY_ON_WRITE
9923 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
9925 const REGEXP * const rx = PM_GETRE(PL_curpm);
9928 for (i = 1; i <= RX_NPARENS(rx); i++) {
9929 char digits[TYPE_CHARS(long)];
9930 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
9931 GV *const *const gvp
9932 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
9935 GV * const gv = *gvp;
9936 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
9946 clear_re(pTHX_ void *r)
9949 ReREFCNT_dec((REGEXP *)r);
9955 S_put_byte(pTHX_ SV *sv, int c)
9957 PERL_ARGS_ASSERT_PUT_BYTE;
9959 /* Our definition of isPRINT() ignores locales, so only bytes that are
9960 not part of UTF-8 are considered printable. I assume that the same
9961 holds for UTF-EBCDIC.
9962 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
9963 which Wikipedia says:
9965 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
9966 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
9967 identical, to the ASCII delete (DEL) or rubout control character.
9968 ) So the old condition can be simplified to !isPRINT(c) */
9970 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
9972 const char string = c;
9973 if (c == '-' || c == ']' || c == '\\' || c == '^')
9974 sv_catpvs(sv, "\\");
9975 sv_catpvn(sv, &string, 1);
9980 #define CLEAR_OPTSTART \
9981 if (optstart) STMT_START { \
9982 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
9986 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
9988 STATIC const regnode *
9989 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
9990 const regnode *last, const regnode *plast,
9991 SV* sv, I32 indent, U32 depth)
9994 register U8 op = PSEUDO; /* Arbitrary non-END op. */
9995 register const regnode *next;
9996 const regnode *optstart= NULL;
9999 GET_RE_DEBUG_FLAGS_DECL;
10001 PERL_ARGS_ASSERT_DUMPUNTIL;
10003 #ifdef DEBUG_DUMPUNTIL
10004 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
10005 last ? last-start : 0,plast ? plast-start : 0);
10008 if (plast && plast < last)
10011 while (PL_regkind[op] != END && (!last || node < last)) {
10012 /* While that wasn't END last time... */
10015 if (op == CLOSE || op == WHILEM)
10017 next = regnext((regnode *)node);
10020 if (OP(node) == OPTIMIZED) {
10021 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
10028 regprop(r, sv, node);
10029 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
10030 (int)(2*indent + 1), "", SvPVX_const(sv));
10032 if (OP(node) != OPTIMIZED) {
10033 if (next == NULL) /* Next ptr. */
10034 PerlIO_printf(Perl_debug_log, " (0)");
10035 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
10036 PerlIO_printf(Perl_debug_log, " (FAIL)");
10038 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
10039 (void)PerlIO_putc(Perl_debug_log, '\n');
10043 if (PL_regkind[(U8)op] == BRANCHJ) {
10046 register const regnode *nnode = (OP(next) == LONGJMP
10047 ? regnext((regnode *)next)
10049 if (last && nnode > last)
10051 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
10054 else if (PL_regkind[(U8)op] == BRANCH) {
10056 DUMPUNTIL(NEXTOPER(node), next);
10058 else if ( PL_regkind[(U8)op] == TRIE ) {
10059 const regnode *this_trie = node;
10060 const char op = OP(node);
10061 const U32 n = ARG(node);
10062 const reg_ac_data * const ac = op>=AHOCORASICK ?
10063 (reg_ac_data *)ri->data->data[n] :
10065 const reg_trie_data * const trie =
10066 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
10068 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
10070 const regnode *nextbranch= NULL;
10073 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
10074 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
10076 PerlIO_printf(Perl_debug_log, "%*s%s ",
10077 (int)(2*(indent+3)), "",
10078 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
10079 PL_colors[0], PL_colors[1],
10080 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
10081 PERL_PV_PRETTY_ELLIPSES |
10082 PERL_PV_PRETTY_LTGT
10087 U16 dist= trie->jump[word_idx+1];
10088 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
10089 (UV)((dist ? this_trie + dist : next) - start));
10092 nextbranch= this_trie + trie->jump[0];
10093 DUMPUNTIL(this_trie + dist, nextbranch);
10095 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
10096 nextbranch= regnext((regnode *)nextbranch);
10098 PerlIO_printf(Perl_debug_log, "\n");
10101 if (last && next > last)
10106 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
10107 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
10108 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
10110 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
10112 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
10114 else if ( op == PLUS || op == STAR) {
10115 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
10117 else if (op == ANYOF) {
10118 /* arglen 1 + class block */
10119 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
10120 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
10121 node = NEXTOPER(node);
10123 else if (PL_regkind[(U8)op] == EXACT) {
10124 /* Literal string, where present. */
10125 node += NODE_SZ_STR(node) - 1;
10126 node = NEXTOPER(node);
10129 node = NEXTOPER(node);
10130 node += regarglen[(U8)op];
10132 if (op == CURLYX || op == OPEN)
10136 #ifdef DEBUG_DUMPUNTIL
10137 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
10142 #endif /* DEBUGGING */
10146 * c-indentation-style: bsd
10147 * c-basic-offset: 4
10148 * indent-tabs-mode: t
10151 * ex: set ts=8 sts=4 sw=4 noet: