5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
8 /* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
37 * pregcomp and pregexec -- regsub and regerror are not used in perl
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
57 **** Alterations to Henry's code are...
59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
71 #define PERL_IN_REGCOMP_C
74 #ifndef PERL_IN_XSUB_RE
79 #ifdef PERL_IN_XSUB_RE
90 # if defined(BUGGY_MSC6)
91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
92 # pragma optimize("a",off)
93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
94 # pragma optimize("w",on )
95 # endif /* BUGGY_MSC6 */
102 typedef struct RExC_state_t {
103 U32 flags; /* are we folding, multilining? */
104 char *precomp; /* uncompiled string. */
106 char *start; /* Start of input for compile */
107 char *end; /* End of input for compile */
108 char *parse; /* Input-scan pointer. */
109 I32 whilem_seen; /* number of WHILEM in this expr */
110 regnode *emit_start; /* Start of emitted-code area */
111 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
112 I32 naughty; /* How bad is this pattern? */
113 I32 sawback; /* Did we see \1, ...? */
115 I32 size; /* Code size. */
116 I32 npar; /* () count. */
117 I32 nestroot; /* root parens we are in - used by accept */
121 regnode **open_parens; /* pointers to open parens */
122 regnode **close_parens; /* pointers to close parens */
123 regnode *opend; /* END node in program */
125 HV *charnames; /* cache of named sequences */
126 HV *paren_names; /* Paren names */
127 regnode **recurse; /* Recurse regops */
128 I32 recurse_count; /* Number of recurse regops */
130 char *starttry; /* -Dr: where regtry was called. */
131 #define RExC_starttry (pRExC_state->starttry)
134 const char *lastparse;
136 #define RExC_lastparse (pRExC_state->lastparse)
137 #define RExC_lastnum (pRExC_state->lastnum)
141 #define RExC_flags (pRExC_state->flags)
142 #define RExC_precomp (pRExC_state->precomp)
143 #define RExC_rx (pRExC_state->rx)
144 #define RExC_start (pRExC_state->start)
145 #define RExC_end (pRExC_state->end)
146 #define RExC_parse (pRExC_state->parse)
147 #define RExC_whilem_seen (pRExC_state->whilem_seen)
148 #define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
149 #define RExC_emit (pRExC_state->emit)
150 #define RExC_emit_start (pRExC_state->emit_start)
151 #define RExC_naughty (pRExC_state->naughty)
152 #define RExC_sawback (pRExC_state->sawback)
153 #define RExC_seen (pRExC_state->seen)
154 #define RExC_size (pRExC_state->size)
155 #define RExC_npar (pRExC_state->npar)
156 #define RExC_nestroot (pRExC_state->nestroot)
157 #define RExC_extralen (pRExC_state->extralen)
158 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
159 #define RExC_seen_evals (pRExC_state->seen_evals)
160 #define RExC_utf8 (pRExC_state->utf8)
161 #define RExC_charnames (pRExC_state->charnames)
162 #define RExC_open_parens (pRExC_state->open_parens)
163 #define RExC_close_parens (pRExC_state->close_parens)
164 #define RExC_opend (pRExC_state->opend)
165 #define RExC_paren_names (pRExC_state->paren_names)
166 #define RExC_recurse (pRExC_state->recurse)
167 #define RExC_recurse_count (pRExC_state->recurse_count)
169 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
170 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
171 ((*s) == '{' && regcurly(s)))
174 #undef SPSTART /* dratted cpp namespace... */
177 * Flags to be passed up and down.
179 #define WORST 0 /* Worst case. */
180 #define HASWIDTH 0x1 /* Known to match non-null strings. */
181 #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
182 #define SPSTART 0x4 /* Starts with * or +. */
183 #define TRYAGAIN 0x8 /* Weeded out a declaration. */
185 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
187 /* whether trie related optimizations are enabled */
188 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
189 #define TRIE_STUDY_OPT
190 #define FULL_TRIE_STUDY
196 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
197 #define PBITVAL(paren) (1 << ((paren) & 7))
198 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
199 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
200 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
203 /* About scan_data_t.
205 During optimisation we recurse through the regexp program performing
206 various inplace (keyhole style) optimisations. In addition study_chunk
207 and scan_commit populate this data structure with information about
208 what strings MUST appear in the pattern. We look for the longest
209 string that must appear for at a fixed location, and we look for the
210 longest string that may appear at a floating location. So for instance
215 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
216 strings (because they follow a .* construct). study_chunk will identify
217 both FOO and BAR as being the longest fixed and floating strings respectively.
219 The strings can be composites, for instance
223 will result in a composite fixed substring 'foo'.
225 For each string some basic information is maintained:
227 - offset or min_offset
228 This is the position the string must appear at, or not before.
229 It also implicitly (when combined with minlenp) tells us how many
230 character must match before the string we are searching.
231 Likewise when combined with minlenp and the length of the string
232 tells us how many characters must appear after the string we have
236 Only used for floating strings. This is the rightmost point that
237 the string can appear at. Ifset to I32 max it indicates that the
238 string can occur infinitely far to the right.
241 A pointer to the minimum length of the pattern that the string
242 was found inside. This is important as in the case of positive
243 lookahead or positive lookbehind we can have multiple patterns
248 The minimum length of the pattern overall is 3, the minimum length
249 of the lookahead part is 3, but the minimum length of the part that
250 will actually match is 1. So 'FOO's minimum length is 3, but the
251 minimum length for the F is 1. This is important as the minimum length
252 is used to determine offsets in front of and behind the string being
253 looked for. Since strings can be composites this is the length of the
254 pattern at the time it was commited with a scan_commit. Note that
255 the length is calculated by study_chunk, so that the minimum lengths
256 are not known until the full pattern has been compiled, thus the
257 pointer to the value.
261 In the case of lookbehind the string being searched for can be
262 offset past the start point of the final matching string.
263 If this value was just blithely removed from the min_offset it would
264 invalidate some of the calculations for how many chars must match
265 before or after (as they are derived from min_offset and minlen and
266 the length of the string being searched for).
267 When the final pattern is compiled and the data is moved from the
268 scan_data_t structure into the regexp structure the information
269 about lookbehind is factored in, with the information that would
270 have been lost precalculated in the end_shift field for the
273 The fields pos_min and pos_delta are used to store the minimum offset
274 and the delta to the maximum offset at the current point in the pattern.
278 typedef struct scan_data_t {
279 /*I32 len_min; unused */
280 /*I32 len_delta; unused */
284 I32 last_end; /* min value, <0 unless valid. */
287 SV **longest; /* Either &l_fixed, or &l_float. */
288 SV *longest_fixed; /* longest fixed string found in pattern */
289 I32 offset_fixed; /* offset where it starts */
290 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
291 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
292 SV *longest_float; /* longest floating string found in pattern */
293 I32 offset_float_min; /* earliest point in string it can appear */
294 I32 offset_float_max; /* latest point in string it can appear */
295 I32 *minlen_float; /* pointer to the minlen relevent to the string */
296 I32 lookbehind_float; /* is the position of the string modified by LB */
300 struct regnode_charclass_class *start_class;
304 * Forward declarations for pregcomp()'s friends.
307 static const scan_data_t zero_scan_data =
308 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
310 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
311 #define SF_BEFORE_SEOL 0x0001
312 #define SF_BEFORE_MEOL 0x0002
313 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
314 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
317 # define SF_FIX_SHIFT_EOL (0+2)
318 # define SF_FL_SHIFT_EOL (0+4)
320 # define SF_FIX_SHIFT_EOL (+2)
321 # define SF_FL_SHIFT_EOL (+4)
324 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
325 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
327 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
328 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
329 #define SF_IS_INF 0x0040
330 #define SF_HAS_PAR 0x0080
331 #define SF_IN_PAR 0x0100
332 #define SF_HAS_EVAL 0x0200
333 #define SCF_DO_SUBSTR 0x0400
334 #define SCF_DO_STCLASS_AND 0x0800
335 #define SCF_DO_STCLASS_OR 0x1000
336 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
337 #define SCF_WHILEM_VISITED_POS 0x2000
339 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
340 #define SCF_SEEN_ACCEPT 0x8000
342 #define UTF (RExC_utf8 != 0)
343 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
344 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
346 #define OOB_UNICODE 12345678
347 #define OOB_NAMEDCLASS -1
349 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
350 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
353 /* length of regex to show in messages that don't mark a position within */
354 #define RegexLengthToShowInErrorMessages 127
357 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
358 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
359 * op/pragma/warn/regcomp.
361 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
362 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
364 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
367 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
368 * arg. Show regex, up to a maximum length. If it's too long, chop and add
371 #define FAIL(msg) STMT_START { \
372 const char *ellipses = ""; \
373 IV len = RExC_end - RExC_precomp; \
376 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
377 if (len > RegexLengthToShowInErrorMessages) { \
378 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
379 len = RegexLengthToShowInErrorMessages - 10; \
382 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
383 msg, (int)len, RExC_precomp, ellipses); \
387 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
389 #define Simple_vFAIL(m) STMT_START { \
390 const IV offset = RExC_parse - RExC_precomp; \
391 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
392 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
396 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
398 #define vFAIL(m) STMT_START { \
400 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
405 * Like Simple_vFAIL(), but accepts two arguments.
407 #define Simple_vFAIL2(m,a1) STMT_START { \
408 const IV offset = RExC_parse - RExC_precomp; \
409 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
410 (int)offset, RExC_precomp, RExC_precomp + offset); \
414 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
416 #define vFAIL2(m,a1) STMT_START { \
418 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
419 Simple_vFAIL2(m, a1); \
424 * Like Simple_vFAIL(), but accepts three arguments.
426 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
427 const IV offset = RExC_parse - RExC_precomp; \
428 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
429 (int)offset, RExC_precomp, RExC_precomp + offset); \
433 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
435 #define vFAIL3(m,a1,a2) STMT_START { \
437 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
438 Simple_vFAIL3(m, a1, a2); \
442 * Like Simple_vFAIL(), but accepts four arguments.
444 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
445 const IV offset = RExC_parse - RExC_precomp; \
446 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
447 (int)offset, RExC_precomp, RExC_precomp + offset); \
450 #define vWARN(loc,m) STMT_START { \
451 const IV offset = loc - RExC_precomp; \
452 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
453 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
456 #define vWARNdep(loc,m) STMT_START { \
457 const IV offset = loc - RExC_precomp; \
458 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
459 "%s" REPORT_LOCATION, \
460 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
464 #define vWARN2(loc, m, a1) STMT_START { \
465 const IV offset = loc - RExC_precomp; \
466 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
467 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
470 #define vWARN3(loc, m, a1, a2) STMT_START { \
471 const IV offset = loc - RExC_precomp; \
472 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
473 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
476 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
477 const IV offset = loc - RExC_precomp; \
478 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
479 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
482 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
483 const IV offset = loc - RExC_precomp; \
484 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
485 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
489 /* Allow for side effects in s */
490 #define REGC(c,s) STMT_START { \
491 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
494 /* Macros for recording node offsets. 20001227 mjd@plover.com
495 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
496 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
497 * Element 0 holds the number n.
498 * Position is 1 indexed.
501 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
503 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
504 __LINE__, (node), (int)(byte))); \
506 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
508 RExC_offsets[2*(node)-1] = (byte); \
513 #define Set_Node_Offset(node,byte) \
514 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
515 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
517 #define Set_Node_Length_To_R(node,len) STMT_START { \
519 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
520 __LINE__, (int)(node), (int)(len))); \
522 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
524 RExC_offsets[2*(node)] = (len); \
529 #define Set_Node_Length(node,len) \
530 Set_Node_Length_To_R((node)-RExC_emit_start, len)
531 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
532 #define Set_Node_Cur_Length(node) \
533 Set_Node_Length(node, RExC_parse - parse_start)
535 /* Get offsets and lengths */
536 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
537 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
539 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
540 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
541 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
545 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
546 #define EXPERIMENTAL_INPLACESCAN
549 #define DEBUG_STUDYDATA(data,depth) \
550 DEBUG_OPTIMISE_MORE_r(if(data){ \
551 PerlIO_printf(Perl_debug_log, \
552 "%*s"/* Len:%"IVdf"/%"IVdf" */" Pos:%"IVdf"/%"IVdf \
553 " Flags: %"IVdf" Whilem_c: %"IVdf" Lcp: %"IVdf" ", \
554 (int)(depth)*2, "", \
555 (IV)((data)->pos_min), \
556 (IV)((data)->pos_delta), \
557 (IV)((data)->flags), \
558 (IV)((data)->whilem_c), \
559 (IV)((data)->last_closep ? *((data)->last_closep) : -1) \
561 if ((data)->last_found) \
562 PerlIO_printf(Perl_debug_log, \
563 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
564 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
565 SvPVX_const((data)->last_found), \
566 (IV)((data)->last_end), \
567 (IV)((data)->last_start_min), \
568 (IV)((data)->last_start_max), \
569 ((data)->longest && \
570 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
571 SvPVX_const((data)->longest_fixed), \
572 (IV)((data)->offset_fixed), \
573 ((data)->longest && \
574 (data)->longest==&((data)->longest_float)) ? "*" : "", \
575 SvPVX_const((data)->longest_float), \
576 (IV)((data)->offset_float_min), \
577 (IV)((data)->offset_float_max) \
579 PerlIO_printf(Perl_debug_log,"\n"); \
582 static void clear_re(pTHX_ void *r);
584 /* Mark that we cannot extend a found fixed substring at this point.
585 Update the longest found anchored substring and the longest found
586 floating substrings if needed. */
589 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp)
591 const STRLEN l = CHR_SVLEN(data->last_found);
592 const STRLEN old_l = CHR_SVLEN(*data->longest);
593 GET_RE_DEBUG_FLAGS_DECL;
595 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
596 SvSetMagicSV(*data->longest, data->last_found);
597 if (*data->longest == data->longest_fixed) {
598 data->offset_fixed = l ? data->last_start_min : data->pos_min;
599 if (data->flags & SF_BEFORE_EOL)
601 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
603 data->flags &= ~SF_FIX_BEFORE_EOL;
604 data->minlen_fixed=minlenp;
605 data->lookbehind_fixed=0;
608 data->offset_float_min = l ? data->last_start_min : data->pos_min;
609 data->offset_float_max = (l
610 ? data->last_start_max
611 : data->pos_min + data->pos_delta);
612 if ((U32)data->offset_float_max > (U32)I32_MAX)
613 data->offset_float_max = I32_MAX;
614 if (data->flags & SF_BEFORE_EOL)
616 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
618 data->flags &= ~SF_FL_BEFORE_EOL;
619 data->minlen_float=minlenp;
620 data->lookbehind_float=0;
623 SvCUR_set(data->last_found, 0);
625 SV * const sv = data->last_found;
626 if (SvUTF8(sv) && SvMAGICAL(sv)) {
627 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
633 data->flags &= ~SF_BEFORE_EOL;
634 DEBUG_STUDYDATA(data,0);
637 /* Can match anything (initialization) */
639 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
641 ANYOF_CLASS_ZERO(cl);
642 ANYOF_BITMAP_SETALL(cl);
643 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
645 cl->flags |= ANYOF_LOCALE;
648 /* Can match anything (initialization) */
650 S_cl_is_anything(const struct regnode_charclass_class *cl)
654 for (value = 0; value <= ANYOF_MAX; value += 2)
655 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
657 if (!(cl->flags & ANYOF_UNICODE_ALL))
659 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
664 /* Can match anything (initialization) */
666 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
668 Zero(cl, 1, struct regnode_charclass_class);
670 cl_anything(pRExC_state, cl);
674 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
676 Zero(cl, 1, struct regnode_charclass_class);
678 cl_anything(pRExC_state, cl);
680 cl->flags |= ANYOF_LOCALE;
683 /* 'And' a given class with another one. Can create false positives */
684 /* We assume that cl is not inverted */
686 S_cl_and(struct regnode_charclass_class *cl,
687 const struct regnode_charclass_class *and_with)
690 assert(and_with->type == ANYOF);
691 if (!(and_with->flags & ANYOF_CLASS)
692 && !(cl->flags & ANYOF_CLASS)
693 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
694 && !(and_with->flags & ANYOF_FOLD)
695 && !(cl->flags & ANYOF_FOLD)) {
698 if (and_with->flags & ANYOF_INVERT)
699 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
700 cl->bitmap[i] &= ~and_with->bitmap[i];
702 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
703 cl->bitmap[i] &= and_with->bitmap[i];
704 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
705 if (!(and_with->flags & ANYOF_EOS))
706 cl->flags &= ~ANYOF_EOS;
708 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
709 !(and_with->flags & ANYOF_INVERT)) {
710 cl->flags &= ~ANYOF_UNICODE_ALL;
711 cl->flags |= ANYOF_UNICODE;
712 ARG_SET(cl, ARG(and_with));
714 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
715 !(and_with->flags & ANYOF_INVERT))
716 cl->flags &= ~ANYOF_UNICODE_ALL;
717 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
718 !(and_with->flags & ANYOF_INVERT))
719 cl->flags &= ~ANYOF_UNICODE;
722 /* 'OR' a given class with another one. Can create false positives */
723 /* We assume that cl is not inverted */
725 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
727 if (or_with->flags & ANYOF_INVERT) {
729 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
730 * <= (B1 | !B2) | (CL1 | !CL2)
731 * which is wasteful if CL2 is small, but we ignore CL2:
732 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
733 * XXXX Can we handle case-fold? Unclear:
734 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
735 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
737 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
738 && !(or_with->flags & ANYOF_FOLD)
739 && !(cl->flags & ANYOF_FOLD) ) {
742 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
743 cl->bitmap[i] |= ~or_with->bitmap[i];
744 } /* XXXX: logic is complicated otherwise */
746 cl_anything(pRExC_state, cl);
749 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
750 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
751 && (!(or_with->flags & ANYOF_FOLD)
752 || (cl->flags & ANYOF_FOLD)) ) {
755 /* OR char bitmap and class bitmap separately */
756 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
757 cl->bitmap[i] |= or_with->bitmap[i];
758 if (or_with->flags & ANYOF_CLASS) {
759 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
760 cl->classflags[i] |= or_with->classflags[i];
761 cl->flags |= ANYOF_CLASS;
764 else { /* XXXX: logic is complicated, leave it along for a moment. */
765 cl_anything(pRExC_state, cl);
768 if (or_with->flags & ANYOF_EOS)
769 cl->flags |= ANYOF_EOS;
771 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
772 ARG(cl) != ARG(or_with)) {
773 cl->flags |= ANYOF_UNICODE_ALL;
774 cl->flags &= ~ANYOF_UNICODE;
776 if (or_with->flags & ANYOF_UNICODE_ALL) {
777 cl->flags |= ANYOF_UNICODE_ALL;
778 cl->flags &= ~ANYOF_UNICODE;
782 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
783 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
784 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
785 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
791 dump_trie_interim_list(trie,next_alloc)
792 dump_trie_interim_table(trie,next_alloc)
794 These routines dump out a trie in a somewhat readable format.
795 The _interim_ variants are used for debugging the interim
796 tables that are used to generate the final compressed
797 representation which is what dump_trie expects.
799 Part of the reason for their existance is to provide a form
800 of documentation as to how the different representations function.
806 Dumps the final compressed table form of the trie to Perl_debug_log.
807 Used for debugging make_trie().
811 S_dump_trie(pTHX_ const struct _reg_trie_data *trie,U32 depth)
814 SV *sv=sv_newmortal();
815 int colwidth= trie->widecharmap ? 6 : 4;
816 GET_RE_DEBUG_FLAGS_DECL;
819 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
820 (int)depth * 2 + 2,"",
821 "Match","Base","Ofs" );
823 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
824 SV ** const tmp = av_fetch( trie->revcharmap, state, 0);
826 PerlIO_printf( Perl_debug_log, "%*s",
828 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
829 PL_colors[0], PL_colors[1],
830 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
831 PERL_PV_ESCAPE_FIRSTCHAR
836 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
837 (int)depth * 2 + 2,"");
839 for( state = 0 ; state < trie->uniquecharcount ; state++ )
840 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
841 PerlIO_printf( Perl_debug_log, "\n");
843 for( state = 1 ; state < trie->statecount ; state++ ) {
844 const U32 base = trie->states[ state ].trans.base;
846 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
848 if ( trie->states[ state ].wordnum ) {
849 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
851 PerlIO_printf( Perl_debug_log, "%6s", "" );
854 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
859 while( ( base + ofs < trie->uniquecharcount ) ||
860 ( base + ofs - trie->uniquecharcount < trie->lasttrans
861 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
864 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
866 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
867 if ( ( base + ofs >= trie->uniquecharcount ) &&
868 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
869 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
871 PerlIO_printf( Perl_debug_log, "%*"UVXf,
873 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
875 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
879 PerlIO_printf( Perl_debug_log, "]");
882 PerlIO_printf( Perl_debug_log, "\n" );
886 dump_trie_interim_list(trie,next_alloc)
887 Dumps a fully constructed but uncompressed trie in list form.
888 List tries normally only are used for construction when the number of
889 possible chars (trie->uniquecharcount) is very high.
890 Used for debugging make_trie().
893 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc,U32 depth)
896 SV *sv=sv_newmortal();
897 int colwidth= trie->widecharmap ? 6 : 4;
898 GET_RE_DEBUG_FLAGS_DECL;
899 /* print out the table precompression. */
900 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
901 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
902 "------:-----+-----------------\n" );
904 for( state=1 ; state < next_alloc ; state ++ ) {
907 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
908 (int)depth * 2 + 2,"", (UV)state );
909 if ( ! trie->states[ state ].wordnum ) {
910 PerlIO_printf( Perl_debug_log, "%5s| ","");
912 PerlIO_printf( Perl_debug_log, "W%4x| ",
913 trie->states[ state ].wordnum
916 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
917 SV ** const tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
919 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
921 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
922 PL_colors[0], PL_colors[1],
923 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
924 PERL_PV_ESCAPE_FIRSTCHAR
926 TRIE_LIST_ITEM(state,charid).forid,
927 (UV)TRIE_LIST_ITEM(state,charid).newstate
930 PerlIO_printf(Perl_debug_log, "\n%*s| ",
931 (int)((depth * 2) + 14), "");
934 PerlIO_printf( Perl_debug_log, "\n");
939 dump_trie_interim_table(trie,next_alloc)
940 Dumps a fully constructed but uncompressed trie in table form.
941 This is the normal DFA style state transition table, with a few
942 twists to facilitate compression later.
943 Used for debugging make_trie().
946 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc, U32 depth)
950 SV *sv=sv_newmortal();
951 int colwidth= trie->widecharmap ? 6 : 4;
952 GET_RE_DEBUG_FLAGS_DECL;
955 print out the table precompression so that we can do a visual check
956 that they are identical.
959 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
961 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
962 SV ** const tmp = av_fetch( trie->revcharmap, charid, 0);
964 PerlIO_printf( Perl_debug_log, "%*s",
966 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
967 PL_colors[0], PL_colors[1],
968 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
969 PERL_PV_ESCAPE_FIRSTCHAR
975 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
977 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
978 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
981 PerlIO_printf( Perl_debug_log, "\n" );
983 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
985 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
986 (int)depth * 2 + 2,"",
987 (UV)TRIE_NODENUM( state ) );
989 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
990 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
992 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
994 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
996 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
997 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
999 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1000 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1007 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1008 startbranch: the first branch in the whole branch sequence
1009 first : start branch of sequence of branch-exact nodes.
1010 May be the same as startbranch
1011 last : Thing following the last branch.
1012 May be the same as tail.
1013 tail : item following the branch sequence
1014 count : words in the sequence
1015 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1016 depth : indent depth
1018 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1020 A trie is an N'ary tree where the branches are determined by digital
1021 decomposition of the key. IE, at the root node you look up the 1st character and
1022 follow that branch repeat until you find the end of the branches. Nodes can be
1023 marked as "accepting" meaning they represent a complete word. Eg:
1027 would convert into the following structure. Numbers represent states, letters
1028 following numbers represent valid transitions on the letter from that state, if
1029 the number is in square brackets it represents an accepting state, otherwise it
1030 will be in parenthesis.
1032 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1036 (1) +-i->(6)-+-s->[7]
1038 +-s->(3)-+-h->(4)-+-e->[5]
1040 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1042 This shows that when matching against the string 'hers' we will begin at state 1
1043 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1044 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1045 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1046 single traverse. We store a mapping from accepting to state to which word was
1047 matched, and then when we have multiple possibilities we try to complete the
1048 rest of the regex in the order in which they occured in the alternation.
1050 The only prior NFA like behaviour that would be changed by the TRIE support is
1051 the silent ignoring of duplicate alternations which are of the form:
1053 / (DUPE|DUPE) X? (?{ ... }) Y /x
1055 Thus EVAL blocks follwing a trie may be called a different number of times with
1056 and without the optimisation. With the optimisations dupes will be silently
1057 ignored. This inconsistant behaviour of EVAL type nodes is well established as
1058 the following demonstrates:
1060 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1062 which prints out 'word' three times, but
1064 'words'=~/(word|word|word)(?{ print $1 })S/
1066 which doesnt print it out at all. This is due to other optimisations kicking in.
1068 Example of what happens on a structural level:
1070 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1072 1: CURLYM[1] {1,32767}(18)
1083 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1084 and should turn into:
1086 1: CURLYM[1] {1,32767}(18)
1088 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1096 Cases where tail != last would be like /(?foo|bar)baz/:
1106 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1107 and would end up looking like:
1110 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1117 d = uvuni_to_utf8_flags(d, uv, 0);
1119 is the recommended Unicode-aware way of saying
1124 #define TRIE_STORE_REVCHAR \
1126 SV *tmp = newSVpvs(""); \
1127 if (UTF) SvUTF8_on(tmp); \
1128 Perl_sv_catpvf( aTHX_ tmp, "%c", (int)uvc ); \
1129 av_push( TRIE_REVCHARMAP(trie), tmp ); \
1132 #define TRIE_READ_CHAR STMT_START { \
1136 if ( foldlen > 0 ) { \
1137 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1142 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1143 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1144 foldlen -= UNISKIP( uvc ); \
1145 scan = foldbuf + UNISKIP( uvc ); \
1148 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1158 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1159 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1160 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1161 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1163 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1164 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1165 TRIE_LIST_CUR( state )++; \
1168 #define TRIE_LIST_NEW(state) STMT_START { \
1169 Newxz( trie->states[ state ].trans.list, \
1170 4, reg_trie_trans_le ); \
1171 TRIE_LIST_CUR( state ) = 1; \
1172 TRIE_LIST_LEN( state ) = 4; \
1175 #define TRIE_HANDLE_WORD(state) STMT_START { \
1176 U16 dupe= trie->states[ state ].wordnum; \
1177 regnode * const noper_next = regnext( noper ); \
1179 if (trie->wordlen) \
1180 trie->wordlen[ curword ] = wordlen; \
1182 /* store the word for dumping */ \
1184 if (OP(noper) != NOTHING) \
1185 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
1187 tmp = newSVpvn( "", 0 ); \
1188 if ( UTF ) SvUTF8_on( tmp ); \
1189 av_push( trie->words, tmp ); \
1194 if ( noper_next < tail ) { \
1196 Newxz( trie->jump, word_count + 1, U16); \
1197 trie->jump[curword] = (U16)(noper_next - convert); \
1199 jumper = noper_next; \
1201 nextbranch= regnext(cur); \
1205 /* So it's a dupe. This means we need to maintain a */\
1206 /* linked-list from the first to the next. */\
1207 /* we only allocate the nextword buffer when there */\
1208 /* a dupe, so first time we have to do the allocation */\
1209 if (!trie->nextword) \
1210 Newxz( trie->nextword, word_count + 1, U16); \
1211 while ( trie->nextword[dupe] ) \
1212 dupe= trie->nextword[dupe]; \
1213 trie->nextword[dupe]= curword; \
1215 /* we haven't inserted this word yet. */ \
1216 trie->states[ state ].wordnum = curword; \
1221 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1222 ( ( base + charid >= ucharcount \
1223 && base + charid < ubound \
1224 && state == trie->trans[ base - ucharcount + charid ].check \
1225 && trie->trans[ base - ucharcount + charid ].next ) \
1226 ? trie->trans[ base - ucharcount + charid ].next \
1227 : ( state==1 ? special : 0 ) \
1231 #define MADE_JUMP_TRIE 2
1232 #define MADE_EXACT_TRIE 4
1235 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1238 /* first pass, loop through and scan words */
1239 reg_trie_data *trie;
1241 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1246 regnode *jumper = NULL;
1247 regnode *nextbranch = NULL;
1248 regnode *convert = NULL;
1249 /* we just use folder as a flag in utf8 */
1250 const U8 * const folder = ( flags == EXACTF
1252 : ( flags == EXACTFL
1258 const U32 data_slot = add_data( pRExC_state, 1, "t" );
1259 SV *re_trie_maxbuff;
1261 /* these are only used during construction but are useful during
1262 * debugging so we store them in the struct when debugging.
1264 STRLEN trie_charcount=0;
1265 AV *trie_revcharmap;
1267 GET_RE_DEBUG_FLAGS_DECL;
1269 PERL_UNUSED_ARG(depth);
1272 Newxz( trie, 1, reg_trie_data );
1274 trie->startstate = 1;
1275 trie->wordcount = word_count;
1276 RExC_rx->data->data[ data_slot ] = (void*)trie;
1277 Newxz( trie->charmap, 256, U16 );
1278 if (!(UTF && folder))
1279 Newxz( trie->bitmap, ANYOF_BITMAP_SIZE, char );
1281 trie->words = newAV();
1283 TRIE_REVCHARMAP(trie) = newAV();
1285 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1286 if (!SvIOK(re_trie_maxbuff)) {
1287 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1290 PerlIO_printf( Perl_debug_log,
1291 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1292 (int)depth * 2 + 2, "",
1293 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1294 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1298 /* Find the node we are going to overwrite */
1299 if ( first == startbranch && OP( last ) != BRANCH ) {
1300 /* whole branch chain */
1303 /* branch sub-chain */
1304 convert = NEXTOPER( first );
1307 /* -- First loop and Setup --
1309 We first traverse the branches and scan each word to determine if it
1310 contains widechars, and how many unique chars there are, this is
1311 important as we have to build a table with at least as many columns as we
1314 We use an array of integers to represent the character codes 0..255
1315 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1316 native representation of the character value as the key and IV's for the
1319 *TODO* If we keep track of how many times each character is used we can
1320 remap the columns so that the table compression later on is more
1321 efficient in terms of memory by ensuring most common value is in the
1322 middle and the least common are on the outside. IMO this would be better
1323 than a most to least common mapping as theres a decent chance the most
1324 common letter will share a node with the least common, meaning the node
1325 will not be compressable. With a middle is most common approach the worst
1326 case is when we have the least common nodes twice.
1330 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1331 regnode * const noper = NEXTOPER( cur );
1332 const U8 *uc = (U8*)STRING( noper );
1333 const U8 * const e = uc + STR_LEN( noper );
1335 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1336 const U8 *scan = (U8*)NULL;
1337 U32 wordlen = 0; /* required init */
1340 if (OP(noper) == NOTHING) {
1345 TRIE_BITMAP_SET(trie,*uc);
1346 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1348 for ( ; uc < e ; uc += len ) {
1349 TRIE_CHARCOUNT(trie)++;
1353 if ( !trie->charmap[ uvc ] ) {
1354 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1356 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1361 if ( !trie->widecharmap )
1362 trie->widecharmap = newHV();
1364 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1367 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1369 if ( !SvTRUE( *svpp ) ) {
1370 sv_setiv( *svpp, ++trie->uniquecharcount );
1375 if( cur == first ) {
1378 } else if (chars < trie->minlen) {
1380 } else if (chars > trie->maxlen) {
1384 } /* end first pass */
1385 DEBUG_TRIE_COMPILE_r(
1386 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1387 (int)depth * 2 + 2,"",
1388 ( trie->widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1389 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1390 (int)trie->minlen, (int)trie->maxlen )
1392 Newxz( trie->wordlen, word_count, U32 );
1395 We now know what we are dealing with in terms of unique chars and
1396 string sizes so we can calculate how much memory a naive
1397 representation using a flat table will take. If it's over a reasonable
1398 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1399 conservative but potentially much slower representation using an array
1402 At the end we convert both representations into the same compressed
1403 form that will be used in regexec.c for matching with. The latter
1404 is a form that cannot be used to construct with but has memory
1405 properties similar to the list form and access properties similar
1406 to the table form making it both suitable for fast searches and
1407 small enough that its feasable to store for the duration of a program.
1409 See the comment in the code where the compressed table is produced
1410 inplace from the flat tabe representation for an explanation of how
1411 the compression works.
1416 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1418 Second Pass -- Array Of Lists Representation
1420 Each state will be represented by a list of charid:state records
1421 (reg_trie_trans_le) the first such element holds the CUR and LEN
1422 points of the allocated array. (See defines above).
1424 We build the initial structure using the lists, and then convert
1425 it into the compressed table form which allows faster lookups
1426 (but cant be modified once converted).
1429 STRLEN transcount = 1;
1431 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1432 "%*sCompiling trie using list compiler\n",
1433 (int)depth * 2 + 2, ""));
1435 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
1439 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1441 regnode * const noper = NEXTOPER( cur );
1442 U8 *uc = (U8*)STRING( noper );
1443 const U8 * const e = uc + STR_LEN( noper );
1444 U32 state = 1; /* required init */
1445 U16 charid = 0; /* sanity init */
1446 U8 *scan = (U8*)NULL; /* sanity init */
1447 STRLEN foldlen = 0; /* required init */
1448 U32 wordlen = 0; /* required init */
1449 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1451 if (OP(noper) != NOTHING) {
1452 for ( ; uc < e ; uc += len ) {
1457 charid = trie->charmap[ uvc ];
1459 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1463 charid=(U16)SvIV( *svpp );
1466 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1473 if ( !trie->states[ state ].trans.list ) {
1474 TRIE_LIST_NEW( state );
1476 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1477 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1478 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1483 newstate = next_alloc++;
1484 TRIE_LIST_PUSH( state, charid, newstate );
1489 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1493 TRIE_HANDLE_WORD(state);
1495 } /* end second pass */
1497 /* next alloc is the NEXT state to be allocated */
1498 trie->statecount = next_alloc;
1499 Renew( trie->states, next_alloc, reg_trie_state );
1501 /* and now dump it out before we compress it */
1502 DEBUG_TRIE_COMPILE_MORE_r(
1503 dump_trie_interim_list(trie,next_alloc,depth+1)
1506 Newxz( trie->trans, transcount ,reg_trie_trans );
1513 for( state=1 ; state < next_alloc ; state ++ ) {
1517 DEBUG_TRIE_COMPILE_MORE_r(
1518 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1522 if (trie->states[state].trans.list) {
1523 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1527 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1528 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1529 if ( forid < minid ) {
1531 } else if ( forid > maxid ) {
1535 if ( transcount < tp + maxid - minid + 1) {
1537 Renew( trie->trans, transcount, reg_trie_trans );
1538 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1540 base = trie->uniquecharcount + tp - minid;
1541 if ( maxid == minid ) {
1543 for ( ; zp < tp ; zp++ ) {
1544 if ( ! trie->trans[ zp ].next ) {
1545 base = trie->uniquecharcount + zp - minid;
1546 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1547 trie->trans[ zp ].check = state;
1553 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1554 trie->trans[ tp ].check = state;
1559 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1560 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1561 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1562 trie->trans[ tid ].check = state;
1564 tp += ( maxid - minid + 1 );
1566 Safefree(trie->states[ state ].trans.list);
1569 DEBUG_TRIE_COMPILE_MORE_r(
1570 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1573 trie->states[ state ].trans.base=base;
1575 trie->lasttrans = tp + 1;
1579 Second Pass -- Flat Table Representation.
1581 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1582 We know that we will need Charcount+1 trans at most to store the data
1583 (one row per char at worst case) So we preallocate both structures
1584 assuming worst case.
1586 We then construct the trie using only the .next slots of the entry
1589 We use the .check field of the first entry of the node temporarily to
1590 make compression both faster and easier by keeping track of how many non
1591 zero fields are in the node.
1593 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1596 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1597 number representing the first entry of the node, and state as a
1598 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1599 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1600 are 2 entrys per node. eg:
1608 The table is internally in the right hand, idx form. However as we also
1609 have to deal with the states array which is indexed by nodenum we have to
1610 use TRIE_NODENUM() to convert.
1613 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1614 "%*sCompiling trie using table compiler\n",
1615 (int)depth * 2 + 2, ""));
1617 Newxz( trie->trans, ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1,
1619 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
1620 next_alloc = trie->uniquecharcount + 1;
1623 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1625 regnode * const noper = NEXTOPER( cur );
1626 const U8 *uc = (U8*)STRING( noper );
1627 const U8 * const e = uc + STR_LEN( noper );
1629 U32 state = 1; /* required init */
1631 U16 charid = 0; /* sanity init */
1632 U32 accept_state = 0; /* sanity init */
1633 U8 *scan = (U8*)NULL; /* sanity init */
1635 STRLEN foldlen = 0; /* required init */
1636 U32 wordlen = 0; /* required init */
1637 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1639 if ( OP(noper) != NOTHING ) {
1640 for ( ; uc < e ; uc += len ) {
1645 charid = trie->charmap[ uvc ];
1647 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1648 charid = svpp ? (U16)SvIV(*svpp) : 0;
1652 if ( !trie->trans[ state + charid ].next ) {
1653 trie->trans[ state + charid ].next = next_alloc;
1654 trie->trans[ state ].check++;
1655 next_alloc += trie->uniquecharcount;
1657 state = trie->trans[ state + charid ].next;
1659 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1661 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1664 accept_state = TRIE_NODENUM( state );
1665 TRIE_HANDLE_WORD(accept_state);
1667 } /* end second pass */
1669 /* and now dump it out before we compress it */
1670 DEBUG_TRIE_COMPILE_MORE_r(
1671 dump_trie_interim_table(trie,next_alloc,depth+1)
1676 * Inplace compress the table.*
1678 For sparse data sets the table constructed by the trie algorithm will
1679 be mostly 0/FAIL transitions or to put it another way mostly empty.
1680 (Note that leaf nodes will not contain any transitions.)
1682 This algorithm compresses the tables by eliminating most such
1683 transitions, at the cost of a modest bit of extra work during lookup:
1685 - Each states[] entry contains a .base field which indicates the
1686 index in the state[] array wheres its transition data is stored.
1688 - If .base is 0 there are no valid transitions from that node.
1690 - If .base is nonzero then charid is added to it to find an entry in
1693 -If trans[states[state].base+charid].check!=state then the
1694 transition is taken to be a 0/Fail transition. Thus if there are fail
1695 transitions at the front of the node then the .base offset will point
1696 somewhere inside the previous nodes data (or maybe even into a node
1697 even earlier), but the .check field determines if the transition is
1701 The following process inplace converts the table to the compressed
1702 table: We first do not compress the root node 1,and mark its all its
1703 .check pointers as 1 and set its .base pointer as 1 as well. This
1704 allows to do a DFA construction from the compressed table later, and
1705 ensures that any .base pointers we calculate later are greater than
1708 - We set 'pos' to indicate the first entry of the second node.
1710 - We then iterate over the columns of the node, finding the first and
1711 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1712 and set the .check pointers accordingly, and advance pos
1713 appropriately and repreat for the next node. Note that when we copy
1714 the next pointers we have to convert them from the original
1715 NODEIDX form to NODENUM form as the former is not valid post
1718 - If a node has no transitions used we mark its base as 0 and do not
1719 advance the pos pointer.
1721 - If a node only has one transition we use a second pointer into the
1722 structure to fill in allocated fail transitions from other states.
1723 This pointer is independent of the main pointer and scans forward
1724 looking for null transitions that are allocated to a state. When it
1725 finds one it writes the single transition into the "hole". If the
1726 pointer doesnt find one the single transition is appended as normal.
1728 - Once compressed we can Renew/realloc the structures to release the
1731 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1732 specifically Fig 3.47 and the associated pseudocode.
1736 const U32 laststate = TRIE_NODENUM( next_alloc );
1739 trie->statecount = laststate;
1741 for ( state = 1 ; state < laststate ; state++ ) {
1743 const U32 stateidx = TRIE_NODEIDX( state );
1744 const U32 o_used = trie->trans[ stateidx ].check;
1745 U32 used = trie->trans[ stateidx ].check;
1746 trie->trans[ stateidx ].check = 0;
1748 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1749 if ( flag || trie->trans[ stateidx + charid ].next ) {
1750 if ( trie->trans[ stateidx + charid ].next ) {
1752 for ( ; zp < pos ; zp++ ) {
1753 if ( ! trie->trans[ zp ].next ) {
1757 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1758 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1759 trie->trans[ zp ].check = state;
1760 if ( ++zp > pos ) pos = zp;
1767 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1769 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1770 trie->trans[ pos ].check = state;
1775 trie->lasttrans = pos + 1;
1776 Renew( trie->states, laststate, reg_trie_state);
1777 DEBUG_TRIE_COMPILE_MORE_r(
1778 PerlIO_printf( Perl_debug_log,
1779 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1780 (int)depth * 2 + 2,"",
1781 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1784 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1787 } /* end table compress */
1789 DEBUG_TRIE_COMPILE_MORE_r(
1790 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1791 (int)depth * 2 + 2, "",
1792 (UV)trie->statecount,
1793 (UV)trie->lasttrans)
1795 /* resize the trans array to remove unused space */
1796 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
1798 /* and now dump out the compressed format */
1799 DEBUG_TRIE_COMPILE_r(
1800 dump_trie(trie,depth+1)
1803 { /* Modify the program and insert the new TRIE node*/
1804 U8 nodetype =(U8)(flags & 0xFF);
1808 regnode *optimize = NULL;
1810 U32 mjd_nodelen = 0;
1813 This means we convert either the first branch or the first Exact,
1814 depending on whether the thing following (in 'last') is a branch
1815 or not and whther first is the startbranch (ie is it a sub part of
1816 the alternation or is it the whole thing.)
1817 Assuming its a sub part we conver the EXACT otherwise we convert
1818 the whole branch sequence, including the first.
1820 /* Find the node we are going to overwrite */
1821 if ( first != startbranch || OP( last ) == BRANCH ) {
1822 /* branch sub-chain */
1823 NEXT_OFF( first ) = (U16)(last - first);
1825 mjd_offset= Node_Offset((convert));
1826 mjd_nodelen= Node_Length((convert));
1828 /* whole branch chain */
1831 const regnode *nop = NEXTOPER( convert );
1832 mjd_offset= Node_Offset((nop));
1833 mjd_nodelen= Node_Length((nop));
1838 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1839 (int)depth * 2 + 2, "",
1840 (UV)mjd_offset, (UV)mjd_nodelen)
1843 /* But first we check to see if there is a common prefix we can
1844 split out as an EXACT and put in front of the TRIE node. */
1845 trie->startstate= 1;
1846 if ( trie->bitmap && !trie->widecharmap && !trie->jump ) {
1848 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
1852 const U32 base = trie->states[ state ].trans.base;
1854 if ( trie->states[state].wordnum )
1857 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1858 if ( ( base + ofs >= trie->uniquecharcount ) &&
1859 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1860 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1862 if ( ++count > 1 ) {
1863 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), ofs, 0);
1864 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
1865 if ( state == 1 ) break;
1867 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1869 PerlIO_printf(Perl_debug_log,
1870 "%*sNew Start State=%"UVuf" Class: [",
1871 (int)depth * 2 + 2, "",
1874 SV ** const tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1875 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
1877 TRIE_BITMAP_SET(trie,*ch);
1879 TRIE_BITMAP_SET(trie, folder[ *ch ]);
1881 PerlIO_printf(Perl_debug_log, (char*)ch)
1885 TRIE_BITMAP_SET(trie,*ch);
1887 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1888 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1894 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1895 const char *ch = SvPV_nolen_const( *tmp );
1897 PerlIO_printf( Perl_debug_log,
1898 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1899 (int)depth * 2 + 2, "",
1900 (UV)state, (UV)idx, ch)
1903 OP( convert ) = nodetype;
1904 str=STRING(convert);
1913 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
1919 regnode *n = convert+NODE_SZ_STR(convert);
1920 NEXT_OFF(convert) = NODE_SZ_STR(convert);
1921 trie->startstate = state;
1922 trie->minlen -= (state - 1);
1923 trie->maxlen -= (state - 1);
1925 regnode *fix = convert;
1927 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1928 while( ++fix < n ) {
1929 Set_Node_Offset_Length(fix, 0, 0);
1935 NEXT_OFF(convert) = (U16)(tail - convert);
1936 DEBUG_r(optimize= n);
1942 if ( trie->maxlen ) {
1943 NEXT_OFF( convert ) = (U16)(tail - convert);
1944 ARG_SET( convert, data_slot );
1945 /* Store the offset to the first unabsorbed branch in
1946 jump[0], which is otherwise unused by the jump logic.
1947 We use this when dumping a trie and during optimisation. */
1949 trie->jump[0] = (U16)(nextbranch - convert);
1952 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1953 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
1955 OP( convert ) = TRIEC;
1956 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
1957 Safefree(trie->bitmap);
1960 OP( convert ) = TRIE;
1962 /* store the type in the flags */
1963 convert->flags = nodetype;
1967 + regarglen[ OP( convert ) ];
1969 /* XXX We really should free up the resource in trie now,
1970 as we won't use them - (which resources?) dmq */
1972 /* needed for dumping*/
1973 DEBUG_r(if (optimize) {
1974 regnode *opt = convert;
1975 while ( ++opt < optimize) {
1976 Set_Node_Offset_Length(opt,0,0);
1979 Try to clean up some of the debris left after the
1982 while( optimize < jumper ) {
1983 mjd_nodelen += Node_Length((optimize));
1984 OP( optimize ) = OPTIMIZED;
1985 Set_Node_Offset_Length(optimize,0,0);
1988 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
1990 } /* end node insert */
1992 SvREFCNT_dec(TRIE_REVCHARMAP(trie));
1996 : trie->startstate>1
2002 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2004 /* The Trie is constructed and compressed now so we can build a fail array now if its needed
2006 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2007 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2010 We find the fail state for each state in the trie, this state is the longest proper
2011 suffix of the current states 'word' that is also a proper prefix of another word in our
2012 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2013 the DFA not to have to restart after its tried and failed a word at a given point, it
2014 simply continues as though it had been matching the other word in the first place.
2016 'abcdgu'=~/abcdefg|cdgu/
2017 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2018 fail, which would bring use to the state representing 'd' in the second word where we would
2019 try 'g' and succeed, prodceding to match 'cdgu'.
2021 /* add a fail transition */
2022 reg_trie_data *trie=(reg_trie_data *)RExC_rx->data->data[ARG(source)];
2024 const U32 ucharcount = trie->uniquecharcount;
2025 const U32 numstates = trie->statecount;
2026 const U32 ubound = trie->lasttrans + ucharcount;
2030 U32 base = trie->states[ 1 ].trans.base;
2033 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2034 GET_RE_DEBUG_FLAGS_DECL;
2036 PERL_UNUSED_ARG(depth);
2040 ARG_SET( stclass, data_slot );
2041 Newxz( aho, 1, reg_ac_data );
2042 RExC_rx->data->data[ data_slot ] = (void*)aho;
2044 aho->states=(reg_trie_state *)savepvn((const char*)trie->states,
2045 numstates * sizeof(reg_trie_state));
2046 Newxz( q, numstates, U32);
2047 Newxz( aho->fail, numstates, U32 );
2050 /* initialize fail[0..1] to be 1 so that we always have
2051 a valid final fail state */
2052 fail[ 0 ] = fail[ 1 ] = 1;
2054 for ( charid = 0; charid < ucharcount ; charid++ ) {
2055 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2057 q[ q_write ] = newstate;
2058 /* set to point at the root */
2059 fail[ q[ q_write++ ] ]=1;
2062 while ( q_read < q_write) {
2063 const U32 cur = q[ q_read++ % numstates ];
2064 base = trie->states[ cur ].trans.base;
2066 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2067 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2069 U32 fail_state = cur;
2072 fail_state = fail[ fail_state ];
2073 fail_base = aho->states[ fail_state ].trans.base;
2074 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2076 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2077 fail[ ch_state ] = fail_state;
2078 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2080 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2082 q[ q_write++ % numstates] = ch_state;
2086 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2087 when we fail in state 1, this allows us to use the
2088 charclass scan to find a valid start char. This is based on the principle
2089 that theres a good chance the string being searched contains lots of stuff
2090 that cant be a start char.
2092 fail[ 0 ] = fail[ 1 ] = 0;
2093 DEBUG_TRIE_COMPILE_r({
2094 PerlIO_printf(Perl_debug_log, "%*sStclass Failtable (%"UVuf" states): 0",
2095 (int)(depth * 2), "", numstates
2097 for( q_read=1; q_read<numstates; q_read++ ) {
2098 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2100 PerlIO_printf(Perl_debug_log, "\n");
2103 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2108 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2109 * These need to be revisited when a newer toolchain becomes available.
2111 #if defined(__sparc64__) && defined(__GNUC__)
2112 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2113 # undef SPARC64_GCC_WORKAROUND
2114 # define SPARC64_GCC_WORKAROUND 1
2118 #define DEBUG_PEEP(str,scan,depth) \
2119 DEBUG_OPTIMISE_r({ \
2120 SV * const mysv=sv_newmortal(); \
2121 regnode *Next = regnext(scan); \
2122 regprop(RExC_rx, mysv, scan); \
2123 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2124 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2125 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2132 #define JOIN_EXACT(scan,min,flags) \
2133 if (PL_regkind[OP(scan)] == EXACT) \
2134 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2137 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2138 /* Merge several consecutive EXACTish nodes into one. */
2139 regnode *n = regnext(scan);
2141 regnode *next = scan + NODE_SZ_STR(scan);
2145 regnode *stop = scan;
2146 GET_RE_DEBUG_FLAGS_DECL;
2148 PERL_UNUSED_ARG(depth);
2150 #ifndef EXPERIMENTAL_INPLACESCAN
2151 PERL_UNUSED_ARG(flags);
2152 PERL_UNUSED_ARG(val);
2154 DEBUG_PEEP("join",scan,depth);
2156 /* Skip NOTHING, merge EXACT*. */
2158 ( PL_regkind[OP(n)] == NOTHING ||
2159 (stringok && (OP(n) == OP(scan))))
2161 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2163 if (OP(n) == TAIL || n > next)
2165 if (PL_regkind[OP(n)] == NOTHING) {
2166 DEBUG_PEEP("skip:",n,depth);
2167 NEXT_OFF(scan) += NEXT_OFF(n);
2168 next = n + NODE_STEP_REGNODE;
2175 else if (stringok) {
2176 const unsigned int oldl = STR_LEN(scan);
2177 regnode * const nnext = regnext(n);
2179 DEBUG_PEEP("merg",n,depth);
2182 if (oldl + STR_LEN(n) > U8_MAX)
2184 NEXT_OFF(scan) += NEXT_OFF(n);
2185 STR_LEN(scan) += STR_LEN(n);
2186 next = n + NODE_SZ_STR(n);
2187 /* Now we can overwrite *n : */
2188 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2196 #ifdef EXPERIMENTAL_INPLACESCAN
2197 if (flags && !NEXT_OFF(n)) {
2198 DEBUG_PEEP("atch", val, depth);
2199 if (reg_off_by_arg[OP(n)]) {
2200 ARG_SET(n, val - n);
2203 NEXT_OFF(n) = val - n;
2210 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2212 Two problematic code points in Unicode casefolding of EXACT nodes:
2214 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2215 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2221 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2222 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2224 This means that in case-insensitive matching (or "loose matching",
2225 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2226 length of the above casefolded versions) can match a target string
2227 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2228 This would rather mess up the minimum length computation.
2230 What we'll do is to look for the tail four bytes, and then peek
2231 at the preceding two bytes to see whether we need to decrease
2232 the minimum length by four (six minus two).
2234 Thanks to the design of UTF-8, there cannot be false matches:
2235 A sequence of valid UTF-8 bytes cannot be a subsequence of
2236 another valid sequence of UTF-8 bytes.
2239 char * const s0 = STRING(scan), *s, *t;
2240 char * const s1 = s0 + STR_LEN(scan) - 1;
2241 char * const s2 = s1 - 4;
2242 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2243 const char t0[] = "\xaf\x49\xaf\x42";
2245 const char t0[] = "\xcc\x88\xcc\x81";
2247 const char * const t1 = t0 + 3;
2250 s < s2 && (t = ninstr(s, s1, t0, t1));
2253 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2254 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2256 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2257 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2265 n = scan + NODE_SZ_STR(scan);
2267 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2274 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2278 /* REx optimizer. Converts nodes into quickier variants "in place".
2279 Finds fixed substrings. */
2281 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2282 to the position after last scanned or to NULL. */
2284 #define INIT_AND_WITHP \
2285 assert(!and_withp); \
2286 Newx(and_withp,1,struct regnode_charclass_class); \
2287 SAVEFREEPV(and_withp)
2290 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2291 I32 *minlenp, I32 *deltap,
2296 struct regnode_charclass_class *and_withp,
2297 U32 flags, U32 depth)
2298 /* scanp: Start here (read-write). */
2299 /* deltap: Write maxlen-minlen here. */
2300 /* last: Stop before this one. */
2301 /* data: string data about the pattern */
2302 /* stopparen: treat close N as END */
2303 /* recursed: which subroutines have we recursed into */
2304 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2307 I32 min = 0, pars = 0, code;
2308 regnode *scan = *scanp, *next;
2310 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2311 int is_inf_internal = 0; /* The studied chunk is infinite */
2312 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2313 scan_data_t data_fake;
2314 SV *re_trie_maxbuff = NULL;
2315 regnode *first_non_open = scan;
2316 I32 stopmin = I32_MAX;
2317 GET_RE_DEBUG_FLAGS_DECL;
2319 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2323 while (first_non_open && OP(first_non_open) == OPEN)
2324 first_non_open=regnext(first_non_open);
2328 while (scan && OP(scan) != END && scan < last) {
2329 /* Peephole optimizer: */
2330 DEBUG_STUDYDATA(data,depth);
2331 DEBUG_PEEP("Peep",scan,depth);
2332 JOIN_EXACT(scan,&min,0);
2334 /* Follow the next-chain of the current node and optimize
2335 away all the NOTHINGs from it. */
2336 if (OP(scan) != CURLYX) {
2337 const int max = (reg_off_by_arg[OP(scan)]
2339 /* I32 may be smaller than U16 on CRAYs! */
2340 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2341 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2345 /* Skip NOTHING and LONGJMP. */
2346 while ((n = regnext(n))
2347 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2348 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2349 && off + noff < max)
2351 if (reg_off_by_arg[OP(scan)])
2354 NEXT_OFF(scan) = off;
2359 /* The principal pseudo-switch. Cannot be a switch, since we
2360 look into several different things. */
2361 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2362 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
2363 next = regnext(scan);
2365 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2367 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
2368 /* NOTE - There is similar code to this block below for handling
2369 TRIE nodes on a re-study. If you change stuff here check there
2371 I32 max1 = 0, min1 = I32_MAX, num = 0;
2372 struct regnode_charclass_class accum;
2373 regnode * const startbranch=scan;
2375 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
2376 scan_commit(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2377 if (flags & SCF_DO_STCLASS)
2378 cl_init_zero(pRExC_state, &accum);
2380 while (OP(scan) == code) {
2381 I32 deltanext, minnext, f = 0, fake;
2382 struct regnode_charclass_class this_class;
2385 data_fake.flags = 0;
2387 data_fake.whilem_c = data->whilem_c;
2388 data_fake.last_closep = data->last_closep;
2391 data_fake.last_closep = &fake;
2392 next = regnext(scan);
2393 scan = NEXTOPER(scan);
2395 scan = NEXTOPER(scan);
2396 if (flags & SCF_DO_STCLASS) {
2397 cl_init(pRExC_state, &this_class);
2398 data_fake.start_class = &this_class;
2399 f = SCF_DO_STCLASS_AND;
2401 if (flags & SCF_WHILEM_VISITED_POS)
2402 f |= SCF_WHILEM_VISITED_POS;
2404 /* we suppose the run is continuous, last=next...*/
2405 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2407 stopparen, recursed, NULL, f,depth+1);
2410 if (max1 < minnext + deltanext)
2411 max1 = minnext + deltanext;
2412 if (deltanext == I32_MAX)
2413 is_inf = is_inf_internal = 1;
2415 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2417 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2418 if ( stopmin > minnext)
2419 stopmin = min + min1;
2420 flags &= ~SCF_DO_SUBSTR;
2422 data->flags |= SCF_SEEN_ACCEPT;
2425 if (data_fake.flags & SF_HAS_EVAL)
2426 data->flags |= SF_HAS_EVAL;
2427 data->whilem_c = data_fake.whilem_c;
2429 if (flags & SCF_DO_STCLASS)
2430 cl_or(pRExC_state, &accum, &this_class);
2431 if (code == SUSPEND)
2434 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2436 if (flags & SCF_DO_SUBSTR) {
2437 data->pos_min += min1;
2438 data->pos_delta += max1 - min1;
2439 if (max1 != min1 || is_inf)
2440 data->longest = &(data->longest_float);
2443 delta += max1 - min1;
2444 if (flags & SCF_DO_STCLASS_OR) {
2445 cl_or(pRExC_state, data->start_class, &accum);
2447 cl_and(data->start_class, and_withp);
2448 flags &= ~SCF_DO_STCLASS;
2451 else if (flags & SCF_DO_STCLASS_AND) {
2453 cl_and(data->start_class, &accum);
2454 flags &= ~SCF_DO_STCLASS;
2457 /* Switch to OR mode: cache the old value of
2458 * data->start_class */
2460 StructCopy(data->start_class, and_withp,
2461 struct regnode_charclass_class);
2462 flags &= ~SCF_DO_STCLASS_AND;
2463 StructCopy(&accum, data->start_class,
2464 struct regnode_charclass_class);
2465 flags |= SCF_DO_STCLASS_OR;
2466 data->start_class->flags |= ANYOF_EOS;
2470 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2473 Assuming this was/is a branch we are dealing with: 'scan' now
2474 points at the item that follows the branch sequence, whatever
2475 it is. We now start at the beginning of the sequence and look
2482 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2484 If we can find such a subseqence we need to turn the first
2485 element into a trie and then add the subsequent branch exact
2486 strings to the trie.
2490 1. patterns where the whole set of branch can be converted.
2492 2. patterns where only a subset can be converted.
2494 In case 1 we can replace the whole set with a single regop
2495 for the trie. In case 2 we need to keep the start and end
2498 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2499 becomes BRANCH TRIE; BRANCH X;
2501 There is an additional case, that being where there is a
2502 common prefix, which gets split out into an EXACT like node
2503 preceding the TRIE node.
2505 If x(1..n)==tail then we can do a simple trie, if not we make
2506 a "jump" trie, such that when we match the appropriate word
2507 we "jump" to the appopriate tail node. Essentailly we turn
2508 a nested if into a case structure of sorts.
2513 if (!re_trie_maxbuff) {
2514 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2515 if (!SvIOK(re_trie_maxbuff))
2516 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2518 if ( SvIV(re_trie_maxbuff)>=0 ) {
2520 regnode *first = (regnode *)NULL;
2521 regnode *last = (regnode *)NULL;
2522 regnode *tail = scan;
2527 SV * const mysv = sv_newmortal(); /* for dumping */
2529 /* var tail is used because there may be a TAIL
2530 regop in the way. Ie, the exacts will point to the
2531 thing following the TAIL, but the last branch will
2532 point at the TAIL. So we advance tail. If we
2533 have nested (?:) we may have to move through several
2537 while ( OP( tail ) == TAIL ) {
2538 /* this is the TAIL generated by (?:) */
2539 tail = regnext( tail );
2544 regprop(RExC_rx, mysv, tail );
2545 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2546 (int)depth * 2 + 2, "",
2547 "Looking for TRIE'able sequences. Tail node is: ",
2548 SvPV_nolen_const( mysv )
2554 step through the branches, cur represents each
2555 branch, noper is the first thing to be matched
2556 as part of that branch and noper_next is the
2557 regnext() of that node. if noper is an EXACT
2558 and noper_next is the same as scan (our current
2559 position in the regex) then the EXACT branch is
2560 a possible optimization target. Once we have
2561 two or more consequetive such branches we can
2562 create a trie of the EXACT's contents and stich
2563 it in place. If the sequence represents all of
2564 the branches we eliminate the whole thing and
2565 replace it with a single TRIE. If it is a
2566 subsequence then we need to stitch it in. This
2567 means the first branch has to remain, and needs
2568 to be repointed at the item on the branch chain
2569 following the last branch optimized. This could
2570 be either a BRANCH, in which case the
2571 subsequence is internal, or it could be the
2572 item following the branch sequence in which
2573 case the subsequence is at the end.
2577 /* dont use tail as the end marker for this traverse */
2578 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2579 regnode * const noper = NEXTOPER( cur );
2580 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
2581 regnode * const noper_next = regnext( noper );
2585 regprop(RExC_rx, mysv, cur);
2586 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2587 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2589 regprop(RExC_rx, mysv, noper);
2590 PerlIO_printf( Perl_debug_log, " -> %s",
2591 SvPV_nolen_const(mysv));
2594 regprop(RExC_rx, mysv, noper_next );
2595 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2596 SvPV_nolen_const(mysv));
2598 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2599 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2601 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2602 : PL_regkind[ OP( noper ) ] == EXACT )
2603 || OP(noper) == NOTHING )
2605 && noper_next == tail
2610 if ( !first || optype == NOTHING ) {
2611 if (!first) first = cur;
2612 optype = OP( noper );
2618 make_trie( pRExC_state,
2619 startbranch, first, cur, tail, count,
2622 if ( PL_regkind[ OP( noper ) ] == EXACT
2624 && noper_next == tail
2629 optype = OP( noper );
2639 regprop(RExC_rx, mysv, cur);
2640 PerlIO_printf( Perl_debug_log,
2641 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2642 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2646 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
2647 #ifdef TRIE_STUDY_OPT
2648 if ( ((made == MADE_EXACT_TRIE &&
2649 startbranch == first)
2650 || ( first_non_open == first )) &&
2652 flags |= SCF_TRIE_RESTUDY;
2660 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2661 scan = NEXTOPER(NEXTOPER(scan));
2662 } else /* single branch is optimized. */
2663 scan = NEXTOPER(scan);
2666 else if (OP(scan) == EXACT) {
2667 I32 l = STR_LEN(scan);
2670 const U8 * const s = (U8*)STRING(scan);
2671 l = utf8_length(s, s + l);
2672 uc = utf8_to_uvchr(s, NULL);
2674 uc = *((U8*)STRING(scan));
2677 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2678 /* The code below prefers earlier match for fixed
2679 offset, later match for variable offset. */
2680 if (data->last_end == -1) { /* Update the start info. */
2681 data->last_start_min = data->pos_min;
2682 data->last_start_max = is_inf
2683 ? I32_MAX : data->pos_min + data->pos_delta;
2685 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2687 SvUTF8_on(data->last_found);
2689 SV * const sv = data->last_found;
2690 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2691 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2692 if (mg && mg->mg_len >= 0)
2693 mg->mg_len += utf8_length((U8*)STRING(scan),
2694 (U8*)STRING(scan)+STR_LEN(scan));
2696 data->last_end = data->pos_min + l;
2697 data->pos_min += l; /* As in the first entry. */
2698 data->flags &= ~SF_BEFORE_EOL;
2700 if (flags & SCF_DO_STCLASS_AND) {
2701 /* Check whether it is compatible with what we know already! */
2705 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2706 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2707 && (!(data->start_class->flags & ANYOF_FOLD)
2708 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2711 ANYOF_CLASS_ZERO(data->start_class);
2712 ANYOF_BITMAP_ZERO(data->start_class);
2714 ANYOF_BITMAP_SET(data->start_class, uc);
2715 data->start_class->flags &= ~ANYOF_EOS;
2717 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2719 else if (flags & SCF_DO_STCLASS_OR) {
2720 /* false positive possible if the class is case-folded */
2722 ANYOF_BITMAP_SET(data->start_class, uc);
2724 data->start_class->flags |= ANYOF_UNICODE_ALL;
2725 data->start_class->flags &= ~ANYOF_EOS;
2726 cl_and(data->start_class, and_withp);
2728 flags &= ~SCF_DO_STCLASS;
2730 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
2731 I32 l = STR_LEN(scan);
2732 UV uc = *((U8*)STRING(scan));
2734 /* Search for fixed substrings supports EXACT only. */
2735 if (flags & SCF_DO_SUBSTR) {
2737 scan_commit(pRExC_state, data, minlenp);
2740 const U8 * const s = (U8 *)STRING(scan);
2741 l = utf8_length(s, s + l);
2742 uc = utf8_to_uvchr(s, NULL);
2745 if (flags & SCF_DO_SUBSTR)
2747 if (flags & SCF_DO_STCLASS_AND) {
2748 /* Check whether it is compatible with what we know already! */
2752 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2753 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2754 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2756 ANYOF_CLASS_ZERO(data->start_class);
2757 ANYOF_BITMAP_ZERO(data->start_class);
2759 ANYOF_BITMAP_SET(data->start_class, uc);
2760 data->start_class->flags &= ~ANYOF_EOS;
2761 data->start_class->flags |= ANYOF_FOLD;
2762 if (OP(scan) == EXACTFL)
2763 data->start_class->flags |= ANYOF_LOCALE;
2766 else if (flags & SCF_DO_STCLASS_OR) {
2767 if (data->start_class->flags & ANYOF_FOLD) {
2768 /* false positive possible if the class is case-folded.
2769 Assume that the locale settings are the same... */
2771 ANYOF_BITMAP_SET(data->start_class, uc);
2772 data->start_class->flags &= ~ANYOF_EOS;
2774 cl_and(data->start_class, and_withp);
2776 flags &= ~SCF_DO_STCLASS;
2778 else if (strchr((const char*)PL_varies,OP(scan))) {
2779 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2780 I32 f = flags, pos_before = 0;
2781 regnode * const oscan = scan;
2782 struct regnode_charclass_class this_class;
2783 struct regnode_charclass_class *oclass = NULL;
2784 I32 next_is_eval = 0;
2786 switch (PL_regkind[OP(scan)]) {
2787 case WHILEM: /* End of (?:...)* . */
2788 scan = NEXTOPER(scan);
2791 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2792 next = NEXTOPER(scan);
2793 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2795 maxcount = REG_INFTY;
2796 next = regnext(scan);
2797 scan = NEXTOPER(scan);
2801 if (flags & SCF_DO_SUBSTR)
2806 if (flags & SCF_DO_STCLASS) {
2808 maxcount = REG_INFTY;
2809 next = regnext(scan);
2810 scan = NEXTOPER(scan);
2813 is_inf = is_inf_internal = 1;
2814 scan = regnext(scan);
2815 if (flags & SCF_DO_SUBSTR) {
2816 scan_commit(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
2817 data->longest = &(data->longest_float);
2819 goto optimize_curly_tail;
2821 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
2822 && (scan->flags == stopparen))
2827 mincount = ARG1(scan);
2828 maxcount = ARG2(scan);
2830 next = regnext(scan);
2831 if (OP(scan) == CURLYX) {
2832 I32 lp = (data ? *(data->last_closep) : 0);
2833 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
2835 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2836 next_is_eval = (OP(scan) == EVAL);
2838 if (flags & SCF_DO_SUBSTR) {
2839 if (mincount == 0) scan_commit(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
2840 pos_before = data->pos_min;
2844 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2846 data->flags |= SF_IS_INF;
2848 if (flags & SCF_DO_STCLASS) {
2849 cl_init(pRExC_state, &this_class);
2850 oclass = data->start_class;
2851 data->start_class = &this_class;
2852 f |= SCF_DO_STCLASS_AND;
2853 f &= ~SCF_DO_STCLASS_OR;
2855 /* These are the cases when once a subexpression
2856 fails at a particular position, it cannot succeed
2857 even after backtracking at the enclosing scope.
2859 XXXX what if minimal match and we are at the
2860 initial run of {n,m}? */
2861 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2862 f &= ~SCF_WHILEM_VISITED_POS;
2864 /* This will finish on WHILEM, setting scan, or on NULL: */
2865 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2866 last, data, stopparen, recursed, NULL,
2868 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
2870 if (flags & SCF_DO_STCLASS)
2871 data->start_class = oclass;
2872 if (mincount == 0 || minnext == 0) {
2873 if (flags & SCF_DO_STCLASS_OR) {
2874 cl_or(pRExC_state, data->start_class, &this_class);
2876 else if (flags & SCF_DO_STCLASS_AND) {
2877 /* Switch to OR mode: cache the old value of
2878 * data->start_class */
2880 StructCopy(data->start_class, and_withp,
2881 struct regnode_charclass_class);
2882 flags &= ~SCF_DO_STCLASS_AND;
2883 StructCopy(&this_class, data->start_class,
2884 struct regnode_charclass_class);
2885 flags |= SCF_DO_STCLASS_OR;
2886 data->start_class->flags |= ANYOF_EOS;
2888 } else { /* Non-zero len */
2889 if (flags & SCF_DO_STCLASS_OR) {
2890 cl_or(pRExC_state, data->start_class, &this_class);
2891 cl_and(data->start_class, and_withp);
2893 else if (flags & SCF_DO_STCLASS_AND)
2894 cl_and(data->start_class, &this_class);
2895 flags &= ~SCF_DO_STCLASS;
2897 if (!scan) /* It was not CURLYX, but CURLY. */
2899 if ( /* ? quantifier ok, except for (?{ ... }) */
2900 (next_is_eval || !(mincount == 0 && maxcount == 1))
2901 && (minnext == 0) && (deltanext == 0)
2902 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
2903 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2904 && ckWARN(WARN_REGEXP))
2907 "Quantifier unexpected on zero-length expression");
2910 min += minnext * mincount;
2911 is_inf_internal |= ((maxcount == REG_INFTY
2912 && (minnext + deltanext) > 0)
2913 || deltanext == I32_MAX);
2914 is_inf |= is_inf_internal;
2915 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2917 /* Try powerful optimization CURLYX => CURLYN. */
2918 if ( OP(oscan) == CURLYX && data
2919 && data->flags & SF_IN_PAR
2920 && !(data->flags & SF_HAS_EVAL)
2921 && !deltanext && minnext == 1 ) {
2922 /* Try to optimize to CURLYN. */
2923 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
2924 regnode * const nxt1 = nxt;
2931 if (!strchr((const char*)PL_simple,OP(nxt))
2932 && !(PL_regkind[OP(nxt)] == EXACT
2933 && STR_LEN(nxt) == 1))
2939 if (OP(nxt) != CLOSE)
2941 if (RExC_open_parens) {
2942 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
2943 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
2945 /* Now we know that nxt2 is the only contents: */
2946 oscan->flags = (U8)ARG(nxt);
2948 OP(nxt1) = NOTHING; /* was OPEN. */
2951 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2952 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2953 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2954 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2955 OP(nxt + 1) = OPTIMIZED; /* was count. */
2956 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
2961 /* Try optimization CURLYX => CURLYM. */
2962 if ( OP(oscan) == CURLYX && data
2963 && !(data->flags & SF_HAS_PAR)
2964 && !(data->flags & SF_HAS_EVAL)
2965 && !deltanext /* atom is fixed width */
2966 && minnext != 0 /* CURLYM can't handle zero width */
2968 /* XXXX How to optimize if data == 0? */
2969 /* Optimize to a simpler form. */
2970 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2974 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
2975 && (OP(nxt2) != WHILEM))
2977 OP(nxt2) = SUCCEED; /* Whas WHILEM */
2978 /* Need to optimize away parenths. */
2979 if (data->flags & SF_IN_PAR) {
2980 /* Set the parenth number. */
2981 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2983 if (OP(nxt) != CLOSE)
2984 FAIL("Panic opt close");
2985 oscan->flags = (U8)ARG(nxt);
2986 if (RExC_open_parens) {
2987 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
2988 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
2990 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2991 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2994 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2995 OP(nxt + 1) = OPTIMIZED; /* was count. */
2996 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2997 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
3000 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3001 regnode *nnxt = regnext(nxt1);
3004 if (reg_off_by_arg[OP(nxt1)])
3005 ARG_SET(nxt1, nxt2 - nxt1);
3006 else if (nxt2 - nxt1 < U16_MAX)
3007 NEXT_OFF(nxt1) = nxt2 - nxt1;
3009 OP(nxt) = NOTHING; /* Cannot beautify */
3014 /* Optimize again: */
3015 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3016 NULL, stopparen, recursed, NULL, 0,depth+1);
3021 else if ((OP(oscan) == CURLYX)
3022 && (flags & SCF_WHILEM_VISITED_POS)
3023 /* See the comment on a similar expression above.
3024 However, this time it not a subexpression
3025 we care about, but the expression itself. */
3026 && (maxcount == REG_INFTY)
3027 && data && ++data->whilem_c < 16) {
3028 /* This stays as CURLYX, we can put the count/of pair. */
3029 /* Find WHILEM (as in regexec.c) */
3030 regnode *nxt = oscan + NEXT_OFF(oscan);
3032 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3034 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3035 | (RExC_whilem_seen << 4)); /* On WHILEM */
3037 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3039 if (flags & SCF_DO_SUBSTR) {
3040 SV *last_str = NULL;
3041 int counted = mincount != 0;
3043 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3044 #if defined(SPARC64_GCC_WORKAROUND)
3047 const char *s = NULL;
3050 if (pos_before >= data->last_start_min)
3053 b = data->last_start_min;
3056 s = SvPV_const(data->last_found, l);
3057 old = b - data->last_start_min;
3060 I32 b = pos_before >= data->last_start_min
3061 ? pos_before : data->last_start_min;
3063 const char * const s = SvPV_const(data->last_found, l);
3064 I32 old = b - data->last_start_min;
3068 old = utf8_hop((U8*)s, old) - (U8*)s;
3071 /* Get the added string: */
3072 last_str = newSVpvn(s + old, l);
3074 SvUTF8_on(last_str);
3075 if (deltanext == 0 && pos_before == b) {
3076 /* What was added is a constant string */
3078 SvGROW(last_str, (mincount * l) + 1);
3079 repeatcpy(SvPVX(last_str) + l,
3080 SvPVX_const(last_str), l, mincount - 1);
3081 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3082 /* Add additional parts. */
3083 SvCUR_set(data->last_found,
3084 SvCUR(data->last_found) - l);
3085 sv_catsv(data->last_found, last_str);
3087 SV * sv = data->last_found;
3089 SvUTF8(sv) && SvMAGICAL(sv) ?
3090 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3091 if (mg && mg->mg_len >= 0)
3092 mg->mg_len += CHR_SVLEN(last_str);
3094 data->last_end += l * (mincount - 1);
3097 /* start offset must point into the last copy */
3098 data->last_start_min += minnext * (mincount - 1);
3099 data->last_start_max += is_inf ? I32_MAX
3100 : (maxcount - 1) * (minnext + data->pos_delta);
3103 /* It is counted once already... */
3104 data->pos_min += minnext * (mincount - counted);
3105 data->pos_delta += - counted * deltanext +
3106 (minnext + deltanext) * maxcount - minnext * mincount;
3107 if (mincount != maxcount) {
3108 /* Cannot extend fixed substrings found inside
3110 scan_commit(pRExC_state,data,minlenp);
3111 if (mincount && last_str) {
3112 SV * const sv = data->last_found;
3113 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3114 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3118 sv_setsv(sv, last_str);
3119 data->last_end = data->pos_min;
3120 data->last_start_min =
3121 data->pos_min - CHR_SVLEN(last_str);
3122 data->last_start_max = is_inf
3124 : data->pos_min + data->pos_delta
3125 - CHR_SVLEN(last_str);
3127 data->longest = &(data->longest_float);
3129 SvREFCNT_dec(last_str);
3131 if (data && (fl & SF_HAS_EVAL))
3132 data->flags |= SF_HAS_EVAL;
3133 optimize_curly_tail:
3134 if (OP(oscan) != CURLYX) {
3135 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3137 NEXT_OFF(oscan) += NEXT_OFF(next);
3140 default: /* REF and CLUMP only? */
3141 if (flags & SCF_DO_SUBSTR) {
3142 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
3143 data->longest = &(data->longest_float);
3145 is_inf = is_inf_internal = 1;
3146 if (flags & SCF_DO_STCLASS_OR)
3147 cl_anything(pRExC_state, data->start_class);
3148 flags &= ~SCF_DO_STCLASS;
3152 else if (strchr((const char*)PL_simple,OP(scan))) {
3155 if (flags & SCF_DO_SUBSTR) {
3156 scan_commit(pRExC_state,data,minlenp);
3160 if (flags & SCF_DO_STCLASS) {
3161 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3163 /* Some of the logic below assumes that switching
3164 locale on will only add false positives. */
3165 switch (PL_regkind[OP(scan)]) {
3169 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3170 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3171 cl_anything(pRExC_state, data->start_class);
3174 if (OP(scan) == SANY)
3176 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3177 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3178 || (data->start_class->flags & ANYOF_CLASS));
3179 cl_anything(pRExC_state, data->start_class);
3181 if (flags & SCF_DO_STCLASS_AND || !value)
3182 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3185 if (flags & SCF_DO_STCLASS_AND)
3186 cl_and(data->start_class,
3187 (struct regnode_charclass_class*)scan);
3189 cl_or(pRExC_state, data->start_class,
3190 (struct regnode_charclass_class*)scan);
3193 if (flags & SCF_DO_STCLASS_AND) {
3194 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3195 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3196 for (value = 0; value < 256; value++)
3197 if (!isALNUM(value))
3198 ANYOF_BITMAP_CLEAR(data->start_class, value);
3202 if (data->start_class->flags & ANYOF_LOCALE)
3203 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3205 for (value = 0; value < 256; value++)
3207 ANYOF_BITMAP_SET(data->start_class, value);
3212 if (flags & SCF_DO_STCLASS_AND) {
3213 if (data->start_class->flags & ANYOF_LOCALE)
3214 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3217 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3218 data->start_class->flags |= ANYOF_LOCALE;
3222 if (flags & SCF_DO_STCLASS_AND) {
3223 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3224 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3225 for (value = 0; value < 256; value++)
3227 ANYOF_BITMAP_CLEAR(data->start_class, value);
3231 if (data->start_class->flags & ANYOF_LOCALE)
3232 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3234 for (value = 0; value < 256; value++)
3235 if (!isALNUM(value))
3236 ANYOF_BITMAP_SET(data->start_class, value);
3241 if (flags & SCF_DO_STCLASS_AND) {
3242 if (data->start_class->flags & ANYOF_LOCALE)
3243 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3246 data->start_class->flags |= ANYOF_LOCALE;
3247 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3251 if (flags & SCF_DO_STCLASS_AND) {
3252 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3253 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3254 for (value = 0; value < 256; value++)
3255 if (!isSPACE(value))
3256 ANYOF_BITMAP_CLEAR(data->start_class, value);
3260 if (data->start_class->flags & ANYOF_LOCALE)
3261 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3263 for (value = 0; value < 256; value++)
3265 ANYOF_BITMAP_SET(data->start_class, value);
3270 if (flags & SCF_DO_STCLASS_AND) {
3271 if (data->start_class->flags & ANYOF_LOCALE)
3272 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3275 data->start_class->flags |= ANYOF_LOCALE;
3276 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3280 if (flags & SCF_DO_STCLASS_AND) {
3281 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3282 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3283 for (value = 0; value < 256; value++)
3285 ANYOF_BITMAP_CLEAR(data->start_class, value);
3289 if (data->start_class->flags & ANYOF_LOCALE)
3290 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3292 for (value = 0; value < 256; value++)
3293 if (!isSPACE(value))
3294 ANYOF_BITMAP_SET(data->start_class, value);
3299 if (flags & SCF_DO_STCLASS_AND) {
3300 if (data->start_class->flags & ANYOF_LOCALE) {
3301 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3302 for (value = 0; value < 256; value++)
3303 if (!isSPACE(value))
3304 ANYOF_BITMAP_CLEAR(data->start_class, value);
3308 data->start_class->flags |= ANYOF_LOCALE;
3309 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3313 if (flags & SCF_DO_STCLASS_AND) {
3314 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3315 for (value = 0; value < 256; value++)
3316 if (!isDIGIT(value))
3317 ANYOF_BITMAP_CLEAR(data->start_class, value);
3320 if (data->start_class->flags & ANYOF_LOCALE)
3321 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3323 for (value = 0; value < 256; value++)
3325 ANYOF_BITMAP_SET(data->start_class, value);
3330 if (flags & SCF_DO_STCLASS_AND) {
3331 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3332 for (value = 0; value < 256; value++)
3334 ANYOF_BITMAP_CLEAR(data->start_class, value);
3337 if (data->start_class->flags & ANYOF_LOCALE)
3338 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3340 for (value = 0; value < 256; value++)
3341 if (!isDIGIT(value))
3342 ANYOF_BITMAP_SET(data->start_class, value);
3347 if (flags & SCF_DO_STCLASS_OR)
3348 cl_and(data->start_class, and_withp);
3349 flags &= ~SCF_DO_STCLASS;
3352 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3353 data->flags |= (OP(scan) == MEOL
3357 else if ( PL_regkind[OP(scan)] == BRANCHJ
3358 /* Lookbehind, or need to calculate parens/evals/stclass: */
3359 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3360 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3361 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3362 || OP(scan) == UNLESSM )
3364 /* Negative Lookahead/lookbehind
3365 In this case we can't do fixed string optimisation.
3368 I32 deltanext, minnext, fake = 0;
3370 struct regnode_charclass_class intrnl;
3373 data_fake.flags = 0;
3375 data_fake.whilem_c = data->whilem_c;
3376 data_fake.last_closep = data->last_closep;
3379 data_fake.last_closep = &fake;
3380 if ( flags & SCF_DO_STCLASS && !scan->flags
3381 && OP(scan) == IFMATCH ) { /* Lookahead */
3382 cl_init(pRExC_state, &intrnl);
3383 data_fake.start_class = &intrnl;
3384 f |= SCF_DO_STCLASS_AND;
3386 if (flags & SCF_WHILEM_VISITED_POS)
3387 f |= SCF_WHILEM_VISITED_POS;
3388 next = regnext(scan);
3389 nscan = NEXTOPER(NEXTOPER(scan));
3390 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3391 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3394 vFAIL("Variable length lookbehind not implemented");
3396 else if (minnext > (I32)U8_MAX) {
3397 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3399 scan->flags = (U8)minnext;
3402 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3404 if (data_fake.flags & SF_HAS_EVAL)
3405 data->flags |= SF_HAS_EVAL;
3406 data->whilem_c = data_fake.whilem_c;
3408 if (f & SCF_DO_STCLASS_AND) {
3409 const int was = (data->start_class->flags & ANYOF_EOS);
3411 cl_and(data->start_class, &intrnl);
3413 data->start_class->flags |= ANYOF_EOS;
3416 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3418 /* Positive Lookahead/lookbehind
3419 In this case we can do fixed string optimisation,
3420 but we must be careful about it. Note in the case of
3421 lookbehind the positions will be offset by the minimum
3422 length of the pattern, something we won't know about
3423 until after the recurse.
3425 I32 deltanext, fake = 0;
3427 struct regnode_charclass_class intrnl;
3429 /* We use SAVEFREEPV so that when the full compile
3430 is finished perl will clean up the allocated
3431 minlens when its all done. This was we don't
3432 have to worry about freeing them when we know
3433 they wont be used, which would be a pain.
3436 Newx( minnextp, 1, I32 );
3437 SAVEFREEPV(minnextp);
3440 StructCopy(data, &data_fake, scan_data_t);
3441 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3444 scan_commit(pRExC_state, &data_fake,minlenp);
3445 data_fake.last_found=newSVsv(data->last_found);
3449 data_fake.last_closep = &fake;
3450 data_fake.flags = 0;
3452 data_fake.flags |= SF_IS_INF;
3453 if ( flags & SCF_DO_STCLASS && !scan->flags
3454 && OP(scan) == IFMATCH ) { /* Lookahead */
3455 cl_init(pRExC_state, &intrnl);
3456 data_fake.start_class = &intrnl;
3457 f |= SCF_DO_STCLASS_AND;
3459 if (flags & SCF_WHILEM_VISITED_POS)
3460 f |= SCF_WHILEM_VISITED_POS;
3461 next = regnext(scan);
3462 nscan = NEXTOPER(NEXTOPER(scan));
3464 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3465 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3468 vFAIL("Variable length lookbehind not implemented");
3470 else if (*minnextp > (I32)U8_MAX) {
3471 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3473 scan->flags = (U8)*minnextp;
3478 if (f & SCF_DO_STCLASS_AND) {
3479 const int was = (data->start_class->flags & ANYOF_EOS);
3481 cl_and(data->start_class, &intrnl);
3483 data->start_class->flags |= ANYOF_EOS;
3486 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3488 if (data_fake.flags & SF_HAS_EVAL)
3489 data->flags |= SF_HAS_EVAL;
3490 data->whilem_c = data_fake.whilem_c;
3491 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3492 if (RExC_rx->minlen<*minnextp)
3493 RExC_rx->minlen=*minnextp;
3494 scan_commit(pRExC_state, &data_fake, minnextp);
3495 SvREFCNT_dec(data_fake.last_found);
3497 if ( data_fake.minlen_fixed != minlenp )
3499 data->offset_fixed= data_fake.offset_fixed;
3500 data->minlen_fixed= data_fake.minlen_fixed;
3501 data->lookbehind_fixed+= scan->flags;
3503 if ( data_fake.minlen_float != minlenp )
3505 data->minlen_float= data_fake.minlen_float;
3506 data->offset_float_min=data_fake.offset_float_min;
3507 data->offset_float_max=data_fake.offset_float_max;
3508 data->lookbehind_float+= scan->flags;
3517 else if (OP(scan) == OPEN) {
3518 if (stopparen != (I32)ARG(scan))
3521 else if (OP(scan) == CLOSE) {
3522 if (stopparen == (I32)ARG(scan)) {
3525 if ((I32)ARG(scan) == is_par) {
3526 next = regnext(scan);
3528 if ( next && (OP(next) != WHILEM) && next < last)
3529 is_par = 0; /* Disable optimization */
3532 *(data->last_closep) = ARG(scan);
3534 else if (OP(scan) == GOSUB || OP(scan) == GOSTART) {
3535 /* set the pointer */
3539 if (OP(scan) == GOSUB) {
3541 RExC_recurse[ARG2L(scan)] = scan;
3542 start = RExC_open_parens[paren-1];
3543 end = RExC_close_parens[paren-1];
3546 start = RExC_rx->program + 1;
3552 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3553 SAVEFREEPV(recursed);
3555 if (!PAREN_TEST(recursed,paren+1)) {
3557 PAREN_SET(recursed,paren+1);
3559 DEBUG_PEEP("goto",start,depth);
3572 if (deltanext == I32_MAX) {
3573 is_inf = is_inf_internal = 1;
3576 DEBUG_PEEP("rtrn",end,depth);
3577 PAREN_UNSET(recursed,paren+1);
3579 if (flags & SCF_DO_SUBSTR) {
3580 scan_commit(pRExC_state,data,minlenp);
3581 data->longest = &(data->longest_float);
3583 is_inf = is_inf_internal = 1;
3584 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3585 cl_anything(pRExC_state, data->start_class);
3586 flags &= ~SCF_DO_STCLASS;
3589 else if (OP(scan) == EVAL) {
3591 data->flags |= SF_HAS_EVAL;
3593 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3594 if (flags & SCF_DO_SUBSTR) {
3595 scan_commit(pRExC_state,data,minlenp);
3596 flags &= ~SCF_DO_SUBSTR;
3598 if (data && OP(scan)==ACCEPT) {
3599 data->flags |= SCF_SEEN_ACCEPT;
3604 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3606 if (flags & SCF_DO_SUBSTR) {
3607 scan_commit(pRExC_state,data,minlenp);
3608 data->longest = &(data->longest_float);
3610 is_inf = is_inf_internal = 1;
3611 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3612 cl_anything(pRExC_state, data->start_class);
3613 flags &= ~SCF_DO_STCLASS;
3615 #ifdef TRIE_STUDY_OPT
3616 #ifdef FULL_TRIE_STUDY
3617 else if (PL_regkind[OP(scan)] == TRIE) {
3618 /* NOTE - There is similar code to this block above for handling
3619 BRANCH nodes on the initial study. If you change stuff here
3621 regnode *trie_node= scan;
3622 regnode *tail= regnext(scan);
3623 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3624 I32 max1 = 0, min1 = I32_MAX;
3625 struct regnode_charclass_class accum;
3627 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
3628 scan_commit(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
3629 if (flags & SCF_DO_STCLASS)
3630 cl_init_zero(pRExC_state, &accum);
3636 const regnode *nextbranch= NULL;
3639 for ( word=1 ; word <= trie->wordcount ; word++)
3641 I32 deltanext=0, minnext=0, f = 0, fake;
3642 struct regnode_charclass_class this_class;
3644 data_fake.flags = 0;
3646 data_fake.whilem_c = data->whilem_c;
3647 data_fake.last_closep = data->last_closep;
3650 data_fake.last_closep = &fake;
3652 if (flags & SCF_DO_STCLASS) {
3653 cl_init(pRExC_state, &this_class);
3654 data_fake.start_class = &this_class;
3655 f = SCF_DO_STCLASS_AND;
3657 if (flags & SCF_WHILEM_VISITED_POS)
3658 f |= SCF_WHILEM_VISITED_POS;
3660 if (trie->jump[word]) {
3662 nextbranch = trie_node + trie->jump[0];
3663 scan= trie_node + trie->jump[word];
3664 /* We go from the jump point to the branch that follows
3665 it. Note this means we need the vestigal unused branches
3666 even though they arent otherwise used.
3668 minnext = study_chunk(pRExC_state, &scan, minlenp,
3669 &deltanext, (regnode *)nextbranch, &data_fake,
3670 stopparen, recursed, NULL, f,depth+1);
3672 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3673 nextbranch= regnext((regnode*)nextbranch);
3675 if (min1 > (I32)(minnext + trie->minlen))
3676 min1 = minnext + trie->minlen;
3677 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3678 max1 = minnext + deltanext + trie->maxlen;
3679 if (deltanext == I32_MAX)
3680 is_inf = is_inf_internal = 1;
3682 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3684 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3685 if ( stopmin > min + min1)
3686 stopmin = min + min1;
3687 flags &= ~SCF_DO_SUBSTR;
3689 data->flags |= SCF_SEEN_ACCEPT;
3692 if (data_fake.flags & SF_HAS_EVAL)
3693 data->flags |= SF_HAS_EVAL;
3694 data->whilem_c = data_fake.whilem_c;
3696 if (flags & SCF_DO_STCLASS)
3697 cl_or(pRExC_state, &accum, &this_class);
3700 if (flags & SCF_DO_SUBSTR) {
3701 data->pos_min += min1;
3702 data->pos_delta += max1 - min1;
3703 if (max1 != min1 || is_inf)
3704 data->longest = &(data->longest_float);
3707 delta += max1 - min1;
3708 if (flags & SCF_DO_STCLASS_OR) {
3709 cl_or(pRExC_state, data->start_class, &accum);
3711 cl_and(data->start_class, and_withp);
3712 flags &= ~SCF_DO_STCLASS;
3715 else if (flags & SCF_DO_STCLASS_AND) {
3717 cl_and(data->start_class, &accum);
3718 flags &= ~SCF_DO_STCLASS;
3721 /* Switch to OR mode: cache the old value of
3722 * data->start_class */
3724 StructCopy(data->start_class, and_withp,
3725 struct regnode_charclass_class);
3726 flags &= ~SCF_DO_STCLASS_AND;
3727 StructCopy(&accum, data->start_class,
3728 struct regnode_charclass_class);
3729 flags |= SCF_DO_STCLASS_OR;
3730 data->start_class->flags |= ANYOF_EOS;
3737 else if (PL_regkind[OP(scan)] == TRIE) {
3738 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3741 min += trie->minlen;
3742 delta += (trie->maxlen - trie->minlen);
3743 flags &= ~SCF_DO_STCLASS; /* xxx */
3744 if (flags & SCF_DO_SUBSTR) {
3745 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
3746 data->pos_min += trie->minlen;
3747 data->pos_delta += (trie->maxlen - trie->minlen);
3748 if (trie->maxlen != trie->minlen)
3749 data->longest = &(data->longest_float);
3751 if (trie->jump) /* no more substrings -- for now /grr*/
3752 flags &= ~SCF_DO_SUBSTR;
3754 #endif /* old or new */
3755 #endif /* TRIE_STUDY_OPT */
3756 /* Else: zero-length, ignore. */
3757 scan = regnext(scan);
3762 *deltap = is_inf_internal ? I32_MAX : delta;
3763 if (flags & SCF_DO_SUBSTR && is_inf)
3764 data->pos_delta = I32_MAX - data->pos_min;
3765 if (is_par > (I32)U8_MAX)
3767 if (is_par && pars==1 && data) {
3768 data->flags |= SF_IN_PAR;
3769 data->flags &= ~SF_HAS_PAR;
3771 else if (pars && data) {
3772 data->flags |= SF_HAS_PAR;
3773 data->flags &= ~SF_IN_PAR;
3775 if (flags & SCF_DO_STCLASS_OR)
3776 cl_and(data->start_class, and_withp);
3777 if (flags & SCF_TRIE_RESTUDY)
3778 data->flags |= SCF_TRIE_RESTUDY;
3780 DEBUG_STUDYDATA(data,depth);
3782 return min < stopmin ? min : stopmin;
3786 S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
3788 if (RExC_rx->data) {
3789 const U32 count = RExC_rx->data->count;
3790 Renewc(RExC_rx->data,
3791 sizeof(*RExC_rx->data) + sizeof(void*) * (count + n - 1),
3792 char, struct reg_data);
3793 Renew(RExC_rx->data->what, count + n, U8);
3794 RExC_rx->data->count += n;
3797 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
3798 char, struct reg_data);
3799 Newx(RExC_rx->data->what, n, U8);
3800 RExC_rx->data->count = n;
3802 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
3803 return RExC_rx->data->count - n;
3806 #ifndef PERL_IN_XSUB_RE
3808 Perl_reginitcolors(pTHX)
3811 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
3813 char *t = savepv(s);
3817 t = strchr(t, '\t');
3823 PL_colors[i] = t = (char *)"";
3828 PL_colors[i++] = (char *)"";
3835 #ifdef TRIE_STUDY_OPT
3836 #define CHECK_RESTUDY_GOTO \
3838 (data.flags & SCF_TRIE_RESTUDY) \
3842 #define CHECK_RESTUDY_GOTO
3846 - pregcomp - compile a regular expression into internal code
3848 * We can't allocate space until we know how big the compiled form will be,
3849 * but we can't compile it (and thus know how big it is) until we've got a
3850 * place to put the code. So we cheat: we compile it twice, once with code
3851 * generation turned off and size counting turned on, and once "for real".
3852 * This also means that we don't allocate space until we are sure that the
3853 * thing really will compile successfully, and we never have to move the
3854 * code and thus invalidate pointers into it. (Note that it has to be in
3855 * one piece because free() must be able to free it all.) [NB: not true in perl]
3857 * Beware that the optimization-preparation code in here knows about some
3858 * of the structure of the compiled regexp. [I'll say.]
3863 #ifndef PERL_IN_XSUB_RE
3864 #define RE_ENGINE_PTR &PL_core_reg_engine
3866 extern const struct regexp_engine my_reg_engine;
3867 #define RE_ENGINE_PTR &my_reg_engine
3869 /* these make a few things look better, to avoid indentation */
3870 #define BEGIN_BLOCK {
3874 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
3877 GET_RE_DEBUG_FLAGS_DECL;
3878 DEBUG_r(if (!PL_colorset) reginitcolors());
3879 #ifndef PERL_IN_XSUB_RE
3881 /* Dispatch a request to compile a regexp to correct
3883 HV * const table = GvHV(PL_hintgv);
3885 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
3886 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
3887 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
3889 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
3892 return CALLREGCOMP_ENG(eng, exp, xend, pm);
3906 RExC_state_t RExC_state;
3907 RExC_state_t * const pRExC_state = &RExC_state;
3908 #ifdef TRIE_STUDY_OPT
3910 RExC_state_t copyRExC_state;
3913 FAIL("NULL regexp argument");
3915 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
3919 SV *dsv= sv_newmortal();
3920 RE_PV_QUOTED_DECL(s, RExC_utf8,
3921 dsv, RExC_precomp, (xend - exp), 60);
3922 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
3923 PL_colors[4],PL_colors[5],s);
3925 RExC_flags = pm->op_pmflags;
3929 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
3930 RExC_seen_evals = 0;
3933 /* First pass: determine size, legality. */
3941 RExC_emit = &PL_regdummy;
3942 RExC_whilem_seen = 0;
3943 RExC_charnames = NULL;
3944 RExC_open_parens = NULL;
3945 RExC_close_parens = NULL;
3947 RExC_paren_names = NULL;
3948 RExC_recurse = NULL;
3949 RExC_recurse_count = 0;
3951 #if 0 /* REGC() is (currently) a NOP at the first pass.
3952 * Clever compilers notice this and complain. --jhi */
3953 REGC((U8)REG_MAGIC, (char*)RExC_emit);
3955 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
3956 if (reg(pRExC_state, 0, &flags,1) == NULL) {
3957 RExC_precomp = NULL;
3961 PerlIO_printf(Perl_debug_log,
3962 "Required size %"IVdf" nodes\n"
3963 "Starting second pass (creation)\n",
3966 RExC_lastparse=NULL;
3968 /* Small enough for pointer-storage convention?
3969 If extralen==0, this means that we will not need long jumps. */
3970 if (RExC_size >= 0x10000L && RExC_extralen)
3971 RExC_size += RExC_extralen;
3974 if (RExC_whilem_seen > 15)
3975 RExC_whilem_seen = 15;
3978 /* Make room for a sentinel value at the end of the program */
3982 /* Allocate space and zero-initialize. Note, the two step process
3983 of zeroing when in debug mode, thus anything assigned has to
3984 happen after that */
3985 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
3988 FAIL("Regexp out of space");
3990 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
3991 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
3993 /* initialization begins here */
3994 r->engine= RE_ENGINE_PTR;
3996 r->prelen = xend - exp;
3997 r->precomp = savepvn(RExC_precomp, r->prelen);
3999 #ifdef PERL_OLD_COPY_ON_WRITE
4000 r->saved_copy = NULL;
4002 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
4003 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4004 r->lastparen = 0; /* mg.c reads this. */
4006 r->substrs = 0; /* Useful during FAIL. */
4007 r->startp = 0; /* Useful during FAIL. */
4011 if (RExC_seen & REG_SEEN_RECURSE) {
4012 Newxz(RExC_open_parens, RExC_npar,regnode *);
4013 SAVEFREEPV(RExC_open_parens);
4014 Newxz(RExC_close_parens,RExC_npar,regnode *);
4015 SAVEFREEPV(RExC_close_parens);
4018 /* Useful during FAIL. */
4019 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4021 r->offsets[0] = RExC_size;
4023 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
4024 "%s %"UVuf" bytes for offset annotations.\n",
4025 r->offsets ? "Got" : "Couldn't get",
4026 (UV)((2*RExC_size+1) * sizeof(U32))));
4030 /* Second pass: emit code. */
4031 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
4036 RExC_emit_start = r->program;
4037 RExC_emit = r->program;
4039 /* put a sentinal on the end of the program so we can check for
4041 r->program[RExC_size].type = 255;
4043 /* Store the count of eval-groups for security checks: */
4044 RExC_emit->next_off = (RExC_seen_evals > (I32)U16_MAX) ? U16_MAX : (U16)RExC_seen_evals;
4045 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
4047 if (reg(pRExC_state, 0, &flags,1) == NULL)
4050 /* XXXX To minimize changes to RE engine we always allocate
4051 3-units-long substrs field. */
4052 Newx(r->substrs, 1, struct reg_substr_data);
4053 if (RExC_recurse_count) {
4054 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4055 SAVEFREEPV(RExC_recurse);
4059 r->minlen = minlen = sawplus = sawopen = 0;
4060 Zero(r->substrs, 1, struct reg_substr_data);
4062 #ifdef TRIE_STUDY_OPT
4064 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
4065 RExC_state=copyRExC_state;
4066 if (data.last_found) {
4067 SvREFCNT_dec(data.longest_fixed);
4068 SvREFCNT_dec(data.longest_float);
4069 SvREFCNT_dec(data.last_found);
4071 StructCopy(&zero_scan_data, &data, scan_data_t);
4073 StructCopy(&zero_scan_data, &data, scan_data_t);
4074 copyRExC_state=RExC_state;
4077 StructCopy(&zero_scan_data, &data, scan_data_t);
4080 /* Dig out information for optimizations. */
4081 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
4082 pm->op_pmflags = RExC_flags;
4084 r->reganch |= ROPT_UTF8; /* Unicode in it? */
4085 r->regstclass = NULL;
4086 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
4087 r->reganch |= ROPT_NAUGHTY;
4088 scan = r->program + 1; /* First BRANCH. */
4090 /* testing for BRANCH here tells us whether there is "must appear"
4091 data in the pattern. If there is then we can use it for optimisations */
4092 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
4094 STRLEN longest_float_length, longest_fixed_length;
4095 struct regnode_charclass_class ch_class; /* pointed to by data */
4097 I32 last_close = 0; /* pointed to by data */
4100 /* Skip introductions and multiplicators >= 1. */
4101 while ((OP(first) == OPEN && (sawopen = 1)) ||
4102 /* An OR of *one* alternative - should not happen now. */
4103 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
4104 /* for now we can't handle lookbehind IFMATCH*/
4105 (OP(first) == IFMATCH && !first->flags) ||
4106 (OP(first) == PLUS) ||
4107 (OP(first) == MINMOD) ||
4108 /* An {n,m} with n>0 */
4109 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) )
4112 if (OP(first) == PLUS)
4115 first += regarglen[OP(first)];
4116 if (OP(first) == IFMATCH) {
4117 first = NEXTOPER(first);
4118 first += EXTRA_STEP_2ARGS;
4119 } else /* XXX possible optimisation for /(?=)/ */
4120 first = NEXTOPER(first);
4123 /* Starting-point info. */
4125 DEBUG_PEEP("first:",first,0);
4126 /* Ignore EXACT as we deal with it later. */
4127 if (PL_regkind[OP(first)] == EXACT) {
4128 if (OP(first) == EXACT)
4129 NOOP; /* Empty, get anchored substr later. */
4130 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
4131 r->regstclass = first;
4134 else if (PL_regkind[OP(first)] == TRIE &&
4135 ((reg_trie_data *)r->data->data[ ARG(first) ])->minlen>0)
4138 /* this can happen only on restudy */
4139 if ( OP(first) == TRIE ) {
4140 struct regnode_1 *trieop;
4141 Newxz(trieop,1,struct regnode_1);
4142 StructCopy(first,trieop,struct regnode_1);
4143 trie_op=(regnode *)trieop;
4145 struct regnode_charclass *trieop;
4146 Newxz(trieop,1,struct regnode_charclass);
4147 StructCopy(first,trieop,struct regnode_charclass);
4148 trie_op=(regnode *)trieop;
4151 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4152 r->regstclass = trie_op;
4155 else if (strchr((const char*)PL_simple,OP(first)))
4156 r->regstclass = first;
4157 else if (PL_regkind[OP(first)] == BOUND ||
4158 PL_regkind[OP(first)] == NBOUND)
4159 r->regstclass = first;
4160 else if (PL_regkind[OP(first)] == BOL) {
4161 r->reganch |= (OP(first) == MBOL
4163 : (OP(first) == SBOL
4166 first = NEXTOPER(first);
4169 else if (OP(first) == GPOS) {
4170 r->reganch |= ROPT_ANCH_GPOS;
4171 first = NEXTOPER(first);
4174 else if (!sawopen && (OP(first) == STAR &&
4175 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
4176 !(r->reganch & ROPT_ANCH) )
4178 /* turn .* into ^.* with an implied $*=1 */
4180 (OP(NEXTOPER(first)) == REG_ANY)
4183 r->reganch |= type | ROPT_IMPLICIT;
4184 first = NEXTOPER(first);
4187 if (sawplus && (!sawopen || !RExC_sawback)
4188 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
4189 /* x+ must match at the 1st pos of run of x's */
4190 r->reganch |= ROPT_SKIP;
4192 /* Scan is after the zeroth branch, first is atomic matcher. */
4193 #ifdef TRIE_STUDY_OPT
4196 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4197 (IV)(first - scan + 1))
4201 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4202 (IV)(first - scan + 1))
4208 * If there's something expensive in the r.e., find the
4209 * longest literal string that must appear and make it the
4210 * regmust. Resolve ties in favor of later strings, since
4211 * the regstart check works with the beginning of the r.e.
4212 * and avoiding duplication strengthens checking. Not a
4213 * strong reason, but sufficient in the absence of others.
4214 * [Now we resolve ties in favor of the earlier string if
4215 * it happens that c_offset_min has been invalidated, since the
4216 * earlier string may buy us something the later one won't.]
4220 data.longest_fixed = newSVpvs("");
4221 data.longest_float = newSVpvs("");
4222 data.last_found = newSVpvs("");
4223 data.longest = &(data.longest_fixed);
4225 if (!r->regstclass) {
4226 cl_init(pRExC_state, &ch_class);
4227 data.start_class = &ch_class;
4228 stclass_flag = SCF_DO_STCLASS_AND;
4229 } else /* XXXX Check for BOUND? */
4231 data.last_closep = &last_close;
4233 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
4234 &data, -1, NULL, NULL,
4235 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
4241 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
4242 && data.last_start_min == 0 && data.last_end > 0
4243 && !RExC_seen_zerolen
4244 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
4245 r->reganch |= ROPT_CHECK_ALL;
4246 scan_commit(pRExC_state, &data,&minlen);
4247 SvREFCNT_dec(data.last_found);
4249 /* Note that code very similar to this but for anchored string
4250 follows immediately below, changes may need to be made to both.
4253 longest_float_length = CHR_SVLEN(data.longest_float);
4254 if (longest_float_length
4255 || (data.flags & SF_FL_BEFORE_EOL
4256 && (!(data.flags & SF_FL_BEFORE_MEOL)
4257 || (RExC_flags & PMf_MULTILINE))))
4261 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
4262 && data.offset_fixed == data.offset_float_min
4263 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4264 goto remove_float; /* As in (a)+. */
4266 /* copy the information about the longest float from the reg_scan_data
4267 over to the program. */
4268 if (SvUTF8(data.longest_float)) {
4269 r->float_utf8 = data.longest_float;
4270 r->float_substr = NULL;
4272 r->float_substr = data.longest_float;
4273 r->float_utf8 = NULL;
4275 /* float_end_shift is how many chars that must be matched that
4276 follow this item. We calculate it ahead of time as once the
4277 lookbehind offset is added in we lose the ability to correctly
4279 ml = data.minlen_float ? *(data.minlen_float)
4280 : (I32)longest_float_length;
4281 r->float_end_shift = ml - data.offset_float_min
4282 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4283 + data.lookbehind_float;
4284 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
4285 r->float_max_offset = data.offset_float_max;
4286 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
4287 r->float_max_offset -= data.lookbehind_float;
4289 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4290 && (!(data.flags & SF_FL_BEFORE_MEOL)
4291 || (RExC_flags & PMf_MULTILINE)));
4292 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
4296 r->float_substr = r->float_utf8 = NULL;
4297 SvREFCNT_dec(data.longest_float);
4298 longest_float_length = 0;
4301 /* Note that code very similar to this but for floating string
4302 is immediately above, changes may need to be made to both.
4305 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
4306 if (longest_fixed_length
4307 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4308 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4309 || (RExC_flags & PMf_MULTILINE))))
4313 /* copy the information about the longest fixed
4314 from the reg_scan_data over to the program. */
4315 if (SvUTF8(data.longest_fixed)) {
4316 r->anchored_utf8 = data.longest_fixed;
4317 r->anchored_substr = NULL;
4319 r->anchored_substr = data.longest_fixed;
4320 r->anchored_utf8 = NULL;
4322 /* fixed_end_shift is how many chars that must be matched that
4323 follow this item. We calculate it ahead of time as once the
4324 lookbehind offset is added in we lose the ability to correctly
4326 ml = data.minlen_fixed ? *(data.minlen_fixed)
4327 : (I32)longest_fixed_length;
4328 r->anchored_end_shift = ml - data.offset_fixed
4329 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4330 + data.lookbehind_fixed;
4331 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4333 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4334 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4335 || (RExC_flags & PMf_MULTILINE)));
4336 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
4339 r->anchored_substr = r->anchored_utf8 = NULL;
4340 SvREFCNT_dec(data.longest_fixed);
4341 longest_fixed_length = 0;
4344 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
4345 r->regstclass = NULL;
4346 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4348 && !(data.start_class->flags & ANYOF_EOS)
4349 && !cl_is_anything(data.start_class))
4351 const I32 n = add_data(pRExC_state, 1, "f");
4353 Newx(RExC_rx->data->data[n], 1,
4354 struct regnode_charclass_class);
4355 StructCopy(data.start_class,
4356 (struct regnode_charclass_class*)RExC_rx->data->data[n],
4357 struct regnode_charclass_class);
4358 r->regstclass = (regnode*)RExC_rx->data->data[n];
4359 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
4360 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
4361 regprop(r, sv, (regnode*)data.start_class);
4362 PerlIO_printf(Perl_debug_log,
4363 "synthetic stclass \"%s\".\n",
4364 SvPVX_const(sv));});
4367 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
4368 if (longest_fixed_length > longest_float_length) {
4369 r->check_end_shift = r->anchored_end_shift;
4370 r->check_substr = r->anchored_substr;
4371 r->check_utf8 = r->anchored_utf8;
4372 r->check_offset_min = r->check_offset_max = r->anchored_offset;
4373 if (r->reganch & ROPT_ANCH_SINGLE)
4374 r->reganch |= ROPT_NOSCAN;
4377 r->check_end_shift = r->float_end_shift;
4378 r->check_substr = r->float_substr;
4379 r->check_utf8 = r->float_utf8;
4380 r->check_offset_min = r->float_min_offset;
4381 r->check_offset_max = r->float_max_offset;
4383 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4384 This should be changed ASAP! */
4385 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
4386 r->reganch |= RE_USE_INTUIT;
4387 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
4388 r->reganch |= RE_INTUIT_TAIL;
4390 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4391 if ( (STRLEN)minlen < longest_float_length )
4392 minlen= longest_float_length;
4393 if ( (STRLEN)minlen < longest_fixed_length )
4394 minlen= longest_fixed_length;
4398 /* Several toplevels. Best we can is to set minlen. */
4400 struct regnode_charclass_class ch_class;
4403 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
4405 scan = r->program + 1;
4406 cl_init(pRExC_state, &ch_class);
4407 data.start_class = &ch_class;
4408 data.last_closep = &last_close;
4410 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
4411 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
4415 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
4416 = r->float_substr = r->float_utf8 = NULL;
4417 if (!(data.start_class->flags & ANYOF_EOS)
4418 && !cl_is_anything(data.start_class))
4420 const I32 n = add_data(pRExC_state, 1, "f");
4422 Newx(RExC_rx->data->data[n], 1,
4423 struct regnode_charclass_class);
4424 StructCopy(data.start_class,
4425 (struct regnode_charclass_class*)RExC_rx->data->data[n],
4426 struct regnode_charclass_class);
4427 r->regstclass = (regnode*)RExC_rx->data->data[n];
4428 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
4429 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
4430 regprop(r, sv, (regnode*)data.start_class);
4431 PerlIO_printf(Perl_debug_log,
4432 "synthetic stclass \"%s\".\n",
4433 SvPVX_const(sv));});
4437 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4438 the "real" pattern. */
4439 if (r->minlen < minlen)
4442 if (RExC_seen & REG_SEEN_GPOS)
4443 r->reganch |= ROPT_GPOS_SEEN;
4444 if (RExC_seen & REG_SEEN_LOOKBEHIND)
4445 r->reganch |= ROPT_LOOKBEHIND_SEEN;
4446 if (RExC_seen & REG_SEEN_EVAL)
4447 r->reganch |= ROPT_EVAL_SEEN;
4448 if (RExC_seen & REG_SEEN_CANY)
4449 r->reganch |= ROPT_CANY_SEEN;
4450 if (RExC_seen & REG_SEEN_VERBARG)
4451 r->reganch |= ROPT_VERBARG_SEEN;
4452 if (RExC_paren_names)
4453 r->paren_names = (HV*)SvREFCNT_inc(RExC_paren_names);
4455 r->paren_names = NULL;
4457 if (RExC_recurse_count) {
4458 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
4459 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
4460 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
4463 Newxz(r->startp, RExC_npar, I32);
4464 Newxz(r->endp, RExC_npar, I32);
4466 DEBUG_r( RX_DEBUG_on(r) );
4468 PerlIO_printf(Perl_debug_log,"Final program:\n");
4471 DEBUG_OFFSETS_r(if (r->offsets) {
4472 const U32 len = r->offsets[0];
4474 GET_RE_DEBUG_FLAGS_DECL;
4475 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
4476 for (i = 1; i <= len; i++) {
4477 if (r->offsets[i*2-1] || r->offsets[i*2])
4478 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
4479 (UV)i, (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
4481 PerlIO_printf(Perl_debug_log, "\n");
4487 #undef CORE_ONLY_BLOCK
4489 #undef RE_ENGINE_PTR
4491 #ifndef PERL_IN_XSUB_RE
4493 Perl_reg_named_buff_sv(pTHX_ SV* namesv)
4495 I32 parno = 0; /* no match */
4497 const REGEXP * const rx = PM_GETRE(PL_curpm);
4498 if (rx && rx->paren_names) {
4499 HE *he_str = hv_fetch_ent( rx->paren_names, namesv, 0, 0 );
4502 SV* sv_dat=HeVAL(he_str);
4503 I32 *nums=(I32*)SvPVX(sv_dat);
4504 for ( i=0; i<SvIVX(sv_dat); i++ ) {
4505 if ((I32)(rx->lastparen) >= nums[i] &&
4506 rx->endp[nums[i]] != -1)
4519 SV *sv= sv_newmortal();
4520 Perl_sv_setpvf(aTHX_ sv, "%"IVdf,(IV)parno);
4521 gv_paren= Perl_gv_fetchsv(aTHX_ sv, GV_ADD, SVt_PVGV);
4522 return GvSVn(gv_paren);
4527 /* Scans the name of a named buffer from the pattern.
4528 * If flags is REG_RSN_RETURN_NULL returns null.
4529 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
4530 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
4531 * to the parsed name as looked up in the RExC_paren_names hash.
4532 * If there is an error throws a vFAIL().. type exception.
4535 #define REG_RSN_RETURN_NULL 0
4536 #define REG_RSN_RETURN_NAME 1
4537 #define REG_RSN_RETURN_DATA 2
4540 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) {
4541 char *name_start = RExC_parse;
4544 while( isIDFIRST_uni(utf8n_to_uvchr((U8*)RExC_parse,
4545 RExC_end - RExC_parse, &numlen, UTF8_ALLOW_DEFAULT)))
4547 RExC_parse += numlen;
4550 while( isIDFIRST(*RExC_parse) )
4554 SV* sv_name = sv_2mortal(Perl_newSVpvn(aTHX_ name_start,
4555 (int)(RExC_parse - name_start)));
4558 if ( flags == REG_RSN_RETURN_NAME)
4560 else if (flags==REG_RSN_RETURN_DATA) {
4563 if ( ! sv_name ) /* should not happen*/
4564 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
4565 if (RExC_paren_names)
4566 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
4568 sv_dat = HeVAL(he_str);
4570 vFAIL("Reference to nonexistent named group");
4574 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
4581 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
4582 int rem=(int)(RExC_end - RExC_parse); \
4591 if (RExC_lastparse!=RExC_parse) \
4592 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
4595 iscut ? "..." : "<" \
4598 PerlIO_printf(Perl_debug_log,"%16s",""); \
4603 num=REG_NODE_NUM(RExC_emit); \
4604 if (RExC_lastnum!=num) \
4605 PerlIO_printf(Perl_debug_log,"|%4d",num); \
4607 PerlIO_printf(Perl_debug_log,"|%4s",""); \
4608 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
4609 (int)((depth*2)), "", \
4613 RExC_lastparse=RExC_parse; \
4618 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
4619 DEBUG_PARSE_MSG((funcname)); \
4620 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
4622 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
4623 DEBUG_PARSE_MSG((funcname)); \
4624 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
4627 - reg - regular expression, i.e. main body or parenthesized thing
4629 * Caller must absorb opening parenthesis.
4631 * Combining parenthesis handling with the base level of regular expression
4632 * is a trifle forced, but the need to tie the tails of the branches to what
4633 * follows makes it hard to avoid.
4635 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
4637 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
4639 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
4642 /* this idea is borrowed from STR_WITH_LEN in handy.h */
4643 #define CHECK_WORD(s,v,l) \
4644 (((sizeof(s)-1)==(l)) && (strnEQ(start_verb, (s ""), (sizeof(s)-1))))
4647 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
4648 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
4651 register regnode *ret; /* Will be the head of the group. */
4652 register regnode *br;
4653 register regnode *lastbr;
4654 register regnode *ender = NULL;
4655 register I32 parno = 0;
4657 const I32 oregflags = RExC_flags;
4658 bool have_branch = 0;
4661 /* for (?g), (?gc), and (?o) warnings; warning
4662 about (?c) will warn about (?g) -- japhy */
4664 #define WASTED_O 0x01
4665 #define WASTED_G 0x02
4666 #define WASTED_C 0x04
4667 #define WASTED_GC (0x02|0x04)
4668 I32 wastedflags = 0x00;
4670 char * parse_start = RExC_parse; /* MJD */
4671 char * const oregcomp_parse = RExC_parse;
4673 GET_RE_DEBUG_FLAGS_DECL;
4674 DEBUG_PARSE("reg ");
4677 *flagp = 0; /* Tentatively. */
4680 /* Make an OPEN node, if parenthesized. */
4682 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
4683 char *start_verb = RExC_parse;
4684 STRLEN verb_len = 0;
4685 char *start_arg = NULL;
4686 unsigned char op = 0;
4688 int internal_argval = 0; /* internal_argval is only useful if !argok */
4689 while ( *RExC_parse && *RExC_parse != ')' ) {
4690 if ( *RExC_parse == ':' ) {
4691 start_arg = RExC_parse + 1;
4697 verb_len = RExC_parse - start_verb;
4700 while ( *RExC_parse && *RExC_parse != ')' )
4702 if ( *RExC_parse != ')' )
4703 vFAIL("Unterminated verb pattern argument");
4704 if ( RExC_parse == start_arg )
4707 if ( *RExC_parse != ')' )
4708 vFAIL("Unterminated verb pattern");
4710 switch ( *start_verb ) {
4711 case 'A': /* (*ACCEPT) */
4712 if ( CHECK_WORD("ACCEPT",start_verb,verb_len) ) {
4714 internal_argval = RExC_nestroot;
4717 case 'C': /* (*COMMIT) */
4718 if ( CHECK_WORD("COMMIT",start_verb,verb_len) )
4720 else if ( CHECK_WORD("CUT",start_verb,verb_len) )
4723 case 'F': /* (*FAIL) */
4724 if ( verb_len==1 || CHECK_WORD("FAIL",start_verb,verb_len) ) {
4730 if ( CHECK_WORD("MARK",start_verb,verb_len) )
4733 case 'N': /* (*NOMATCH) */
4734 if ( CHECK_WORD("NOMATCH",start_verb,verb_len) )
4740 vFAIL3("Unknown verb pattern '%.*s'",
4741 verb_len, start_verb);
4744 if ( start_arg && internal_argval ) {
4745 vFAIL3("Verb pattern '%.*s' may not have an argument",
4746 verb_len, start_verb);
4747 } else if ( argok < 0 && !start_arg ) {
4748 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
4749 verb_len, start_verb);
4751 ret = reganode(pRExC_state, op, internal_argval);
4752 if ( ! internal_argval && ! SIZE_ONLY ) {
4754 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
4755 ARG(ret) = add_data( pRExC_state, 1, "S" );
4756 RExC_rx->data->data[ARG(ret)]=(void*)sv;
4763 if (!internal_argval)
4764 RExC_seen |= REG_SEEN_VERBARG;
4765 } else if ( start_arg ) {
4766 vFAIL3("Verb pattern '%.*s' may not have an argument",
4767 verb_len, start_verb);
4769 ret = reg_node(pRExC_state, op);
4771 nextchar(pRExC_state);
4774 if (*RExC_parse == '?') { /* (?...) */
4775 U32 posflags = 0, negflags = 0;
4776 U32 *flagsp = &posflags;
4777 bool is_logical = 0;
4778 const char * const seqstart = RExC_parse;
4781 paren = *RExC_parse++;
4782 ret = NULL; /* For look-ahead/behind. */
4785 case '<': /* (?<...) */
4786 if (*RExC_parse == '!')
4788 else if (*RExC_parse != '=')
4793 case '\'': /* (?'...') */
4794 name_start= RExC_parse;
4795 svname = reg_scan_name(pRExC_state,
4796 SIZE_ONLY ? /* reverse test from the others */
4797 REG_RSN_RETURN_NAME :
4798 REG_RSN_RETURN_NULL);
4799 if (RExC_parse == name_start)
4801 if (*RExC_parse != paren)
4802 vFAIL2("Sequence (?%c... not terminated",
4803 paren=='>' ? '<' : paren);
4807 if (!svname) /* shouldnt happen */
4809 "panic: reg_scan_name returned NULL");
4810 if (!RExC_paren_names) {
4811 RExC_paren_names= newHV();
4812 sv_2mortal((SV*)RExC_paren_names);
4814 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
4816 sv_dat = HeVAL(he_str);
4818 /* croak baby croak */
4820 "panic: paren_name hash element allocation failed");
4821 } else if ( SvPOK(sv_dat) ) {
4822 IV count=SvIV(sv_dat);
4823 I32 *pv=(I32*)SvGROW(sv_dat,SvCUR(sv_dat)+sizeof(I32)+1);
4824 SvCUR_set(sv_dat,SvCUR(sv_dat)+sizeof(I32));
4825 pv[count]=RExC_npar;
4828 (void)SvUPGRADE(sv_dat,SVt_PVNV);
4829 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
4834 /*sv_dump(sv_dat);*/
4836 nextchar(pRExC_state);
4838 goto capturing_parens;
4840 RExC_seen |= REG_SEEN_LOOKBEHIND;
4842 case '=': /* (?=...) */
4843 case '!': /* (?!...) */
4844 RExC_seen_zerolen++;
4845 if (*RExC_parse == ')') {
4846 ret=reg_node(pRExC_state, OPFAIL);
4847 nextchar(pRExC_state);
4850 case ':': /* (?:...) */
4851 case '>': /* (?>...) */
4853 case '$': /* (?$...) */
4854 case '@': /* (?@...) */
4855 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
4857 case '#': /* (?#...) */
4858 while (*RExC_parse && *RExC_parse != ')')
4860 if (*RExC_parse != ')')
4861 FAIL("Sequence (?#... not terminated");
4862 nextchar(pRExC_state);
4865 case '0' : /* (?0) */
4866 case 'R' : /* (?R) */
4867 if (*RExC_parse != ')')
4868 FAIL("Sequence (?R) not terminated");
4869 ret = reg_node(pRExC_state, GOSTART);
4870 nextchar(pRExC_state);
4873 { /* named and numeric backreferences */
4876 case '&': /* (?&NAME) */
4877 parse_start = RExC_parse - 1;
4879 SV *sv_dat = reg_scan_name(pRExC_state,
4880 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
4881 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
4883 goto gen_recurse_regop;
4885 case '1': case '2': case '3': case '4': /* (?1) */
4886 case '5': case '6': case '7': case '8': case '9':
4888 num = atoi(RExC_parse);
4889 parse_start = RExC_parse - 1; /* MJD */
4890 while (isDIGIT(*RExC_parse))
4892 if (*RExC_parse!=')')
4893 vFAIL("Expecting close bracket");
4896 ret = reganode(pRExC_state, GOSUB, num);
4898 if (num > (I32)RExC_rx->nparens) {
4900 vFAIL("Reference to nonexistent group");
4902 ARG2L_SET( ret, RExC_recurse_count++);
4904 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
4905 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
4909 RExC_seen |= REG_SEEN_RECURSE;
4910 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
4911 Set_Node_Offset(ret, parse_start); /* MJD */
4913 nextchar(pRExC_state);
4915 } /* named and numeric backreferences */
4918 case 'p': /* (?p...) */
4919 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
4920 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
4922 case '?': /* (??...) */
4924 if (*RExC_parse != '{')
4926 paren = *RExC_parse++;
4928 case '{': /* (?{...}) */
4930 I32 count = 1, n = 0;
4932 char *s = RExC_parse;
4934 RExC_seen_zerolen++;
4935 RExC_seen |= REG_SEEN_EVAL;
4936 while (count && (c = *RExC_parse)) {
4947 if (*RExC_parse != ')') {
4949 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
4953 OP_4tree *sop, *rop;
4954 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
4957 Perl_save_re_context(aTHX);
4958 rop = sv_compile_2op(sv, &sop, "re", &pad);
4959 sop->op_private |= OPpREFCOUNTED;
4960 /* re_dup will OpREFCNT_inc */
4961 OpREFCNT_set(sop, 1);
4964 n = add_data(pRExC_state, 3, "nop");
4965 RExC_rx->data->data[n] = (void*)rop;
4966 RExC_rx->data->data[n+1] = (void*)sop;
4967 RExC_rx->data->data[n+2] = (void*)pad;
4970 else { /* First pass */
4971 if (PL_reginterp_cnt < ++RExC_seen_evals
4973 /* No compiled RE interpolated, has runtime
4974 components ===> unsafe. */
4975 FAIL("Eval-group not allowed at runtime, use re 'eval'");
4976 if (PL_tainting && PL_tainted)
4977 FAIL("Eval-group in insecure regular expression");
4978 #if PERL_VERSION > 8
4979 if (IN_PERL_COMPILETIME)
4984 nextchar(pRExC_state);
4986 ret = reg_node(pRExC_state, LOGICAL);
4989 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
4990 /* deal with the length of this later - MJD */
4993 ret = reganode(pRExC_state, EVAL, n);
4994 Set_Node_Length(ret, RExC_parse - parse_start + 1);
4995 Set_Node_Offset(ret, parse_start);
4998 case '(': /* (?(?{...})...) and (?(?=...)...) */
5001 if (RExC_parse[0] == '?') { /* (?(?...)) */
5002 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
5003 || RExC_parse[1] == '<'
5004 || RExC_parse[1] == '{') { /* Lookahead or eval. */
5007 ret = reg_node(pRExC_state, LOGICAL);
5010 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
5014 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
5015 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
5017 char ch = RExC_parse[0] == '<' ? '>' : '\'';
5018 char *name_start= RExC_parse++;
5020 SV *sv_dat=reg_scan_name(pRExC_state,
5021 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5022 if (RExC_parse == name_start || *RExC_parse != ch)
5023 vFAIL2("Sequence (?(%c... not terminated",
5024 (ch == '>' ? '<' : ch));
5027 num = add_data( pRExC_state, 1, "S" );
5028 RExC_rx->data->data[num]=(void*)sv_dat;
5029 SvREFCNT_inc(sv_dat);
5031 ret = reganode(pRExC_state,NGROUPP,num);
5032 goto insert_if_check_paren;
5034 else if (RExC_parse[0] == 'D' &&
5035 RExC_parse[1] == 'E' &&
5036 RExC_parse[2] == 'F' &&
5037 RExC_parse[3] == 'I' &&
5038 RExC_parse[4] == 'N' &&
5039 RExC_parse[5] == 'E')
5041 ret = reganode(pRExC_state,DEFINEP,0);
5044 goto insert_if_check_paren;
5046 else if (RExC_parse[0] == 'R') {
5049 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5050 parno = atoi(RExC_parse++);
5051 while (isDIGIT(*RExC_parse))
5053 } else if (RExC_parse[0] == '&') {
5056 sv_dat = reg_scan_name(pRExC_state,
5057 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5058 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5060 ret = reganode(pRExC_state,INSUBP,parno);
5061 goto insert_if_check_paren;
5063 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5066 parno = atoi(RExC_parse++);
5068 while (isDIGIT(*RExC_parse))
5070 ret = reganode(pRExC_state, GROUPP, parno);
5072 insert_if_check_paren:
5073 if ((c = *nextchar(pRExC_state)) != ')')
5074 vFAIL("Switch condition not recognized");
5076 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
5077 br = regbranch(pRExC_state, &flags, 1,depth+1);
5079 br = reganode(pRExC_state, LONGJMP, 0);
5081 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
5082 c = *nextchar(pRExC_state);
5087 vFAIL("(?(DEFINE)....) does not allow branches");
5088 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
5089 regbranch(pRExC_state, &flags, 1,depth+1);
5090 REGTAIL(pRExC_state, ret, lastbr);
5093 c = *nextchar(pRExC_state);
5098 vFAIL("Switch (?(condition)... contains too many branches");
5099 ender = reg_node(pRExC_state, TAIL);
5100 REGTAIL(pRExC_state, br, ender);
5102 REGTAIL(pRExC_state, lastbr, ender);
5103 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
5106 REGTAIL(pRExC_state, ret, ender);
5110 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
5114 RExC_parse--; /* for vFAIL to print correctly */
5115 vFAIL("Sequence (? incomplete");
5119 parse_flags: /* (?i) */
5120 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
5121 /* (?g), (?gc) and (?o) are useless here
5122 and must be globally applied -- japhy */
5124 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
5125 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5126 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
5127 if (! (wastedflags & wflagbit) ) {
5128 wastedflags |= wflagbit;
5131 "Useless (%s%c) - %suse /%c modifier",
5132 flagsp == &negflags ? "?-" : "?",
5134 flagsp == &negflags ? "don't " : "",
5140 else if (*RExC_parse == 'c') {
5141 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5142 if (! (wastedflags & WASTED_C) ) {
5143 wastedflags |= WASTED_GC;
5146 "Useless (%sc) - %suse /gc modifier",
5147 flagsp == &negflags ? "?-" : "?",
5148 flagsp == &negflags ? "don't " : ""
5153 else { pmflag(flagsp, *RExC_parse); }
5157 if (*RExC_parse == '-') {
5159 wastedflags = 0; /* reset so (?g-c) warns twice */
5163 RExC_flags |= posflags;
5164 RExC_flags &= ~negflags;
5165 if (*RExC_parse == ':') {
5171 if (*RExC_parse != ')') {
5173 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5175 nextchar(pRExC_state);
5185 ret = reganode(pRExC_state, OPEN, parno);
5188 RExC_nestroot = parno;
5189 if (RExC_seen & REG_SEEN_RECURSE) {
5190 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5191 "Setting open paren #%"IVdf" to %d\n",
5192 (IV)parno, REG_NODE_NUM(ret)));
5193 RExC_open_parens[parno-1]= ret;
5196 Set_Node_Length(ret, 1); /* MJD */
5197 Set_Node_Offset(ret, RExC_parse); /* MJD */
5204 /* Pick up the branches, linking them together. */
5205 parse_start = RExC_parse; /* MJD */
5206 br = regbranch(pRExC_state, &flags, 1,depth+1);
5207 /* branch_len = (paren != 0); */
5211 if (*RExC_parse == '|') {
5212 if (!SIZE_ONLY && RExC_extralen) {
5213 reginsert(pRExC_state, BRANCHJ, br, depth+1);
5216 reginsert(pRExC_state, BRANCH, br, depth+1);
5217 Set_Node_Length(br, paren != 0);
5218 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
5222 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
5224 else if (paren == ':') {
5225 *flagp |= flags&SIMPLE;
5227 if (is_open) { /* Starts with OPEN. */
5228 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
5230 else if (paren != '?') /* Not Conditional */
5232 *flagp |= flags & (SPSTART | HASWIDTH);
5234 while (*RExC_parse == '|') {
5235 if (!SIZE_ONLY && RExC_extralen) {
5236 ender = reganode(pRExC_state, LONGJMP,0);
5237 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
5240 RExC_extralen += 2; /* Account for LONGJMP. */
5241 nextchar(pRExC_state);
5242 br = regbranch(pRExC_state, &flags, 0, depth+1);
5246 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
5250 *flagp |= flags&SPSTART;
5253 if (have_branch || paren != ':') {
5254 /* Make a closing node, and hook it on the end. */
5257 ender = reg_node(pRExC_state, TAIL);
5260 ender = reganode(pRExC_state, CLOSE, parno);
5261 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
5262 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5263 "Setting close paren #%"IVdf" to %d\n",
5264 (IV)parno, REG_NODE_NUM(ender)));
5265 RExC_close_parens[parno-1]= ender;
5266 if (RExC_nestroot == parno)
5269 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
5270 Set_Node_Length(ender,1); /* MJD */
5276 *flagp &= ~HASWIDTH;
5279 ender = reg_node(pRExC_state, SUCCEED);
5282 ender = reg_node(pRExC_state, END);
5284 assert(!RExC_opend); /* there can only be one! */
5289 REGTAIL(pRExC_state, lastbr, ender);
5291 if (have_branch && !SIZE_ONLY) {
5293 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
5295 /* Hook the tails of the branches to the closing node. */
5296 for (br = ret; br; br = regnext(br)) {
5297 const U8 op = PL_regkind[OP(br)];
5299 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
5301 else if (op == BRANCHJ) {
5302 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
5310 static const char parens[] = "=!<,>";
5312 if (paren && (p = strchr(parens, paren))) {
5313 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
5314 int flag = (p - parens) > 1;
5317 node = SUSPEND, flag = 0;
5318 reginsert(pRExC_state, node,ret, depth+1);
5319 Set_Node_Cur_Length(ret);
5320 Set_Node_Offset(ret, parse_start + 1);
5322 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
5326 /* Check for proper termination. */
5328 RExC_flags = oregflags;
5329 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
5330 RExC_parse = oregcomp_parse;
5331 vFAIL("Unmatched (");
5334 else if (!paren && RExC_parse < RExC_end) {
5335 if (*RExC_parse == ')') {
5337 vFAIL("Unmatched )");
5340 FAIL("Junk on end of regexp"); /* "Can't happen". */
5348 - regbranch - one alternative of an | operator
5350 * Implements the concatenation operator.
5353 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
5356 register regnode *ret;
5357 register regnode *chain = NULL;
5358 register regnode *latest;
5359 I32 flags = 0, c = 0;
5360 GET_RE_DEBUG_FLAGS_DECL;
5361 DEBUG_PARSE("brnc");
5365 if (!SIZE_ONLY && RExC_extralen)
5366 ret = reganode(pRExC_state, BRANCHJ,0);
5368 ret = reg_node(pRExC_state, BRANCH);
5369 Set_Node_Length(ret, 1);
5373 if (!first && SIZE_ONLY)
5374 RExC_extralen += 1; /* BRANCHJ */
5376 *flagp = WORST; /* Tentatively. */
5379 nextchar(pRExC_state);
5380 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
5382 latest = regpiece(pRExC_state, &flags,depth+1);
5383 if (latest == NULL) {
5384 if (flags & TRYAGAIN)
5388 else if (ret == NULL)
5390 *flagp |= flags&HASWIDTH;
5391 if (chain == NULL) /* First piece. */
5392 *flagp |= flags&SPSTART;
5395 REGTAIL(pRExC_state, chain, latest);
5400 if (chain == NULL) { /* Loop ran zero times. */
5401 chain = reg_node(pRExC_state, NOTHING);
5406 *flagp |= flags&SIMPLE;
5413 - regpiece - something followed by possible [*+?]
5415 * Note that the branching code sequences used for ? and the general cases
5416 * of * and + are somewhat optimized: they use the same NOTHING node as
5417 * both the endmarker for their branch list and the body of the last branch.
5418 * It might seem that this node could be dispensed with entirely, but the
5419 * endmarker role is not redundant.
5422 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
5425 register regnode *ret;
5427 register char *next;
5429 const char * const origparse = RExC_parse;
5431 I32 max = REG_INFTY;
5433 const char *maxpos = NULL;
5434 GET_RE_DEBUG_FLAGS_DECL;
5435 DEBUG_PARSE("piec");
5437 ret = regatom(pRExC_state, &flags,depth+1);
5439 if (flags & TRYAGAIN)
5446 if (op == '{' && regcurly(RExC_parse)) {
5448 parse_start = RExC_parse; /* MJD */
5449 next = RExC_parse + 1;
5450 while (isDIGIT(*next) || *next == ',') {
5459 if (*next == '}') { /* got one */
5463 min = atoi(RExC_parse);
5467 maxpos = RExC_parse;
5469 if (!max && *maxpos != '0')
5470 max = REG_INFTY; /* meaning "infinity" */
5471 else if (max >= REG_INFTY)
5472 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
5474 nextchar(pRExC_state);
5477 if ((flags&SIMPLE)) {
5478 RExC_naughty += 2 + RExC_naughty / 2;
5479 reginsert(pRExC_state, CURLY, ret, depth+1);
5480 Set_Node_Offset(ret, parse_start+1); /* MJD */
5481 Set_Node_Cur_Length(ret);
5484 regnode * const w = reg_node(pRExC_state, WHILEM);
5487 REGTAIL(pRExC_state, ret, w);
5488 if (!SIZE_ONLY && RExC_extralen) {
5489 reginsert(pRExC_state, LONGJMP,ret, depth+1);
5490 reginsert(pRExC_state, NOTHING,ret, depth+1);
5491 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
5493 reginsert(pRExC_state, CURLYX,ret, depth+1);
5495 Set_Node_Offset(ret, parse_start+1);
5496 Set_Node_Length(ret,
5497 op == '{' ? (RExC_parse - parse_start) : 1);
5499 if (!SIZE_ONLY && RExC_extralen)
5500 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
5501 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
5503 RExC_whilem_seen++, RExC_extralen += 3;
5504 RExC_naughty += 4 + RExC_naughty; /* compound interest */
5512 if (max && max < min)
5513 vFAIL("Can't do {n,m} with n > m");
5515 ARG1_SET(ret, (U16)min);
5516 ARG2_SET(ret, (U16)max);
5528 #if 0 /* Now runtime fix should be reliable. */
5530 /* if this is reinstated, don't forget to put this back into perldiag:
5532 =item Regexp *+ operand could be empty at {#} in regex m/%s/
5534 (F) The part of the regexp subject to either the * or + quantifier
5535 could match an empty string. The {#} shows in the regular
5536 expression about where the problem was discovered.
5540 if (!(flags&HASWIDTH) && op != '?')
5541 vFAIL("Regexp *+ operand could be empty");
5544 parse_start = RExC_parse;
5545 nextchar(pRExC_state);
5547 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
5549 if (op == '*' && (flags&SIMPLE)) {
5550 reginsert(pRExC_state, STAR, ret, depth+1);
5554 else if (op == '*') {
5558 else if (op == '+' && (flags&SIMPLE)) {
5559 reginsert(pRExC_state, PLUS, ret, depth+1);
5563 else if (op == '+') {
5567 else if (op == '?') {
5572 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
5574 "%.*s matches null string many times",
5575 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
5579 if (RExC_parse < RExC_end && *RExC_parse == '?') {
5580 nextchar(pRExC_state);
5581 reginsert(pRExC_state, MINMOD, ret, depth+1);
5582 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
5584 #ifndef REG_ALLOW_MINMOD_SUSPEND
5587 if (RExC_parse < RExC_end && *RExC_parse == '+') {
5589 nextchar(pRExC_state);
5590 ender = reg_node(pRExC_state, SUCCEED);
5591 REGTAIL(pRExC_state, ret, ender);
5592 reginsert(pRExC_state, SUSPEND, ret, depth+1);
5594 ender = reg_node(pRExC_state, TAIL);
5595 REGTAIL(pRExC_state, ret, ender);
5599 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
5601 vFAIL("Nested quantifiers");
5608 /* reg_namedseq(pRExC_state,UVp)
5610 This is expected to be called by a parser routine that has
5611 recognized'\N' and needs to handle the rest. RExC_parse is
5612 expected to point at the first char following the N at the time
5615 If valuep is non-null then it is assumed that we are parsing inside
5616 of a charclass definition and the first codepoint in the resolved
5617 string is returned via *valuep and the routine will return NULL.
5618 In this mode if a multichar string is returned from the charnames
5619 handler a warning will be issued, and only the first char in the
5620 sequence will be examined. If the string returned is zero length
5621 then the value of *valuep is undefined and NON-NULL will
5622 be returned to indicate failure. (This will NOT be a valid pointer
5625 If value is null then it is assumed that we are parsing normal text
5626 and inserts a new EXACT node into the program containing the resolved
5627 string and returns a pointer to the new node. If the string is
5628 zerolength a NOTHING node is emitted.
5630 On success RExC_parse is set to the char following the endbrace.
5631 Parsing failures will generate a fatal errorvia vFAIL(...)
5633 NOTE: We cache all results from the charnames handler locally in
5634 the RExC_charnames hash (created on first use) to prevent a charnames
5635 handler from playing silly-buggers and returning a short string and
5636 then a long string for a given pattern. Since the regexp program
5637 size is calculated during an initial parse this would result
5638 in a buffer overrun so we cache to prevent the charname result from
5639 changing during the course of the parse.
5643 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep)
5645 char * name; /* start of the content of the name */
5646 char * endbrace; /* endbrace following the name */
5649 STRLEN len; /* this has various purposes throughout the code */
5650 bool cached = 0; /* if this is true then we shouldn't refcount dev sv_str */
5651 regnode *ret = NULL;
5653 if (*RExC_parse != '{') {
5654 vFAIL("Missing braces on \\N{}");
5656 name = RExC_parse+1;
5657 endbrace = strchr(RExC_parse, '}');
5660 vFAIL("Missing right brace on \\N{}");
5662 RExC_parse = endbrace + 1;
5665 /* RExC_parse points at the beginning brace,
5666 endbrace points at the last */
5667 if ( name[0]=='U' && name[1]=='+' ) {
5668 /* its a "unicode hex" notation {U+89AB} */
5669 I32 fl = PERL_SCAN_ALLOW_UNDERSCORES
5670 | PERL_SCAN_DISALLOW_PREFIX
5671 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
5673 len = (STRLEN)(endbrace - name - 2);
5674 cp = grok_hex(name + 2, &len, &fl, NULL);
5675 if ( len != (STRLEN)(endbrace - name - 2) ) {
5684 sv_str= Perl_newSVpvf_nocontext("%c",(int)cp);
5686 /* fetch the charnames handler for this scope */
5687 HV * const table = GvHV(PL_hintgv);
5689 hv_fetchs(table, "charnames", FALSE) :
5691 SV *cv= cvp ? *cvp : NULL;
5694 /* create an SV with the name as argument */
5695 sv_name = newSVpvn(name, endbrace - name);
5697 if (!table || !(PL_hints & HINT_LOCALIZE_HH)) {
5698 vFAIL2("Constant(\\N{%s}) unknown: "
5699 "(possibly a missing \"use charnames ...\")",
5702 if (!cvp || !SvOK(*cvp)) { /* when $^H{charnames} = undef; */
5703 vFAIL2("Constant(\\N{%s}): "
5704 "$^H{charnames} is not defined",SvPVX(sv_name));
5709 if (!RExC_charnames) {
5710 /* make sure our cache is allocated */
5711 RExC_charnames = newHV();
5712 sv_2mortal((SV*)RExC_charnames);
5714 /* see if we have looked this one up before */
5715 he_str = hv_fetch_ent( RExC_charnames, sv_name, 0, 0 );
5717 sv_str = HeVAL(he_str);
5730 count= call_sv(cv, G_SCALAR);
5732 if (count == 1) { /* XXXX is this right? dmq */
5734 SvREFCNT_inc_simple_void(sv_str);
5742 if ( !sv_str || !SvOK(sv_str) ) {
5743 vFAIL2("Constant(\\N{%s}): Call to &{$^H{charnames}} "
5744 "did not return a defined value",SvPVX(sv_name));
5746 if (hv_store_ent( RExC_charnames, sv_name, sv_str, 0))
5751 char *p = SvPV(sv_str, len);
5754 if ( SvUTF8(sv_str) ) {
5755 *valuep = utf8_to_uvchr((U8*)p, &numlen);
5759 We have to turn on utf8 for high bit chars otherwise
5760 we get failures with
5762 "ss" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
5763 "SS" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
5765 This is different from what \x{} would do with the same
5766 codepoint, where the condition is > 0xFF.
5773 /* warn if we havent used the whole string? */
5775 if (numlen<len && SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5777 "Ignoring excess chars from \\N{%s} in character class",
5781 } else if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5783 "Ignoring zero length \\N{%s} in character class",
5788 SvREFCNT_dec(sv_name);
5790 SvREFCNT_dec(sv_str);
5791 return len ? NULL : (regnode *)&len;
5792 } else if(SvCUR(sv_str)) {
5797 char * parse_start = name-3; /* needed for the offsets */
5798 GET_RE_DEBUG_FLAGS_DECL; /* needed for the offsets */
5800 ret = reg_node(pRExC_state,
5801 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
5804 if ( RExC_utf8 && !SvUTF8(sv_str) ) {
5805 sv_utf8_upgrade(sv_str);
5806 } else if ( !RExC_utf8 && SvUTF8(sv_str) ) {
5810 p = SvPV(sv_str, len);
5812 /* len is the length written, charlen is the size the char read */
5813 for ( len = 0; p < pend; p += charlen ) {
5815 UV uvc = utf8_to_uvchr((U8*)p, &charlen);
5817 STRLEN foldlen,numlen;
5818 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
5819 uvc = toFOLD_uni(uvc, tmpbuf, &foldlen);
5820 /* Emit all the Unicode characters. */
5822 for (foldbuf = tmpbuf;
5826 uvc = utf8_to_uvchr(foldbuf, &numlen);
5828 const STRLEN unilen = reguni(pRExC_state, uvc, s);
5831 /* In EBCDIC the numlen
5832 * and unilen can differ. */
5834 if (numlen >= foldlen)
5838 break; /* "Can't happen." */
5841 const STRLEN unilen = reguni(pRExC_state, uvc, s);
5853 RExC_size += STR_SZ(len);
5856 RExC_emit += STR_SZ(len);
5858 Set_Node_Cur_Length(ret); /* MJD */
5860 nextchar(pRExC_state);
5862 ret = reg_node(pRExC_state,NOTHING);
5865 SvREFCNT_dec(sv_str);
5868 SvREFCNT_dec(sv_name);
5878 * It returns the code point in utf8 for the value in *encp.
5879 * value: a code value in the source encoding
5880 * encp: a pointer to an Encode object
5882 * If the result from Encode is not a single character,
5883 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
5886 S_reg_recode(pTHX_ const char value, SV **encp)
5889 SV * const sv = sv_2mortal(newSVpvn(&value, numlen));
5890 const char * const s = encp && *encp ? sv_recode_to_utf8(sv, *encp)
5892 const STRLEN newlen = SvCUR(sv);
5893 UV uv = UNICODE_REPLACEMENT;
5897 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
5900 if (!newlen || numlen != newlen) {
5901 uv = UNICODE_REPLACEMENT;
5910 - regatom - the lowest level
5912 * Optimization: gobbles an entire sequence of ordinary characters so that
5913 * it can turn them into a single node, which is smaller to store and
5914 * faster to run. Backslashed characters are exceptions, each becoming a
5915 * separate node; the code is simpler that way and it's not worth fixing.
5917 * [Yes, it is worth fixing, some scripts can run twice the speed.]
5918 * [It looks like its ok, as in S_study_chunk we merge adjacent EXACT nodes]
5921 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
5924 register regnode *ret = NULL;
5926 char *parse_start = RExC_parse;
5927 GET_RE_DEBUG_FLAGS_DECL;
5928 DEBUG_PARSE("atom");
5929 *flagp = WORST; /* Tentatively. */
5932 switch (*RExC_parse) {
5934 RExC_seen_zerolen++;
5935 nextchar(pRExC_state);
5936 if (RExC_flags & PMf_MULTILINE)
5937 ret = reg_node(pRExC_state, MBOL);
5938 else if (RExC_flags & PMf_SINGLELINE)
5939 ret = reg_node(pRExC_state, SBOL);
5941 ret = reg_node(pRExC_state, BOL);
5942 Set_Node_Length(ret, 1); /* MJD */
5945 nextchar(pRExC_state);
5947 RExC_seen_zerolen++;
5948 if (RExC_flags & PMf_MULTILINE)
5949 ret = reg_node(pRExC_state, MEOL);
5950 else if (RExC_flags & PMf_SINGLELINE)
5951 ret = reg_node(pRExC_state, SEOL);
5953 ret = reg_node(pRExC_state, EOL);
5954 Set_Node_Length(ret, 1); /* MJD */
5957 nextchar(pRExC_state);
5958 if (RExC_flags & PMf_SINGLELINE)
5959 ret = reg_node(pRExC_state, SANY);
5961 ret = reg_node(pRExC_state, REG_ANY);
5962 *flagp |= HASWIDTH|SIMPLE;
5964 Set_Node_Length(ret, 1); /* MJD */
5968 char * const oregcomp_parse = ++RExC_parse;
5969 ret = regclass(pRExC_state,depth+1);
5970 if (*RExC_parse != ']') {
5971 RExC_parse = oregcomp_parse;
5972 vFAIL("Unmatched [");
5974 nextchar(pRExC_state);
5975 *flagp |= HASWIDTH|SIMPLE;
5976 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
5980 nextchar(pRExC_state);
5981 ret = reg(pRExC_state, 1, &flags,depth+1);
5983 if (flags & TRYAGAIN) {
5984 if (RExC_parse == RExC_end) {
5985 /* Make parent create an empty node if needed. */
5993 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
5997 if (flags & TRYAGAIN) {
6001 vFAIL("Internal urp");
6002 /* Supposed to be caught earlier. */
6005 if (!regcurly(RExC_parse)) {
6014 vFAIL("Quantifier follows nothing");
6017 switch (*++RExC_parse) {
6019 RExC_seen_zerolen++;
6020 ret = reg_node(pRExC_state, SBOL);
6022 nextchar(pRExC_state);
6023 Set_Node_Length(ret, 2); /* MJD */
6026 ret = reg_node(pRExC_state, GPOS);
6027 RExC_seen |= REG_SEEN_GPOS;
6029 nextchar(pRExC_state);
6030 Set_Node_Length(ret, 2); /* MJD */
6033 ret = reg_node(pRExC_state, SEOL);
6035 RExC_seen_zerolen++; /* Do not optimize RE away */
6036 nextchar(pRExC_state);
6039 ret = reg_node(pRExC_state, EOS);
6041 RExC_seen_zerolen++; /* Do not optimize RE away */
6042 nextchar(pRExC_state);
6043 Set_Node_Length(ret, 2); /* MJD */
6046 ret = reg_node(pRExC_state, CANY);
6047 RExC_seen |= REG_SEEN_CANY;
6048 *flagp |= HASWIDTH|SIMPLE;
6049 nextchar(pRExC_state);
6050 Set_Node_Length(ret, 2); /* MJD */
6053 ret = reg_node(pRExC_state, CLUMP);
6055 nextchar(pRExC_state);
6056 Set_Node_Length(ret, 2); /* MJD */
6059 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
6060 *flagp |= HASWIDTH|SIMPLE;
6061 nextchar(pRExC_state);
6062 Set_Node_Length(ret, 2); /* MJD */
6065 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
6066 *flagp |= HASWIDTH|SIMPLE;
6067 nextchar(pRExC_state);
6068 Set_Node_Length(ret, 2); /* MJD */
6071 RExC_seen_zerolen++;
6072 RExC_seen |= REG_SEEN_LOOKBEHIND;
6073 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
6075 nextchar(pRExC_state);
6076 Set_Node_Length(ret, 2); /* MJD */
6079 RExC_seen_zerolen++;
6080 RExC_seen |= REG_SEEN_LOOKBEHIND;
6081 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
6083 nextchar(pRExC_state);
6084 Set_Node_Length(ret, 2); /* MJD */
6087 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
6088 *flagp |= HASWIDTH|SIMPLE;
6089 nextchar(pRExC_state);
6090 Set_Node_Length(ret, 2); /* MJD */
6093 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
6094 *flagp |= HASWIDTH|SIMPLE;
6095 nextchar(pRExC_state);
6096 Set_Node_Length(ret, 2); /* MJD */
6099 ret = reg_node(pRExC_state, DIGIT);
6100 *flagp |= HASWIDTH|SIMPLE;
6101 nextchar(pRExC_state);
6102 Set_Node_Length(ret, 2); /* MJD */
6105 ret = reg_node(pRExC_state, NDIGIT);
6106 *flagp |= HASWIDTH|SIMPLE;
6107 nextchar(pRExC_state);
6108 Set_Node_Length(ret, 2); /* MJD */
6113 char* const oldregxend = RExC_end;
6114 char* parse_start = RExC_parse - 2;
6116 if (RExC_parse[1] == '{') {
6117 /* a lovely hack--pretend we saw [\pX] instead */
6118 RExC_end = strchr(RExC_parse, '}');
6120 const U8 c = (U8)*RExC_parse;
6122 RExC_end = oldregxend;
6123 vFAIL2("Missing right brace on \\%c{}", c);
6128 RExC_end = RExC_parse + 2;
6129 if (RExC_end > oldregxend)
6130 RExC_end = oldregxend;
6134 ret = regclass(pRExC_state,depth+1);
6136 RExC_end = oldregxend;
6139 Set_Node_Offset(ret, parse_start + 2);
6140 Set_Node_Cur_Length(ret);
6141 nextchar(pRExC_state);
6142 *flagp |= HASWIDTH|SIMPLE;
6146 /* Handle \N{NAME} here and not below because it can be
6147 multicharacter. join_exact() will join them up later on.
6148 Also this makes sure that things like /\N{BLAH}+/ and
6149 \N{BLAH} being multi char Just Happen. dmq*/
6151 ret= reg_namedseq(pRExC_state, NULL);
6153 case 'k': /* Handle \k<NAME> and \k'NAME' */
6155 char ch= RExC_parse[1];
6156 if (ch != '<' && ch != '\'') {
6158 vWARN( RExC_parse + 1,
6159 "Possible broken named back reference treated as literal k");
6163 char* name_start = (RExC_parse += 2);
6165 SV *sv_dat = reg_scan_name(pRExC_state,
6166 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6167 ch= (ch == '<') ? '>' : '\'';
6169 if (RExC_parse == name_start || *RExC_parse != ch)
6170 vFAIL2("Sequence \\k%c... not terminated",
6171 (ch == '>' ? '<' : ch));
6174 ret = reganode(pRExC_state,
6175 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
6181 num = add_data( pRExC_state, 1, "S" );
6183 RExC_rx->data->data[num]=(void*)sv_dat;
6184 SvREFCNT_inc(sv_dat);
6186 /* override incorrect value set in reganode MJD */
6187 Set_Node_Offset(ret, parse_start+1);
6188 Set_Node_Cur_Length(ret); /* MJD */
6189 nextchar(pRExC_state);
6204 case '1': case '2': case '3': case '4':
6205 case '5': case '6': case '7': case '8': case '9':
6207 const I32 num = atoi(RExC_parse);
6209 if (num > 9 && num >= RExC_npar)
6212 char * const parse_start = RExC_parse - 1; /* MJD */
6213 while (isDIGIT(*RExC_parse))
6216 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
6217 vFAIL("Reference to nonexistent group");
6219 ret = reganode(pRExC_state,
6220 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
6224 /* override incorrect value set in reganode MJD */
6225 Set_Node_Offset(ret, parse_start+1);
6226 Set_Node_Cur_Length(ret); /* MJD */
6228 nextchar(pRExC_state);
6233 if (RExC_parse >= RExC_end)
6234 FAIL("Trailing \\");
6237 /* Do not generate "unrecognized" warnings here, we fall
6238 back into the quick-grab loop below */
6245 if (RExC_flags & PMf_EXTENDED) {
6246 while (RExC_parse < RExC_end && *RExC_parse != '\n')
6248 if (RExC_parse < RExC_end)
6254 register STRLEN len;
6259 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
6261 parse_start = RExC_parse - 1;
6267 ret = reg_node(pRExC_state,
6268 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
6270 for (len = 0, p = RExC_parse - 1;
6271 len < 127 && p < RExC_end;
6274 char * const oldp = p;
6276 if (RExC_flags & PMf_EXTENDED)
6277 p = regwhite(p, RExC_end);
6325 ender = ASCII_TO_NATIVE('\033');
6329 ender = ASCII_TO_NATIVE('\007');
6334 char* const e = strchr(p, '}');
6338 vFAIL("Missing right brace on \\x{}");
6341 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6342 | PERL_SCAN_DISALLOW_PREFIX;
6343 STRLEN numlen = e - p - 1;
6344 ender = grok_hex(p + 1, &numlen, &flags, NULL);
6351 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
6353 ender = grok_hex(p, &numlen, &flags, NULL);
6356 if (PL_encoding && ender < 0x100)
6357 goto recode_encoding;
6361 ender = UCHARAT(p++);
6362 ender = toCTRL(ender);
6364 case '0': case '1': case '2': case '3':case '4':
6365 case '5': case '6': case '7': case '8':case '9':
6367 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
6370 ender = grok_oct(p, &numlen, &flags, NULL);
6377 if (PL_encoding && ender < 0x100)
6378 goto recode_encoding;
6382 SV* enc = PL_encoding;
6383 ender = reg_recode((const char)(U8)ender, &enc);
6384 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
6385 vWARN(p, "Invalid escape in the specified encoding");
6391 FAIL("Trailing \\");
6394 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
6395 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
6396 goto normal_default;
6401 if (UTF8_IS_START(*p) && UTF) {
6403 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
6404 &numlen, UTF8_ALLOW_DEFAULT);
6411 if (RExC_flags & PMf_EXTENDED)
6412 p = regwhite(p, RExC_end);
6414 /* Prime the casefolded buffer. */
6415 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
6417 if (ISMULT2(p)) { /* Back off on ?+*. */
6422 /* Emit all the Unicode characters. */
6424 for (foldbuf = tmpbuf;
6426 foldlen -= numlen) {
6427 ender = utf8_to_uvchr(foldbuf, &numlen);
6429 const STRLEN unilen = reguni(pRExC_state, ender, s);
6432 /* In EBCDIC the numlen
6433 * and unilen can differ. */
6435 if (numlen >= foldlen)
6439 break; /* "Can't happen." */
6443 const STRLEN unilen = reguni(pRExC_state, ender, s);
6452 REGC((char)ender, s++);
6458 /* Emit all the Unicode characters. */
6460 for (foldbuf = tmpbuf;
6462 foldlen -= numlen) {
6463 ender = utf8_to_uvchr(foldbuf, &numlen);
6465 const STRLEN unilen = reguni(pRExC_state, ender, s);
6468 /* In EBCDIC the numlen
6469 * and unilen can differ. */
6471 if (numlen >= foldlen)
6479 const STRLEN unilen = reguni(pRExC_state, ender, s);
6488 REGC((char)ender, s++);
6492 Set_Node_Cur_Length(ret); /* MJD */
6493 nextchar(pRExC_state);
6495 /* len is STRLEN which is unsigned, need to copy to signed */
6498 vFAIL("Internal disaster");
6502 if (len == 1 && UNI_IS_INVARIANT(ender))
6506 RExC_size += STR_SZ(len);
6509 RExC_emit += STR_SZ(len);
6519 S_regwhite(char *p, const char *e)
6524 else if (*p == '#') {
6527 } while (p < e && *p != '\n');
6535 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
6536 Character classes ([:foo:]) can also be negated ([:^foo:]).
6537 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
6538 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
6539 but trigger failures because they are currently unimplemented. */
6541 #define POSIXCC_DONE(c) ((c) == ':')
6542 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
6543 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
6546 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
6549 I32 namedclass = OOB_NAMEDCLASS;
6551 if (value == '[' && RExC_parse + 1 < RExC_end &&
6552 /* I smell either [: or [= or [. -- POSIX has been here, right? */
6553 POSIXCC(UCHARAT(RExC_parse))) {
6554 const char c = UCHARAT(RExC_parse);
6555 char* const s = RExC_parse++;
6557 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
6559 if (RExC_parse == RExC_end)
6560 /* Grandfather lone [:, [=, [. */
6563 const char* const t = RExC_parse++; /* skip over the c */
6566 if (UCHARAT(RExC_parse) == ']') {
6567 const char *posixcc = s + 1;
6568 RExC_parse++; /* skip over the ending ] */
6571 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
6572 const I32 skip = t - posixcc;
6574 /* Initially switch on the length of the name. */
6577 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
6578 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
6581 /* Names all of length 5. */
6582 /* alnum alpha ascii blank cntrl digit graph lower
6583 print punct space upper */
6584 /* Offset 4 gives the best switch position. */
6585 switch (posixcc[4]) {
6587 if (memEQ(posixcc, "alph", 4)) /* alpha */
6588 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
6591 if (memEQ(posixcc, "spac", 4)) /* space */
6592 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
6595 if (memEQ(posixcc, "grap", 4)) /* graph */
6596 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
6599 if (memEQ(posixcc, "asci", 4)) /* ascii */
6600 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
6603 if (memEQ(posixcc, "blan", 4)) /* blank */
6604 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
6607 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
6608 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
6611 if (memEQ(posixcc, "alnu", 4)) /* alnum */
6612 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
6615 if (memEQ(posixcc, "lowe", 4)) /* lower */
6616 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
6617 else if (memEQ(posixcc, "uppe", 4)) /* upper */
6618 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
6621 if (memEQ(posixcc, "digi", 4)) /* digit */
6622 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
6623 else if (memEQ(posixcc, "prin", 4)) /* print */
6624 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
6625 else if (memEQ(posixcc, "punc", 4)) /* punct */
6626 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
6631 if (memEQ(posixcc, "xdigit", 6))
6632 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
6636 if (namedclass == OOB_NAMEDCLASS)
6637 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
6639 assert (posixcc[skip] == ':');
6640 assert (posixcc[skip+1] == ']');
6641 } else if (!SIZE_ONLY) {
6642 /* [[=foo=]] and [[.foo.]] are still future. */
6644 /* adjust RExC_parse so the warning shows after
6646 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
6648 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
6651 /* Maternal grandfather:
6652 * "[:" ending in ":" but not in ":]" */
6662 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
6665 if (POSIXCC(UCHARAT(RExC_parse))) {
6666 const char *s = RExC_parse;
6667 const char c = *s++;
6671 if (*s && c == *s && s[1] == ']') {
6672 if (ckWARN(WARN_REGEXP))
6674 "POSIX syntax [%c %c] belongs inside character classes",
6677 /* [[=foo=]] and [[.foo.]] are still future. */
6678 if (POSIXCC_NOTYET(c)) {
6679 /* adjust RExC_parse so the error shows after
6681 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
6683 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
6691 parse a class specification and produce either an ANYOF node that
6692 matches the pattern. If the pattern matches a single char only and
6693 that char is < 256 then we produce an EXACT node instead.
6696 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
6699 register UV value = 0;
6700 register UV nextvalue;
6701 register IV prevvalue = OOB_UNICODE;
6702 register IV range = 0;
6703 register regnode *ret;
6706 char *rangebegin = NULL;
6707 bool need_class = 0;
6710 bool optimize_invert = TRUE;
6711 AV* unicode_alternate = NULL;
6713 UV literal_endpoint = 0;
6715 UV stored = 0; /* number of chars stored in the class */
6717 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
6718 case we need to change the emitted regop to an EXACT. */
6719 const char * orig_parse = RExC_parse;
6720 GET_RE_DEBUG_FLAGS_DECL;
6722 PERL_UNUSED_ARG(depth);
6725 DEBUG_PARSE("clas");
6727 /* Assume we are going to generate an ANYOF node. */
6728 ret = reganode(pRExC_state, ANYOF, 0);
6731 ANYOF_FLAGS(ret) = 0;
6733 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
6737 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
6741 RExC_size += ANYOF_SKIP;
6742 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
6745 RExC_emit += ANYOF_SKIP;
6747 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
6749 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
6750 ANYOF_BITMAP_ZERO(ret);
6751 listsv = newSVpvs("# comment\n");
6754 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
6756 if (!SIZE_ONLY && POSIXCC(nextvalue))
6757 checkposixcc(pRExC_state);
6759 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
6760 if (UCHARAT(RExC_parse) == ']')
6764 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
6768 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
6771 rangebegin = RExC_parse;
6773 value = utf8n_to_uvchr((U8*)RExC_parse,
6774 RExC_end - RExC_parse,
6775 &numlen, UTF8_ALLOW_DEFAULT);
6776 RExC_parse += numlen;
6779 value = UCHARAT(RExC_parse++);
6781 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
6782 if (value == '[' && POSIXCC(nextvalue))
6783 namedclass = regpposixcc(pRExC_state, value);
6784 else if (value == '\\') {
6786 value = utf8n_to_uvchr((U8*)RExC_parse,
6787 RExC_end - RExC_parse,
6788 &numlen, UTF8_ALLOW_DEFAULT);
6789 RExC_parse += numlen;
6792 value = UCHARAT(RExC_parse++);
6793 /* Some compilers cannot handle switching on 64-bit integer
6794 * values, therefore value cannot be an UV. Yes, this will
6795 * be a problem later if we want switch on Unicode.
6796 * A similar issue a little bit later when switching on
6797 * namedclass. --jhi */
6798 switch ((I32)value) {
6799 case 'w': namedclass = ANYOF_ALNUM; break;
6800 case 'W': namedclass = ANYOF_NALNUM; break;
6801 case 's': namedclass = ANYOF_SPACE; break;
6802 case 'S': namedclass = ANYOF_NSPACE; break;
6803 case 'd': namedclass = ANYOF_DIGIT; break;
6804 case 'D': namedclass = ANYOF_NDIGIT; break;
6805 case 'N': /* Handle \N{NAME} in class */
6807 /* We only pay attention to the first char of
6808 multichar strings being returned. I kinda wonder
6809 if this makes sense as it does change the behaviour
6810 from earlier versions, OTOH that behaviour was broken
6812 UV v; /* value is register so we cant & it /grrr */
6813 if (reg_namedseq(pRExC_state, &v)) {
6823 if (RExC_parse >= RExC_end)
6824 vFAIL2("Empty \\%c{}", (U8)value);
6825 if (*RExC_parse == '{') {
6826 const U8 c = (U8)value;
6827 e = strchr(RExC_parse++, '}');
6829 vFAIL2("Missing right brace on \\%c{}", c);
6830 while (isSPACE(UCHARAT(RExC_parse)))
6832 if (e == RExC_parse)
6833 vFAIL2("Empty \\%c{}", c);
6835 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
6843 if (UCHARAT(RExC_parse) == '^') {
6846 value = value == 'p' ? 'P' : 'p'; /* toggle */
6847 while (isSPACE(UCHARAT(RExC_parse))) {
6852 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
6853 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
6856 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
6857 namedclass = ANYOF_MAX; /* no official name, but it's named */
6860 case 'n': value = '\n'; break;
6861 case 'r': value = '\r'; break;
6862 case 't': value = '\t'; break;
6863 case 'f': value = '\f'; break;
6864 case 'b': value = '\b'; break;
6865 case 'e': value = ASCII_TO_NATIVE('\033');break;
6866 case 'a': value = ASCII_TO_NATIVE('\007');break;
6868 if (*RExC_parse == '{') {
6869 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6870 | PERL_SCAN_DISALLOW_PREFIX;
6871 char * const e = strchr(RExC_parse++, '}');
6873 vFAIL("Missing right brace on \\x{}");
6875 numlen = e - RExC_parse;
6876 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
6880 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
6882 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
6883 RExC_parse += numlen;
6885 if (PL_encoding && value < 0x100)
6886 goto recode_encoding;
6889 value = UCHARAT(RExC_parse++);
6890 value = toCTRL(value);
6892 case '0': case '1': case '2': case '3': case '4':
6893 case '5': case '6': case '7': case '8': case '9':
6897 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
6898 RExC_parse += numlen;
6899 if (PL_encoding && value < 0x100)
6900 goto recode_encoding;
6905 SV* enc = PL_encoding;
6906 value = reg_recode((const char)(U8)value, &enc);
6907 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
6909 "Invalid escape in the specified encoding");
6913 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
6915 "Unrecognized escape \\%c in character class passed through",
6919 } /* end of \blah */
6925 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
6927 if (!SIZE_ONLY && !need_class)
6928 ANYOF_CLASS_ZERO(ret);
6932 /* a bad range like a-\d, a-[:digit:] ? */
6935 if (ckWARN(WARN_REGEXP)) {
6937 RExC_parse >= rangebegin ?
6938 RExC_parse - rangebegin : 0;
6940 "False [] range \"%*.*s\"",
6943 if (prevvalue < 256) {
6944 ANYOF_BITMAP_SET(ret, prevvalue);
6945 ANYOF_BITMAP_SET(ret, '-');
6948 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
6949 Perl_sv_catpvf(aTHX_ listsv,
6950 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
6954 range = 0; /* this was not a true range */
6958 const char *what = NULL;
6961 if (namedclass > OOB_NAMEDCLASS)
6962 optimize_invert = FALSE;
6963 /* Possible truncation here but in some 64-bit environments
6964 * the compiler gets heartburn about switch on 64-bit values.
6965 * A similar issue a little earlier when switching on value.
6967 switch ((I32)namedclass) {
6970 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
6972 for (value = 0; value < 256; value++)
6974 ANYOF_BITMAP_SET(ret, value);
6981 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
6983 for (value = 0; value < 256; value++)
6984 if (!isALNUM(value))
6985 ANYOF_BITMAP_SET(ret, value);
6992 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
6994 for (value = 0; value < 256; value++)
6995 if (isALNUMC(value))
6996 ANYOF_BITMAP_SET(ret, value);
7003 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
7005 for (value = 0; value < 256; value++)
7006 if (!isALNUMC(value))
7007 ANYOF_BITMAP_SET(ret, value);
7014 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
7016 for (value = 0; value < 256; value++)
7018 ANYOF_BITMAP_SET(ret, value);
7025 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
7027 for (value = 0; value < 256; value++)
7028 if (!isALPHA(value))
7029 ANYOF_BITMAP_SET(ret, value);
7036 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
7039 for (value = 0; value < 128; value++)
7040 ANYOF_BITMAP_SET(ret, value);
7042 for (value = 0; value < 256; value++) {
7044 ANYOF_BITMAP_SET(ret, value);
7053 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
7056 for (value = 128; value < 256; value++)
7057 ANYOF_BITMAP_SET(ret, value);
7059 for (value = 0; value < 256; value++) {
7060 if (!isASCII(value))
7061 ANYOF_BITMAP_SET(ret, value);
7070 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
7072 for (value = 0; value < 256; value++)
7074 ANYOF_BITMAP_SET(ret, value);
7081 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
7083 for (value = 0; value < 256; value++)
7084 if (!isBLANK(value))
7085 ANYOF_BITMAP_SET(ret, value);
7092 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
7094 for (value = 0; value < 256; value++)
7096 ANYOF_BITMAP_SET(ret, value);
7103 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
7105 for (value = 0; value < 256; value++)
7106 if (!isCNTRL(value))
7107 ANYOF_BITMAP_SET(ret, value);
7114 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
7116 /* consecutive digits assumed */
7117 for (value = '0'; value <= '9'; value++)
7118 ANYOF_BITMAP_SET(ret, value);
7125 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
7127 /* consecutive digits assumed */
7128 for (value = 0; value < '0'; value++)
7129 ANYOF_BITMAP_SET(ret, value);
7130 for (value = '9' + 1; value < 256; value++)
7131 ANYOF_BITMAP_SET(ret, value);
7138 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
7140 for (value = 0; value < 256; value++)
7142 ANYOF_BITMAP_SET(ret, value);
7149 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
7151 for (value = 0; value < 256; value++)
7152 if (!isGRAPH(value))
7153 ANYOF_BITMAP_SET(ret, value);
7160 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
7162 for (value = 0; value < 256; value++)
7164 ANYOF_BITMAP_SET(ret, value);
7171 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
7173 for (value = 0; value < 256; value++)
7174 if (!isLOWER(value))
7175 ANYOF_BITMAP_SET(ret, value);
7182 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
7184 for (value = 0; value < 256; value++)
7186 ANYOF_BITMAP_SET(ret, value);
7193 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
7195 for (value = 0; value < 256; value++)
7196 if (!isPRINT(value))
7197 ANYOF_BITMAP_SET(ret, value);
7204 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
7206 for (value = 0; value < 256; value++)
7207 if (isPSXSPC(value))
7208 ANYOF_BITMAP_SET(ret, value);
7215 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
7217 for (value = 0; value < 256; value++)
7218 if (!isPSXSPC(value))
7219 ANYOF_BITMAP_SET(ret, value);
7226 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
7228 for (value = 0; value < 256; value++)
7230 ANYOF_BITMAP_SET(ret, value);
7237 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
7239 for (value = 0; value < 256; value++)
7240 if (!isPUNCT(value))
7241 ANYOF_BITMAP_SET(ret, value);
7248 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
7250 for (value = 0; value < 256; value++)
7252 ANYOF_BITMAP_SET(ret, value);
7259 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
7261 for (value = 0; value < 256; value++)
7262 if (!isSPACE(value))
7263 ANYOF_BITMAP_SET(ret, value);
7270 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
7272 for (value = 0; value < 256; value++)
7274 ANYOF_BITMAP_SET(ret, value);
7281 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
7283 for (value = 0; value < 256; value++)
7284 if (!isUPPER(value))
7285 ANYOF_BITMAP_SET(ret, value);
7292 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
7294 for (value = 0; value < 256; value++)
7295 if (isXDIGIT(value))
7296 ANYOF_BITMAP_SET(ret, value);
7303 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
7305 for (value = 0; value < 256; value++)
7306 if (!isXDIGIT(value))
7307 ANYOF_BITMAP_SET(ret, value);
7313 /* this is to handle \p and \P */
7316 vFAIL("Invalid [::] class");
7320 /* Strings such as "+utf8::isWord\n" */
7321 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
7324 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
7327 } /* end of namedclass \blah */
7330 if (prevvalue > (IV)value) /* b-a */ {
7331 const int w = RExC_parse - rangebegin;
7332 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
7333 range = 0; /* not a valid range */
7337 prevvalue = value; /* save the beginning of the range */
7338 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
7339 RExC_parse[1] != ']') {
7342 /* a bad range like \w-, [:word:]- ? */
7343 if (namedclass > OOB_NAMEDCLASS) {
7344 if (ckWARN(WARN_REGEXP)) {
7346 RExC_parse >= rangebegin ?
7347 RExC_parse - rangebegin : 0;
7349 "False [] range \"%*.*s\"",
7353 ANYOF_BITMAP_SET(ret, '-');
7355 range = 1; /* yeah, it's a range! */
7356 continue; /* but do it the next time */
7360 /* now is the next time */
7361 /*stored += (value - prevvalue + 1);*/
7363 if (prevvalue < 256) {
7364 const IV ceilvalue = value < 256 ? value : 255;
7367 /* In EBCDIC [\x89-\x91] should include
7368 * the \x8e but [i-j] should not. */
7369 if (literal_endpoint == 2 &&
7370 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
7371 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
7373 if (isLOWER(prevvalue)) {
7374 for (i = prevvalue; i <= ceilvalue; i++)
7376 ANYOF_BITMAP_SET(ret, i);
7378 for (i = prevvalue; i <= ceilvalue; i++)
7380 ANYOF_BITMAP_SET(ret, i);
7385 for (i = prevvalue; i <= ceilvalue; i++) {
7386 if (!ANYOF_BITMAP_TEST(ret,i)) {
7388 ANYOF_BITMAP_SET(ret, i);
7392 if (value > 255 || UTF) {
7393 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
7394 const UV natvalue = NATIVE_TO_UNI(value);
7395 stored+=2; /* can't optimize this class */
7396 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
7397 if (prevnatvalue < natvalue) { /* what about > ? */
7398 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
7399 prevnatvalue, natvalue);
7401 else if (prevnatvalue == natvalue) {
7402 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
7404 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
7406 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
7408 #ifdef EBCDIC /* RD t/uni/fold ff and 6b */
7409 if (RExC_precomp[0] == ':' &&
7410 RExC_precomp[1] == '[' &&
7411 (f == 0xDF || f == 0x92)) {
7412 f = NATIVE_TO_UNI(f);
7415 /* If folding and foldable and a single
7416 * character, insert also the folded version
7417 * to the charclass. */
7419 #ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
7420 if ((RExC_precomp[0] == ':' &&
7421 RExC_precomp[1] == '[' &&
7423 (value == 0xFB05 || value == 0xFB06))) ?
7424 foldlen == ((STRLEN)UNISKIP(f) - 1) :
7425 foldlen == (STRLEN)UNISKIP(f) )
7427 if (foldlen == (STRLEN)UNISKIP(f))
7429 Perl_sv_catpvf(aTHX_ listsv,
7432 /* Any multicharacter foldings
7433 * require the following transform:
7434 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
7435 * where E folds into "pq" and F folds
7436 * into "rst", all other characters
7437 * fold to single characters. We save
7438 * away these multicharacter foldings,
7439 * to be later saved as part of the
7440 * additional "s" data. */
7443 if (!unicode_alternate)
7444 unicode_alternate = newAV();
7445 sv = newSVpvn((char*)foldbuf, foldlen);
7447 av_push(unicode_alternate, sv);
7451 /* If folding and the value is one of the Greek
7452 * sigmas insert a few more sigmas to make the
7453 * folding rules of the sigmas to work right.
7454 * Note that not all the possible combinations
7455 * are handled here: some of them are handled
7456 * by the standard folding rules, and some of
7457 * them (literal or EXACTF cases) are handled
7458 * during runtime in regexec.c:S_find_byclass(). */
7459 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
7460 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
7461 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
7462 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
7463 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
7465 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
7466 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
7467 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
7472 literal_endpoint = 0;
7476 range = 0; /* this range (if it was one) is done now */
7480 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
7482 RExC_size += ANYOF_CLASS_ADD_SKIP;
7484 RExC_emit += ANYOF_CLASS_ADD_SKIP;
7490 /****** !SIZE_ONLY AFTER HERE *********/
7492 if( stored == 1 && value < 256
7493 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
7495 /* optimize single char class to an EXACT node
7496 but *only* when its not a UTF/high char */
7497 const char * cur_parse= RExC_parse;
7498 RExC_emit = (regnode *)orig_emit;
7499 RExC_parse = (char *)orig_parse;
7500 ret = reg_node(pRExC_state,
7501 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
7502 RExC_parse = (char *)cur_parse;
7503 *STRING(ret)= (char)value;
7505 RExC_emit += STR_SZ(1);
7508 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
7509 if ( /* If the only flag is folding (plus possibly inversion). */
7510 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
7512 for (value = 0; value < 256; ++value) {
7513 if (ANYOF_BITMAP_TEST(ret, value)) {
7514 UV fold = PL_fold[value];
7517 ANYOF_BITMAP_SET(ret, fold);
7520 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
7523 /* optimize inverted simple patterns (e.g. [^a-z]) */
7524 if (optimize_invert &&
7525 /* If the only flag is inversion. */
7526 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
7527 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
7528 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
7529 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
7532 AV * const av = newAV();
7534 /* The 0th element stores the character class description
7535 * in its textual form: used later (regexec.c:Perl_regclass_swash())
7536 * to initialize the appropriate swash (which gets stored in
7537 * the 1st element), and also useful for dumping the regnode.
7538 * The 2nd element stores the multicharacter foldings,
7539 * used later (regexec.c:S_reginclass()). */
7540 av_store(av, 0, listsv);
7541 av_store(av, 1, NULL);
7542 av_store(av, 2, (SV*)unicode_alternate);
7543 rv = newRV_noinc((SV*)av);
7544 n = add_data(pRExC_state, 1, "s");
7545 RExC_rx->data->data[n] = (void*)rv;
7552 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
7554 char* const retval = RExC_parse++;
7557 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
7558 RExC_parse[2] == '#') {
7559 while (*RExC_parse != ')') {
7560 if (RExC_parse == RExC_end)
7561 FAIL("Sequence (?#... not terminated");
7567 if (RExC_flags & PMf_EXTENDED) {
7568 if (isSPACE(*RExC_parse)) {
7572 else if (*RExC_parse == '#') {
7573 while (RExC_parse < RExC_end)
7574 if (*RExC_parse++ == '\n') break;
7583 - reg_node - emit a node
7585 STATIC regnode * /* Location. */
7586 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
7589 register regnode *ptr;
7590 regnode * const ret = RExC_emit;
7591 GET_RE_DEBUG_FLAGS_DECL;
7594 SIZE_ALIGN(RExC_size);
7599 if (OP(RExC_emit) == 255)
7600 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %s: %d ",
7601 reg_name[op], OP(RExC_emit));
7603 NODE_ALIGN_FILL(ret);
7605 FILL_ADVANCE_NODE(ptr, op);
7606 if (RExC_offsets) { /* MJD */
7607 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
7608 "reg_node", __LINE__,
7610 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
7611 ? "Overwriting end of array!\n" : "OK",
7612 (UV)(RExC_emit - RExC_emit_start),
7613 (UV)(RExC_parse - RExC_start),
7614 (UV)RExC_offsets[0]));
7615 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
7623 - reganode - emit a node with an argument
7625 STATIC regnode * /* Location. */
7626 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
7629 register regnode *ptr;
7630 regnode * const ret = RExC_emit;
7631 GET_RE_DEBUG_FLAGS_DECL;
7634 SIZE_ALIGN(RExC_size);
7639 assert(2==regarglen[op]+1);
7641 Anything larger than this has to allocate the extra amount.
7642 If we changed this to be:
7644 RExC_size += (1 + regarglen[op]);
7646 then it wouldn't matter. Its not clear what side effect
7647 might come from that so its not done so far.
7653 if (OP(RExC_emit) == 255)
7654 Perl_croak(aTHX_ "panic: reganode overwriting end of allocated program space");
7656 NODE_ALIGN_FILL(ret);
7658 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
7659 if (RExC_offsets) { /* MJD */
7660 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
7664 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
7665 "Overwriting end of array!\n" : "OK",
7666 (UV)(RExC_emit - RExC_emit_start),
7667 (UV)(RExC_parse - RExC_start),
7668 (UV)RExC_offsets[0]));
7669 Set_Cur_Node_Offset;
7677 - reguni - emit (if appropriate) a Unicode character
7680 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
7683 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
7687 - reginsert - insert an operator in front of already-emitted operand
7689 * Means relocating the operand.
7692 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
7695 register regnode *src;
7696 register regnode *dst;
7697 register regnode *place;
7698 const int offset = regarglen[(U8)op];
7699 const int size = NODE_STEP_REGNODE + offset;
7700 GET_RE_DEBUG_FLAGS_DECL;
7701 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
7702 DEBUG_PARSE_FMT("inst"," - %s",reg_name[op]);
7711 if (RExC_open_parens) {
7713 DEBUG_PARSE_FMT("inst"," - %"IVdf,RExC_npar);
7714 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
7715 if ( RExC_open_parens[paren] >= opnd ) {
7716 DEBUG_PARSE_FMT("open"," - %d",size);
7717 RExC_open_parens[paren] += size;
7719 DEBUG_PARSE_FMT("open"," - %s","ok");
7721 if ( RExC_close_parens[paren] >= opnd ) {
7722 DEBUG_PARSE_FMT("close"," - %d",size);
7723 RExC_close_parens[paren] += size;
7725 DEBUG_PARSE_FMT("close"," - %s","ok");
7730 while (src > opnd) {
7731 StructCopy(--src, --dst, regnode);
7732 if (RExC_offsets) { /* MJD 20010112 */
7733 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
7737 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
7738 ? "Overwriting end of array!\n" : "OK",
7739 (UV)(src - RExC_emit_start),
7740 (UV)(dst - RExC_emit_start),
7741 (UV)RExC_offsets[0]));
7742 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
7743 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
7748 place = opnd; /* Op node, where operand used to be. */
7749 if (RExC_offsets) { /* MJD */
7750 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
7754 (UV)(place - RExC_emit_start) > RExC_offsets[0]
7755 ? "Overwriting end of array!\n" : "OK",
7756 (UV)(place - RExC_emit_start),
7757 (UV)(RExC_parse - RExC_start),
7758 (UV)RExC_offsets[0]));
7759 Set_Node_Offset(place, RExC_parse);
7760 Set_Node_Length(place, 1);
7762 src = NEXTOPER(place);
7763 FILL_ADVANCE_NODE(place, op);
7764 Zero(src, offset, regnode);
7768 - regtail - set the next-pointer at the end of a node chain of p to val.
7769 - SEE ALSO: regtail_study
7771 /* TODO: All three parms should be const */
7773 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
7776 register regnode *scan;
7777 GET_RE_DEBUG_FLAGS_DECL;
7779 PERL_UNUSED_ARG(depth);
7785 /* Find last node. */
7788 regnode * const temp = regnext(scan);
7790 SV * const mysv=sv_newmortal();
7791 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
7792 regprop(RExC_rx, mysv, scan);
7793 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
7794 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
7795 (temp == NULL ? "->" : ""),
7796 (temp == NULL ? reg_name[OP(val)] : "")
7804 if (reg_off_by_arg[OP(scan)]) {
7805 ARG_SET(scan, val - scan);
7808 NEXT_OFF(scan) = val - scan;
7814 - regtail_study - set the next-pointer at the end of a node chain of p to val.
7815 - Look for optimizable sequences at the same time.
7816 - currently only looks for EXACT chains.
7818 This is expermental code. The idea is to use this routine to perform
7819 in place optimizations on branches and groups as they are constructed,
7820 with the long term intention of removing optimization from study_chunk so
7821 that it is purely analytical.
7823 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
7824 to control which is which.
7827 /* TODO: All four parms should be const */
7830 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
7833 register regnode *scan;
7835 #ifdef EXPERIMENTAL_INPLACESCAN
7839 GET_RE_DEBUG_FLAGS_DECL;
7845 /* Find last node. */
7849 regnode * const temp = regnext(scan);
7850 #ifdef EXPERIMENTAL_INPLACESCAN
7851 if (PL_regkind[OP(scan)] == EXACT)
7852 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
7860 if( exact == PSEUDO )
7862 else if ( exact != OP(scan) )
7871 SV * const mysv=sv_newmortal();
7872 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
7873 regprop(RExC_rx, mysv, scan);
7874 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
7875 SvPV_nolen_const(mysv),
7884 SV * const mysv_val=sv_newmortal();
7885 DEBUG_PARSE_MSG("");
7886 regprop(RExC_rx, mysv_val, val);
7887 PerlIO_printf(Perl_debug_log, "~ attach to %s (%d) offset to %d\n",
7888 SvPV_nolen_const(mysv_val),
7893 if (reg_off_by_arg[OP(scan)]) {
7894 ARG_SET(scan, val - scan);
7897 NEXT_OFF(scan) = val - scan;
7905 - regcurly - a little FSA that accepts {\d+,?\d*}
7908 S_regcurly(register const char *s)
7927 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
7930 Perl_regdump(pTHX_ const regexp *r)
7934 SV * const sv = sv_newmortal();
7935 SV *dsv= sv_newmortal();
7937 (void)dumpuntil(r, r->program, r->program + 1, NULL, NULL, sv, 0, 0);
7939 /* Header fields of interest. */
7940 if (r->anchored_substr) {
7941 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
7942 RE_SV_DUMPLEN(r->anchored_substr), 30);
7943 PerlIO_printf(Perl_debug_log,
7944 "anchored %s%s at %"IVdf" ",
7945 s, RE_SV_TAIL(r->anchored_substr),
7946 (IV)r->anchored_offset);
7947 } else if (r->anchored_utf8) {
7948 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
7949 RE_SV_DUMPLEN(r->anchored_utf8), 30);
7950 PerlIO_printf(Perl_debug_log,
7951 "anchored utf8 %s%s at %"IVdf" ",
7952 s, RE_SV_TAIL(r->anchored_utf8),
7953 (IV)r->anchored_offset);
7955 if (r->float_substr) {
7956 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
7957 RE_SV_DUMPLEN(r->float_substr), 30);
7958 PerlIO_printf(Perl_debug_log,
7959 "floating %s%s at %"IVdf"..%"UVuf" ",
7960 s, RE_SV_TAIL(r->float_substr),
7961 (IV)r->float_min_offset, (UV)r->float_max_offset);
7962 } else if (r->float_utf8) {
7963 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
7964 RE_SV_DUMPLEN(r->float_utf8), 30);
7965 PerlIO_printf(Perl_debug_log,
7966 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
7967 s, RE_SV_TAIL(r->float_utf8),
7968 (IV)r->float_min_offset, (UV)r->float_max_offset);
7970 if (r->check_substr || r->check_utf8)
7971 PerlIO_printf(Perl_debug_log,
7973 (r->check_substr == r->float_substr
7974 && r->check_utf8 == r->float_utf8
7975 ? "(checking floating" : "(checking anchored"));
7976 if (r->reganch & ROPT_NOSCAN)
7977 PerlIO_printf(Perl_debug_log, " noscan");
7978 if (r->reganch & ROPT_CHECK_ALL)
7979 PerlIO_printf(Perl_debug_log, " isall");
7980 if (r->check_substr || r->check_utf8)
7981 PerlIO_printf(Perl_debug_log, ") ");
7983 if (r->regstclass) {
7984 regprop(r, sv, r->regstclass);
7985 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
7987 if (r->reganch & ROPT_ANCH) {
7988 PerlIO_printf(Perl_debug_log, "anchored");
7989 if (r->reganch & ROPT_ANCH_BOL)
7990 PerlIO_printf(Perl_debug_log, "(BOL)");
7991 if (r->reganch & ROPT_ANCH_MBOL)
7992 PerlIO_printf(Perl_debug_log, "(MBOL)");
7993 if (r->reganch & ROPT_ANCH_SBOL)
7994 PerlIO_printf(Perl_debug_log, "(SBOL)");
7995 if (r->reganch & ROPT_ANCH_GPOS)
7996 PerlIO_printf(Perl_debug_log, "(GPOS)");
7997 PerlIO_putc(Perl_debug_log, ' ');
7999 if (r->reganch & ROPT_GPOS_SEEN)
8000 PerlIO_printf(Perl_debug_log, "GPOS ");
8001 if (r->reganch & ROPT_SKIP)
8002 PerlIO_printf(Perl_debug_log, "plus ");
8003 if (r->reganch & ROPT_IMPLICIT)
8004 PerlIO_printf(Perl_debug_log, "implicit ");
8005 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
8006 if (r->reganch & ROPT_EVAL_SEEN)
8007 PerlIO_printf(Perl_debug_log, "with eval ");
8008 PerlIO_printf(Perl_debug_log, "\n");
8010 PERL_UNUSED_CONTEXT;
8012 #endif /* DEBUGGING */
8016 - regprop - printable representation of opcode
8019 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
8024 GET_RE_DEBUG_FLAGS_DECL;
8026 sv_setpvn(sv, "", 0);
8027 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
8028 /* It would be nice to FAIL() here, but this may be called from
8029 regexec.c, and it would be hard to supply pRExC_state. */
8030 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
8031 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
8033 k = PL_regkind[OP(o)];
8036 SV * const dsv = sv_2mortal(newSVpvs(""));
8037 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
8038 * is a crude hack but it may be the best for now since
8039 * we have no flag "this EXACTish node was UTF-8"
8041 const char * const s =
8042 pv_pretty(dsv, STRING(o), STR_LEN(o), 60,
8043 PL_colors[0], PL_colors[1],
8044 PERL_PV_ESCAPE_UNI_DETECT |
8045 PERL_PV_PRETTY_ELIPSES |
8048 Perl_sv_catpvf(aTHX_ sv, " %s", s );
8049 } else if (k == TRIE) {
8050 /* print the details of the trie in dumpuntil instead, as
8051 * prog->data isn't available here */
8052 const char op = OP(o);
8053 const I32 n = ARG(o);
8054 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
8055 (reg_ac_data *)prog->data->data[n] :
8057 const reg_trie_data * const trie = !IS_TRIE_AC(op) ?
8058 (reg_trie_data*)prog->data->data[n] :
8061 Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]);
8062 DEBUG_TRIE_COMPILE_r(
8063 Perl_sv_catpvf(aTHX_ sv,
8064 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
8065 (UV)trie->startstate,
8066 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
8067 (UV)trie->wordcount,
8070 (UV)TRIE_CHARCOUNT(trie),
8071 (UV)trie->uniquecharcount
8074 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
8076 int rangestart = -1;
8077 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
8078 Perl_sv_catpvf(aTHX_ sv, "[");
8079 for (i = 0; i <= 256; i++) {
8080 if (i < 256 && BITMAP_TEST(bitmap,i)) {
8081 if (rangestart == -1)
8083 } else if (rangestart != -1) {
8084 if (i <= rangestart + 3)
8085 for (; rangestart < i; rangestart++)
8086 put_byte(sv, rangestart);
8088 put_byte(sv, rangestart);
8090 put_byte(sv, i - 1);
8095 Perl_sv_catpvf(aTHX_ sv, "]");
8098 } else if (k == CURLY) {
8099 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
8100 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
8101 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
8103 else if (k == WHILEM && o->flags) /* Ordinal/of */
8104 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
8105 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT)
8106 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
8107 else if (k == GOSUB)
8108 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
8109 else if (k == VERB) {
8111 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
8112 (SV*)prog->data->data[ ARG( o ) ]);
8113 } else if (k == LOGICAL)
8114 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
8115 else if (k == ANYOF) {
8116 int i, rangestart = -1;
8117 const U8 flags = ANYOF_FLAGS(o);
8119 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
8120 static const char * const anyofs[] = {
8153 if (flags & ANYOF_LOCALE)
8154 sv_catpvs(sv, "{loc}");
8155 if (flags & ANYOF_FOLD)
8156 sv_catpvs(sv, "{i}");
8157 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
8158 if (flags & ANYOF_INVERT)
8160 for (i = 0; i <= 256; i++) {
8161 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
8162 if (rangestart == -1)
8164 } else if (rangestart != -1) {
8165 if (i <= rangestart + 3)
8166 for (; rangestart < i; rangestart++)
8167 put_byte(sv, rangestart);
8169 put_byte(sv, rangestart);
8171 put_byte(sv, i - 1);
8177 if (o->flags & ANYOF_CLASS)
8178 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
8179 if (ANYOF_CLASS_TEST(o,i))
8180 sv_catpv(sv, anyofs[i]);
8182 if (flags & ANYOF_UNICODE)
8183 sv_catpvs(sv, "{unicode}");
8184 else if (flags & ANYOF_UNICODE_ALL)
8185 sv_catpvs(sv, "{unicode_all}");
8189 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
8193 U8 s[UTF8_MAXBYTES_CASE+1];
8195 for (i = 0; i <= 256; i++) { /* just the first 256 */
8196 uvchr_to_utf8(s, i);
8198 if (i < 256 && swash_fetch(sw, s, TRUE)) {
8199 if (rangestart == -1)
8201 } else if (rangestart != -1) {
8202 if (i <= rangestart + 3)
8203 for (; rangestart < i; rangestart++) {
8204 const U8 * const e = uvchr_to_utf8(s,rangestart);
8206 for(p = s; p < e; p++)
8210 const U8 *e = uvchr_to_utf8(s,rangestart);
8212 for (p = s; p < e; p++)
8215 e = uvchr_to_utf8(s, i-1);
8216 for (p = s; p < e; p++)
8223 sv_catpvs(sv, "..."); /* et cetera */
8227 char *s = savesvpv(lv);
8228 char * const origs = s;
8230 while (*s && *s != '\n')
8234 const char * const t = ++s;
8252 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
8254 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
8255 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
8257 PERL_UNUSED_CONTEXT;
8258 PERL_UNUSED_ARG(sv);
8260 PERL_UNUSED_ARG(prog);
8261 #endif /* DEBUGGING */
8265 Perl_re_intuit_string(pTHX_ regexp *prog)
8266 { /* Assume that RE_INTUIT is set */
8268 GET_RE_DEBUG_FLAGS_DECL;
8269 PERL_UNUSED_CONTEXT;
8273 const char * const s = SvPV_nolen_const(prog->check_substr
8274 ? prog->check_substr : prog->check_utf8);
8276 if (!PL_colorset) reginitcolors();
8277 PerlIO_printf(Perl_debug_log,
8278 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
8280 prog->check_substr ? "" : "utf8 ",
8281 PL_colors[5],PL_colors[0],
8284 (strlen(s) > 60 ? "..." : ""));
8287 return prog->check_substr ? prog->check_substr : prog->check_utf8;
8291 pregfree - free a regexp
8293 See regdupe below if you change anything here.
8297 Perl_pregfree(pTHX_ struct regexp *r)
8301 GET_RE_DEBUG_FLAGS_DECL;
8303 if (!r || (--r->refcnt > 0))
8309 SV *dsv= sv_newmortal();
8310 RE_PV_QUOTED_DECL(s, (r->reganch & ROPT_UTF8),
8311 dsv, r->precomp, r->prelen, 60);
8312 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
8313 PL_colors[4],PL_colors[5],s);
8317 /* gcov results gave these as non-null 100% of the time, so there's no
8318 optimisation in checking them before calling Safefree */
8319 Safefree(r->precomp);
8320 Safefree(r->offsets); /* 20010421 MJD */
8321 RX_MATCH_COPY_FREE(r);
8322 #ifdef PERL_OLD_COPY_ON_WRITE
8324 SvREFCNT_dec(r->saved_copy);
8327 if (r->anchored_substr)
8328 SvREFCNT_dec(r->anchored_substr);
8329 if (r->anchored_utf8)
8330 SvREFCNT_dec(r->anchored_utf8);
8331 if (r->float_substr)
8332 SvREFCNT_dec(r->float_substr);
8334 SvREFCNT_dec(r->float_utf8);
8335 Safefree(r->substrs);
8338 SvREFCNT_dec(r->paren_names);
8340 int n = r->data->count;
8341 PAD* new_comppad = NULL;
8346 /* If you add a ->what type here, update the comment in regcomp.h */
8347 switch (r->data->what[n]) {
8350 SvREFCNT_dec((SV*)r->data->data[n]);
8353 Safefree(r->data->data[n]);
8356 new_comppad = (AV*)r->data->data[n];
8359 if (new_comppad == NULL)
8360 Perl_croak(aTHX_ "panic: pregfree comppad");
8361 PAD_SAVE_LOCAL(old_comppad,
8362 /* Watch out for global destruction's random ordering. */
8363 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
8366 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
8369 op_free((OP_4tree*)r->data->data[n]);
8371 PAD_RESTORE_LOCAL(old_comppad);
8372 SvREFCNT_dec((SV*)new_comppad);
8378 { /* Aho Corasick add-on structure for a trie node.
8379 Used in stclass optimization only */
8381 reg_ac_data *aho=(reg_ac_data*)r->data->data[n];
8383 refcount = --aho->refcount;
8386 Safefree(aho->states);
8387 Safefree(aho->fail);
8388 aho->trie=NULL; /* not necessary to free this as it is
8389 handled by the 't' case */
8390 Safefree(r->data->data[n]); /* do this last!!!! */
8391 Safefree(r->regstclass);
8397 /* trie structure. */
8399 reg_trie_data *trie=(reg_trie_data*)r->data->data[n];
8401 refcount = --trie->refcount;
8404 Safefree(trie->charmap);
8405 if (trie->widecharmap)
8406 SvREFCNT_dec((SV*)trie->widecharmap);
8407 Safefree(trie->states);
8408 Safefree(trie->trans);
8410 Safefree(trie->bitmap);
8412 Safefree(trie->wordlen);
8414 Safefree(trie->jump);
8416 Safefree(trie->nextword);
8420 SvREFCNT_dec((SV*)trie->words);
8421 if (trie->revcharmap)
8422 SvREFCNT_dec((SV*)trie->revcharmap);
8425 Safefree(r->data->data[n]); /* do this last!!!! */
8430 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
8433 Safefree(r->data->what);
8436 Safefree(r->startp);
8441 #define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t))
8442 #define av_dup_inc(s,t) (AV*)SvREFCNT_inc(sv_dup((SV*)s,t))
8443 #define hv_dup_inc(s,t) (HV*)SvREFCNT_inc(sv_dup((SV*)s,t))
8444 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
8447 regdupe - duplicate a regexp.
8449 This routine is called by sv.c's re_dup and is expected to clone a
8450 given regexp structure. It is a no-op when not under USE_ITHREADS.
8451 (Originally this *was* re_dup() for change history see sv.c)
8453 See pregfree() above if you change anything here.
8455 #if defined(USE_ITHREADS)
8457 Perl_regdupe(pTHX_ const regexp *r, CLONE_PARAMS *param)
8462 struct reg_substr_datum *s;
8465 return (REGEXP *)NULL;
8467 if ((ret = (REGEXP *)ptr_table_fetch(PL_ptr_table, r)))
8470 len = r->offsets[0];
8471 npar = r->nparens+1;
8473 Newxc(ret, sizeof(regexp) + (len+1)*sizeof(regnode), char, regexp);
8474 Copy(r->program, ret->program, len+1, regnode);
8476 Newx(ret->startp, npar, I32);
8477 Copy(r->startp, ret->startp, npar, I32);
8478 Newx(ret->endp, npar, I32);
8479 Copy(r->startp, ret->startp, npar, I32);
8481 Newx(ret->substrs, 1, struct reg_substr_data);
8482 for (s = ret->substrs->data, i = 0; i < 3; i++, s++) {
8483 s->min_offset = r->substrs->data[i].min_offset;
8484 s->max_offset = r->substrs->data[i].max_offset;
8485 s->end_shift = r->substrs->data[i].end_shift;
8486 s->substr = sv_dup_inc(r->substrs->data[i].substr, param);
8487 s->utf8_substr = sv_dup_inc(r->substrs->data[i].utf8_substr, param);
8490 ret->regstclass = NULL;
8493 const int count = r->data->count;
8496 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
8497 char, struct reg_data);
8498 Newx(d->what, count, U8);
8501 for (i = 0; i < count; i++) {
8502 d->what[i] = r->data->what[i];
8503 switch (d->what[i]) {
8504 /* legal options are one of: sSfpont
8505 see also regcomp.h and pregfree() */
8508 d->data[i] = sv_dup_inc((SV *)r->data->data[i], param);
8511 d->data[i] = av_dup_inc((AV *)r->data->data[i], param);
8514 /* This is cheating. */
8515 Newx(d->data[i], 1, struct regnode_charclass_class);
8516 StructCopy(r->data->data[i], d->data[i],
8517 struct regnode_charclass_class);
8518 ret->regstclass = (regnode*)d->data[i];
8521 /* Compiled op trees are readonly, and can thus be
8522 shared without duplication. */
8524 d->data[i] = (void*)OpREFCNT_inc((OP*)r->data->data[i]);
8528 d->data[i] = r->data->data[i];
8531 d->data[i] = r->data->data[i];
8533 ((reg_trie_data*)d->data[i])->refcount++;
8537 d->data[i] = r->data->data[i];
8539 ((reg_ac_data*)d->data[i])->refcount++;
8541 /* Trie stclasses are readonly and can thus be shared
8542 * without duplication. We free the stclass in pregfree
8543 * when the corresponding reg_ac_data struct is freed.
8545 ret->regstclass= r->regstclass;
8548 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", r->data->what[i]);
8557 Newx(ret->offsets, 2*len+1, U32);
8558 Copy(r->offsets, ret->offsets, 2*len+1, U32);
8560 ret->precomp = SAVEPVN(r->precomp, r->prelen);
8561 ret->refcnt = r->refcnt;
8562 ret->minlen = r->minlen;
8563 ret->prelen = r->prelen;
8564 ret->nparens = r->nparens;
8565 ret->lastparen = r->lastparen;
8566 ret->lastcloseparen = r->lastcloseparen;
8567 ret->reganch = r->reganch;
8569 ret->sublen = r->sublen;
8571 ret->engine = r->engine;
8573 ret->paren_names = hv_dup_inc(r->paren_names, param);
8575 if (RX_MATCH_COPIED(ret))
8576 ret->subbeg = SAVEPVN(r->subbeg, r->sublen);
8579 #ifdef PERL_OLD_COPY_ON_WRITE
8580 ret->saved_copy = NULL;
8583 ptr_table_store(PL_ptr_table, r, ret);
8588 #ifndef PERL_IN_XSUB_RE
8590 - regnext - dig the "next" pointer out of a node
8593 Perl_regnext(pTHX_ register regnode *p)
8596 register I32 offset;
8598 if (p == &PL_regdummy)
8601 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
8610 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
8613 STRLEN l1 = strlen(pat1);
8614 STRLEN l2 = strlen(pat2);
8617 const char *message;
8623 Copy(pat1, buf, l1 , char);
8624 Copy(pat2, buf + l1, l2 , char);
8625 buf[l1 + l2] = '\n';
8626 buf[l1 + l2 + 1] = '\0';
8628 /* ANSI variant takes additional second argument */
8629 va_start(args, pat2);
8633 msv = vmess(buf, &args);
8635 message = SvPV_const(msv,l1);
8638 Copy(message, buf, l1 , char);
8639 buf[l1-1] = '\0'; /* Overwrite \n */
8640 Perl_croak(aTHX_ "%s", buf);
8643 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
8645 #ifndef PERL_IN_XSUB_RE
8647 Perl_save_re_context(pTHX)
8651 struct re_save_state *state;
8653 SAVEVPTR(PL_curcop);
8654 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
8656 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
8657 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
8658 SSPUSHINT(SAVEt_RE_STATE);
8660 Copy(&PL_reg_state, state, 1, struct re_save_state);
8662 PL_reg_start_tmp = 0;
8663 PL_reg_start_tmpl = 0;
8664 PL_reg_oldsaved = NULL;
8665 PL_reg_oldsavedlen = 0;
8667 PL_reg_leftiter = 0;
8668 PL_reg_poscache = NULL;
8669 PL_reg_poscache_size = 0;
8670 #ifdef PERL_OLD_COPY_ON_WRITE
8674 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
8676 const REGEXP * const rx = PM_GETRE(PL_curpm);
8679 for (i = 1; i <= rx->nparens; i++) {
8680 char digits[TYPE_CHARS(long)];
8681 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
8682 GV *const *const gvp
8683 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
8686 GV * const gv = *gvp;
8687 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
8697 clear_re(pTHX_ void *r)
8700 ReREFCNT_dec((regexp *)r);
8706 S_put_byte(pTHX_ SV *sv, int c)
8708 if (isCNTRL(c) || c == 255 || !isPRINT(c))
8709 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
8710 else if (c == '-' || c == ']' || c == '\\' || c == '^')
8711 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
8713 Perl_sv_catpvf(aTHX_ sv, "%c", c);
8717 #define CLEAR_OPTSTART \
8718 if (optstart) STMT_START { \
8719 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%d nodes)\n", node - optstart)); \
8723 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
8725 STATIC const regnode *
8726 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
8727 const regnode *last, const regnode *plast,
8728 SV* sv, I32 indent, U32 depth)
8731 register U8 op = PSEUDO; /* Arbitrary non-END op. */
8732 register const regnode *next;
8733 const regnode *optstart= NULL;
8734 GET_RE_DEBUG_FLAGS_DECL;
8736 #ifdef DEBUG_DUMPUNTIL
8737 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
8738 last ? last-start : 0,plast ? plast-start : 0);
8741 if (plast && plast < last)
8744 while (PL_regkind[op] != END && (!last || node < last)) {
8745 /* While that wasn't END last time... */
8751 next = regnext((regnode *)node);
8754 if (OP(node) == OPTIMIZED) {
8755 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
8762 regprop(r, sv, node);
8763 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
8764 (int)(2*indent + 1), "", SvPVX_const(sv));
8766 if (OP(node) != OPTIMIZED) {
8767 if (next == NULL) /* Next ptr. */
8768 PerlIO_printf(Perl_debug_log, "(0)");
8769 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
8770 PerlIO_printf(Perl_debug_log, "(FAIL)");
8772 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
8774 /*if (PL_regkind[(U8)op] != TRIE)*/
8775 (void)PerlIO_putc(Perl_debug_log, '\n');
8779 if (PL_regkind[(U8)op] == BRANCHJ) {
8782 register const regnode *nnode = (OP(next) == LONGJMP
8783 ? regnext((regnode *)next)
8785 if (last && nnode > last)
8787 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
8790 else if (PL_regkind[(U8)op] == BRANCH) {
8792 DUMPUNTIL(NEXTOPER(node), next);
8794 else if ( PL_regkind[(U8)op] == TRIE ) {
8795 const regnode *this_trie = node;
8796 const char op = OP(node);
8797 const I32 n = ARG(node);
8798 const reg_ac_data * const ac = op>=AHOCORASICK ?
8799 (reg_ac_data *)r->data->data[n] :
8801 const reg_trie_data * const trie = op<AHOCORASICK ?
8802 (reg_trie_data*)r->data->data[n] :
8804 const regnode *nextbranch= NULL;
8806 sv_setpvn(sv, "", 0);
8807 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
8808 SV ** const elem_ptr = av_fetch(trie->words,word_idx,0);
8810 PerlIO_printf(Perl_debug_log, "%*s%s ",
8811 (int)(2*(indent+3)), "",
8812 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
8813 PL_colors[0], PL_colors[1],
8814 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
8815 PERL_PV_PRETTY_ELIPSES |
8821 U16 dist= trie->jump[word_idx+1];
8822 PerlIO_printf(Perl_debug_log, "(%u)\n",
8823 (dist ? this_trie + dist : next) - start);
8826 nextbranch= this_trie + trie->jump[0];
8827 DUMPUNTIL(this_trie + dist, nextbranch);
8829 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
8830 nextbranch= regnext((regnode *)nextbranch);
8832 PerlIO_printf(Perl_debug_log, "\n");
8835 if (last && next > last)
8840 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
8841 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
8842 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
8844 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
8846 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
8848 else if ( op == PLUS || op == STAR) {
8849 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
8851 else if (op == ANYOF) {
8852 /* arglen 1 + class block */
8853 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
8854 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
8855 node = NEXTOPER(node);
8857 else if (PL_regkind[(U8)op] == EXACT) {
8858 /* Literal string, where present. */
8859 node += NODE_SZ_STR(node) - 1;
8860 node = NEXTOPER(node);
8863 node = NEXTOPER(node);
8864 node += regarglen[(U8)op];
8866 if (op == CURLYX || op == OPEN)
8868 else if (op == WHILEM)
8872 #ifdef DEBUG_DUMPUNTIL
8873 PerlIO_printf(Perl_debug_log, "--- %d\n",indent);
8878 #endif /* DEBUGGING */
8882 * c-indentation-style: bsd
8884 * indent-tabs-mode: t
8887 * ex: set ts=8 sts=4 sw=4 noet: