5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
8 /* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
37 * pregcomp and pregexec -- regsub and regerror are not used in perl
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
57 **** Alterations to Henry's code are...
59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
71 #define PERL_IN_REGCOMP_C
74 #ifndef PERL_IN_XSUB_RE
79 #ifdef PERL_IN_XSUB_RE
90 # if defined(BUGGY_MSC6)
91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
92 # pragma optimize("a",off)
93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
94 # pragma optimize("w",on )
95 # endif /* BUGGY_MSC6 */
102 typedef struct RExC_state_t {
103 U32 flags; /* are we folding, multilining? */
104 char *precomp; /* uncompiled string. */
106 char *start; /* Start of input for compile */
107 char *end; /* End of input for compile */
108 char *parse; /* Input-scan pointer. */
109 I32 whilem_seen; /* number of WHILEM in this expr */
110 regnode *emit_start; /* Start of emitted-code area */
111 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
112 I32 naughty; /* How bad is this pattern? */
113 I32 sawback; /* Did we see \1, ...? */
115 I32 size; /* Code size. */
116 I32 npar; /* () count. */
117 I32 nestroot; /* root parens we are in - used by accept */
121 regnode **open_parens; /* pointers to open parens */
122 regnode **close_parens; /* pointers to close parens */
123 regnode *opend; /* END node in program */
125 HV *charnames; /* cache of named sequences */
126 HV *paren_names; /* Paren names */
127 regnode **recurse; /* Recurse regops */
128 I32 recurse_count; /* Number of recurse regops */
130 char *starttry; /* -Dr: where regtry was called. */
131 #define RExC_starttry (pRExC_state->starttry)
134 const char *lastparse;
136 #define RExC_lastparse (pRExC_state->lastparse)
137 #define RExC_lastnum (pRExC_state->lastnum)
141 #define RExC_flags (pRExC_state->flags)
142 #define RExC_precomp (pRExC_state->precomp)
143 #define RExC_rx (pRExC_state->rx)
144 #define RExC_start (pRExC_state->start)
145 #define RExC_end (pRExC_state->end)
146 #define RExC_parse (pRExC_state->parse)
147 #define RExC_whilem_seen (pRExC_state->whilem_seen)
148 #define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
149 #define RExC_emit (pRExC_state->emit)
150 #define RExC_emit_start (pRExC_state->emit_start)
151 #define RExC_naughty (pRExC_state->naughty)
152 #define RExC_sawback (pRExC_state->sawback)
153 #define RExC_seen (pRExC_state->seen)
154 #define RExC_size (pRExC_state->size)
155 #define RExC_npar (pRExC_state->npar)
156 #define RExC_nestroot (pRExC_state->nestroot)
157 #define RExC_extralen (pRExC_state->extralen)
158 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
159 #define RExC_seen_evals (pRExC_state->seen_evals)
160 #define RExC_utf8 (pRExC_state->utf8)
161 #define RExC_charnames (pRExC_state->charnames)
162 #define RExC_open_parens (pRExC_state->open_parens)
163 #define RExC_close_parens (pRExC_state->close_parens)
164 #define RExC_opend (pRExC_state->opend)
165 #define RExC_paren_names (pRExC_state->paren_names)
166 #define RExC_recurse (pRExC_state->recurse)
167 #define RExC_recurse_count (pRExC_state->recurse_count)
169 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
170 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
171 ((*s) == '{' && regcurly(s)))
174 #undef SPSTART /* dratted cpp namespace... */
177 * Flags to be passed up and down.
179 #define WORST 0 /* Worst case. */
180 #define HASWIDTH 0x1 /* Known to match non-null strings. */
181 #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
182 #define SPSTART 0x4 /* Starts with * or +. */
183 #define TRYAGAIN 0x8 /* Weeded out a declaration. */
185 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
187 /* whether trie related optimizations are enabled */
188 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
189 #define TRIE_STUDY_OPT
190 #define FULL_TRIE_STUDY
196 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
197 #define PBITVAL(paren) (1 << ((paren) & 7))
198 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
199 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
200 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
203 /* About scan_data_t.
205 During optimisation we recurse through the regexp program performing
206 various inplace (keyhole style) optimisations. In addition study_chunk
207 and scan_commit populate this data structure with information about
208 what strings MUST appear in the pattern. We look for the longest
209 string that must appear for at a fixed location, and we look for the
210 longest string that may appear at a floating location. So for instance
215 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
216 strings (because they follow a .* construct). study_chunk will identify
217 both FOO and BAR as being the longest fixed and floating strings respectively.
219 The strings can be composites, for instance
223 will result in a composite fixed substring 'foo'.
225 For each string some basic information is maintained:
227 - offset or min_offset
228 This is the position the string must appear at, or not before.
229 It also implicitly (when combined with minlenp) tells us how many
230 character must match before the string we are searching.
231 Likewise when combined with minlenp and the length of the string
232 tells us how many characters must appear after the string we have
236 Only used for floating strings. This is the rightmost point that
237 the string can appear at. Ifset to I32 max it indicates that the
238 string can occur infinitely far to the right.
241 A pointer to the minimum length of the pattern that the string
242 was found inside. This is important as in the case of positive
243 lookahead or positive lookbehind we can have multiple patterns
248 The minimum length of the pattern overall is 3, the minimum length
249 of the lookahead part is 3, but the minimum length of the part that
250 will actually match is 1. So 'FOO's minimum length is 3, but the
251 minimum length for the F is 1. This is important as the minimum length
252 is used to determine offsets in front of and behind the string being
253 looked for. Since strings can be composites this is the length of the
254 pattern at the time it was commited with a scan_commit. Note that
255 the length is calculated by study_chunk, so that the minimum lengths
256 are not known until the full pattern has been compiled, thus the
257 pointer to the value.
261 In the case of lookbehind the string being searched for can be
262 offset past the start point of the final matching string.
263 If this value was just blithely removed from the min_offset it would
264 invalidate some of the calculations for how many chars must match
265 before or after (as they are derived from min_offset and minlen and
266 the length of the string being searched for).
267 When the final pattern is compiled and the data is moved from the
268 scan_data_t structure into the regexp structure the information
269 about lookbehind is factored in, with the information that would
270 have been lost precalculated in the end_shift field for the
273 The fields pos_min and pos_delta are used to store the minimum offset
274 and the delta to the maximum offset at the current point in the pattern.
278 typedef struct scan_data_t {
279 /*I32 len_min; unused */
280 /*I32 len_delta; unused */
284 I32 last_end; /* min value, <0 unless valid. */
287 SV **longest; /* Either &l_fixed, or &l_float. */
288 SV *longest_fixed; /* longest fixed string found in pattern */
289 I32 offset_fixed; /* offset where it starts */
290 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
291 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
292 SV *longest_float; /* longest floating string found in pattern */
293 I32 offset_float_min; /* earliest point in string it can appear */
294 I32 offset_float_max; /* latest point in string it can appear */
295 I32 *minlen_float; /* pointer to the minlen relevent to the string */
296 I32 lookbehind_float; /* is the position of the string modified by LB */
300 struct regnode_charclass_class *start_class;
304 * Forward declarations for pregcomp()'s friends.
307 static const scan_data_t zero_scan_data =
308 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
310 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
311 #define SF_BEFORE_SEOL 0x0001
312 #define SF_BEFORE_MEOL 0x0002
313 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
314 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
317 # define SF_FIX_SHIFT_EOL (0+2)
318 # define SF_FL_SHIFT_EOL (0+4)
320 # define SF_FIX_SHIFT_EOL (+2)
321 # define SF_FL_SHIFT_EOL (+4)
324 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
325 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
327 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
328 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
329 #define SF_IS_INF 0x0040
330 #define SF_HAS_PAR 0x0080
331 #define SF_IN_PAR 0x0100
332 #define SF_HAS_EVAL 0x0200
333 #define SCF_DO_SUBSTR 0x0400
334 #define SCF_DO_STCLASS_AND 0x0800
335 #define SCF_DO_STCLASS_OR 0x1000
336 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
337 #define SCF_WHILEM_VISITED_POS 0x2000
339 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
340 #define SCF_SEEN_ACCEPT 0x8000
342 #define UTF (RExC_utf8 != 0)
343 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
344 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
346 #define OOB_UNICODE 12345678
347 #define OOB_NAMEDCLASS -1
349 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
350 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
353 /* length of regex to show in messages that don't mark a position within */
354 #define RegexLengthToShowInErrorMessages 127
357 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
358 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
359 * op/pragma/warn/regcomp.
361 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
362 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
364 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
367 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
368 * arg. Show regex, up to a maximum length. If it's too long, chop and add
371 #define FAIL(msg) STMT_START { \
372 const char *ellipses = ""; \
373 IV len = RExC_end - RExC_precomp; \
376 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
377 if (len > RegexLengthToShowInErrorMessages) { \
378 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
379 len = RegexLengthToShowInErrorMessages - 10; \
382 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
383 msg, (int)len, RExC_precomp, ellipses); \
387 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
389 #define Simple_vFAIL(m) STMT_START { \
390 const IV offset = RExC_parse - RExC_precomp; \
391 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
392 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
396 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
398 #define vFAIL(m) STMT_START { \
400 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
405 * Like Simple_vFAIL(), but accepts two arguments.
407 #define Simple_vFAIL2(m,a1) STMT_START { \
408 const IV offset = RExC_parse - RExC_precomp; \
409 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
410 (int)offset, RExC_precomp, RExC_precomp + offset); \
414 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
416 #define vFAIL2(m,a1) STMT_START { \
418 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
419 Simple_vFAIL2(m, a1); \
424 * Like Simple_vFAIL(), but accepts three arguments.
426 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
427 const IV offset = RExC_parse - RExC_precomp; \
428 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
429 (int)offset, RExC_precomp, RExC_precomp + offset); \
433 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
435 #define vFAIL3(m,a1,a2) STMT_START { \
437 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
438 Simple_vFAIL3(m, a1, a2); \
442 * Like Simple_vFAIL(), but accepts four arguments.
444 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
445 const IV offset = RExC_parse - RExC_precomp; \
446 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
447 (int)offset, RExC_precomp, RExC_precomp + offset); \
450 #define vWARN(loc,m) STMT_START { \
451 const IV offset = loc - RExC_precomp; \
452 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
453 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
456 #define vWARNdep(loc,m) STMT_START { \
457 const IV offset = loc - RExC_precomp; \
458 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
459 "%s" REPORT_LOCATION, \
460 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
464 #define vWARN2(loc, m, a1) STMT_START { \
465 const IV offset = loc - RExC_precomp; \
466 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
467 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
470 #define vWARN3(loc, m, a1, a2) STMT_START { \
471 const IV offset = loc - RExC_precomp; \
472 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
473 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
476 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
477 const IV offset = loc - RExC_precomp; \
478 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
479 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
482 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
483 const IV offset = loc - RExC_precomp; \
484 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
485 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
489 /* Allow for side effects in s */
490 #define REGC(c,s) STMT_START { \
491 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
494 /* Macros for recording node offsets. 20001227 mjd@plover.com
495 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
496 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
497 * Element 0 holds the number n.
498 * Position is 1 indexed.
501 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
503 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
504 __LINE__, (int)(node), (int)(byte))); \
506 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
508 RExC_offsets[2*(node)-1] = (byte); \
513 #define Set_Node_Offset(node,byte) \
514 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
515 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
517 #define Set_Node_Length_To_R(node,len) STMT_START { \
519 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
520 __LINE__, (int)(node), (int)(len))); \
522 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
524 RExC_offsets[2*(node)] = (len); \
529 #define Set_Node_Length(node,len) \
530 Set_Node_Length_To_R((node)-RExC_emit_start, len)
531 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
532 #define Set_Node_Cur_Length(node) \
533 Set_Node_Length(node, RExC_parse - parse_start)
535 /* Get offsets and lengths */
536 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
537 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
539 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
540 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
541 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
545 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
546 #define EXPERIMENTAL_INPLACESCAN
549 #define DEBUG_STUDYDATA(data,depth) \
550 DEBUG_OPTIMISE_MORE_r(if(data){ \
551 PerlIO_printf(Perl_debug_log, \
552 "%*s"/* Len:%"IVdf"/%"IVdf" */" Pos:%"IVdf"/%"IVdf \
553 " Flags: %"IVdf" Whilem_c: %"IVdf" Lcp: %"IVdf" ", \
554 (int)(depth)*2, "", \
555 (IV)((data)->pos_min), \
556 (IV)((data)->pos_delta), \
557 (IV)((data)->flags), \
558 (IV)((data)->whilem_c), \
559 (IV)((data)->last_closep ? *((data)->last_closep) : -1) \
561 if ((data)->last_found) \
562 PerlIO_printf(Perl_debug_log, \
563 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
564 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
565 SvPVX_const((data)->last_found), \
566 (IV)((data)->last_end), \
567 (IV)((data)->last_start_min), \
568 (IV)((data)->last_start_max), \
569 ((data)->longest && \
570 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
571 SvPVX_const((data)->longest_fixed), \
572 (IV)((data)->offset_fixed), \
573 ((data)->longest && \
574 (data)->longest==&((data)->longest_float)) ? "*" : "", \
575 SvPVX_const((data)->longest_float), \
576 (IV)((data)->offset_float_min), \
577 (IV)((data)->offset_float_max) \
579 PerlIO_printf(Perl_debug_log,"\n"); \
582 static void clear_re(pTHX_ void *r);
584 /* Mark that we cannot extend a found fixed substring at this point.
585 Update the longest found anchored substring and the longest found
586 floating substrings if needed. */
589 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp)
591 const STRLEN l = CHR_SVLEN(data->last_found);
592 const STRLEN old_l = CHR_SVLEN(*data->longest);
593 GET_RE_DEBUG_FLAGS_DECL;
595 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
596 SvSetMagicSV(*data->longest, data->last_found);
597 if (*data->longest == data->longest_fixed) {
598 data->offset_fixed = l ? data->last_start_min : data->pos_min;
599 if (data->flags & SF_BEFORE_EOL)
601 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
603 data->flags &= ~SF_FIX_BEFORE_EOL;
604 data->minlen_fixed=minlenp;
605 data->lookbehind_fixed=0;
608 data->offset_float_min = l ? data->last_start_min : data->pos_min;
609 data->offset_float_max = (l
610 ? data->last_start_max
611 : data->pos_min + data->pos_delta);
612 if ((U32)data->offset_float_max > (U32)I32_MAX)
613 data->offset_float_max = I32_MAX;
614 if (data->flags & SF_BEFORE_EOL)
616 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
618 data->flags &= ~SF_FL_BEFORE_EOL;
619 data->minlen_float=minlenp;
620 data->lookbehind_float=0;
623 SvCUR_set(data->last_found, 0);
625 SV * const sv = data->last_found;
626 if (SvUTF8(sv) && SvMAGICAL(sv)) {
627 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
633 data->flags &= ~SF_BEFORE_EOL;
634 DEBUG_STUDYDATA(data,0);
637 /* Can match anything (initialization) */
639 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
641 ANYOF_CLASS_ZERO(cl);
642 ANYOF_BITMAP_SETALL(cl);
643 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
645 cl->flags |= ANYOF_LOCALE;
648 /* Can match anything (initialization) */
650 S_cl_is_anything(const struct regnode_charclass_class *cl)
654 for (value = 0; value <= ANYOF_MAX; value += 2)
655 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
657 if (!(cl->flags & ANYOF_UNICODE_ALL))
659 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
664 /* Can match anything (initialization) */
666 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
668 Zero(cl, 1, struct regnode_charclass_class);
670 cl_anything(pRExC_state, cl);
674 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
676 Zero(cl, 1, struct regnode_charclass_class);
678 cl_anything(pRExC_state, cl);
680 cl->flags |= ANYOF_LOCALE;
683 /* 'And' a given class with another one. Can create false positives */
684 /* We assume that cl is not inverted */
686 S_cl_and(struct regnode_charclass_class *cl,
687 const struct regnode_charclass_class *and_with)
690 assert(and_with->type == ANYOF);
691 if (!(and_with->flags & ANYOF_CLASS)
692 && !(cl->flags & ANYOF_CLASS)
693 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
694 && !(and_with->flags & ANYOF_FOLD)
695 && !(cl->flags & ANYOF_FOLD)) {
698 if (and_with->flags & ANYOF_INVERT)
699 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
700 cl->bitmap[i] &= ~and_with->bitmap[i];
702 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
703 cl->bitmap[i] &= and_with->bitmap[i];
704 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
705 if (!(and_with->flags & ANYOF_EOS))
706 cl->flags &= ~ANYOF_EOS;
708 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
709 !(and_with->flags & ANYOF_INVERT)) {
710 cl->flags &= ~ANYOF_UNICODE_ALL;
711 cl->flags |= ANYOF_UNICODE;
712 ARG_SET(cl, ARG(and_with));
714 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
715 !(and_with->flags & ANYOF_INVERT))
716 cl->flags &= ~ANYOF_UNICODE_ALL;
717 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
718 !(and_with->flags & ANYOF_INVERT))
719 cl->flags &= ~ANYOF_UNICODE;
722 /* 'OR' a given class with another one. Can create false positives */
723 /* We assume that cl is not inverted */
725 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
727 if (or_with->flags & ANYOF_INVERT) {
729 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
730 * <= (B1 | !B2) | (CL1 | !CL2)
731 * which is wasteful if CL2 is small, but we ignore CL2:
732 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
733 * XXXX Can we handle case-fold? Unclear:
734 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
735 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
737 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
738 && !(or_with->flags & ANYOF_FOLD)
739 && !(cl->flags & ANYOF_FOLD) ) {
742 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
743 cl->bitmap[i] |= ~or_with->bitmap[i];
744 } /* XXXX: logic is complicated otherwise */
746 cl_anything(pRExC_state, cl);
749 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
750 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
751 && (!(or_with->flags & ANYOF_FOLD)
752 || (cl->flags & ANYOF_FOLD)) ) {
755 /* OR char bitmap and class bitmap separately */
756 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
757 cl->bitmap[i] |= or_with->bitmap[i];
758 if (or_with->flags & ANYOF_CLASS) {
759 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
760 cl->classflags[i] |= or_with->classflags[i];
761 cl->flags |= ANYOF_CLASS;
764 else { /* XXXX: logic is complicated, leave it along for a moment. */
765 cl_anything(pRExC_state, cl);
768 if (or_with->flags & ANYOF_EOS)
769 cl->flags |= ANYOF_EOS;
771 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
772 ARG(cl) != ARG(or_with)) {
773 cl->flags |= ANYOF_UNICODE_ALL;
774 cl->flags &= ~ANYOF_UNICODE;
776 if (or_with->flags & ANYOF_UNICODE_ALL) {
777 cl->flags |= ANYOF_UNICODE_ALL;
778 cl->flags &= ~ANYOF_UNICODE;
782 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
783 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
784 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
785 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
791 dump_trie_interim_list(trie,next_alloc)
792 dump_trie_interim_table(trie,next_alloc)
794 These routines dump out a trie in a somewhat readable format.
795 The _interim_ variants are used for debugging the interim
796 tables that are used to generate the final compressed
797 representation which is what dump_trie expects.
799 Part of the reason for their existance is to provide a form
800 of documentation as to how the different representations function.
806 Dumps the final compressed table form of the trie to Perl_debug_log.
807 Used for debugging make_trie().
811 S_dump_trie(pTHX_ const struct _reg_trie_data *trie,U32 depth)
814 SV *sv=sv_newmortal();
815 int colwidth= trie->widecharmap ? 6 : 4;
816 GET_RE_DEBUG_FLAGS_DECL;
819 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
820 (int)depth * 2 + 2,"",
821 "Match","Base","Ofs" );
823 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
824 SV ** const tmp = av_fetch( trie->revcharmap, state, 0);
826 PerlIO_printf( Perl_debug_log, "%*s",
828 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
829 PL_colors[0], PL_colors[1],
830 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
831 PERL_PV_ESCAPE_FIRSTCHAR
836 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
837 (int)depth * 2 + 2,"");
839 for( state = 0 ; state < trie->uniquecharcount ; state++ )
840 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
841 PerlIO_printf( Perl_debug_log, "\n");
843 for( state = 1 ; state < trie->statecount ; state++ ) {
844 const U32 base = trie->states[ state ].trans.base;
846 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
848 if ( trie->states[ state ].wordnum ) {
849 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
851 PerlIO_printf( Perl_debug_log, "%6s", "" );
854 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
859 while( ( base + ofs < trie->uniquecharcount ) ||
860 ( base + ofs - trie->uniquecharcount < trie->lasttrans
861 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
864 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
866 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
867 if ( ( base + ofs >= trie->uniquecharcount ) &&
868 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
869 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
871 PerlIO_printf( Perl_debug_log, "%*"UVXf,
873 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
875 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
879 PerlIO_printf( Perl_debug_log, "]");
882 PerlIO_printf( Perl_debug_log, "\n" );
886 dump_trie_interim_list(trie,next_alloc)
887 Dumps a fully constructed but uncompressed trie in list form.
888 List tries normally only are used for construction when the number of
889 possible chars (trie->uniquecharcount) is very high.
890 Used for debugging make_trie().
893 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc,U32 depth)
896 SV *sv=sv_newmortal();
897 int colwidth= trie->widecharmap ? 6 : 4;
898 GET_RE_DEBUG_FLAGS_DECL;
899 /* print out the table precompression. */
900 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
901 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
902 "------:-----+-----------------\n" );
904 for( state=1 ; state < next_alloc ; state ++ ) {
907 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
908 (int)depth * 2 + 2,"", (UV)state );
909 if ( ! trie->states[ state ].wordnum ) {
910 PerlIO_printf( Perl_debug_log, "%5s| ","");
912 PerlIO_printf( Perl_debug_log, "W%4x| ",
913 trie->states[ state ].wordnum
916 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
917 SV ** const tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
919 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
921 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
922 PL_colors[0], PL_colors[1],
923 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
924 PERL_PV_ESCAPE_FIRSTCHAR
926 TRIE_LIST_ITEM(state,charid).forid,
927 (UV)TRIE_LIST_ITEM(state,charid).newstate
930 PerlIO_printf(Perl_debug_log, "\n%*s| ",
931 (int)((depth * 2) + 14), "");
934 PerlIO_printf( Perl_debug_log, "\n");
939 dump_trie_interim_table(trie,next_alloc)
940 Dumps a fully constructed but uncompressed trie in table form.
941 This is the normal DFA style state transition table, with a few
942 twists to facilitate compression later.
943 Used for debugging make_trie().
946 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc, U32 depth)
950 SV *sv=sv_newmortal();
951 int colwidth= trie->widecharmap ? 6 : 4;
952 GET_RE_DEBUG_FLAGS_DECL;
955 print out the table precompression so that we can do a visual check
956 that they are identical.
959 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
961 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
962 SV ** const tmp = av_fetch( trie->revcharmap, charid, 0);
964 PerlIO_printf( Perl_debug_log, "%*s",
966 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
967 PL_colors[0], PL_colors[1],
968 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
969 PERL_PV_ESCAPE_FIRSTCHAR
975 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
977 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
978 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
981 PerlIO_printf( Perl_debug_log, "\n" );
983 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
985 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
986 (int)depth * 2 + 2,"",
987 (UV)TRIE_NODENUM( state ) );
989 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
990 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
992 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
994 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
996 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
997 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
999 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1000 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1007 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1008 startbranch: the first branch in the whole branch sequence
1009 first : start branch of sequence of branch-exact nodes.
1010 May be the same as startbranch
1011 last : Thing following the last branch.
1012 May be the same as tail.
1013 tail : item following the branch sequence
1014 count : words in the sequence
1015 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1016 depth : indent depth
1018 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1020 A trie is an N'ary tree where the branches are determined by digital
1021 decomposition of the key. IE, at the root node you look up the 1st character and
1022 follow that branch repeat until you find the end of the branches. Nodes can be
1023 marked as "accepting" meaning they represent a complete word. Eg:
1027 would convert into the following structure. Numbers represent states, letters
1028 following numbers represent valid transitions on the letter from that state, if
1029 the number is in square brackets it represents an accepting state, otherwise it
1030 will be in parenthesis.
1032 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1036 (1) +-i->(6)-+-s->[7]
1038 +-s->(3)-+-h->(4)-+-e->[5]
1040 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1042 This shows that when matching against the string 'hers' we will begin at state 1
1043 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1044 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1045 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1046 single traverse. We store a mapping from accepting to state to which word was
1047 matched, and then when we have multiple possibilities we try to complete the
1048 rest of the regex in the order in which they occured in the alternation.
1050 The only prior NFA like behaviour that would be changed by the TRIE support is
1051 the silent ignoring of duplicate alternations which are of the form:
1053 / (DUPE|DUPE) X? (?{ ... }) Y /x
1055 Thus EVAL blocks follwing a trie may be called a different number of times with
1056 and without the optimisation. With the optimisations dupes will be silently
1057 ignored. This inconsistant behaviour of EVAL type nodes is well established as
1058 the following demonstrates:
1060 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1062 which prints out 'word' three times, but
1064 'words'=~/(word|word|word)(?{ print $1 })S/
1066 which doesnt print it out at all. This is due to other optimisations kicking in.
1068 Example of what happens on a structural level:
1070 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1072 1: CURLYM[1] {1,32767}(18)
1083 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1084 and should turn into:
1086 1: CURLYM[1] {1,32767}(18)
1088 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1096 Cases where tail != last would be like /(?foo|bar)baz/:
1106 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1107 and would end up looking like:
1110 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1117 d = uvuni_to_utf8_flags(d, uv, 0);
1119 is the recommended Unicode-aware way of saying
1124 #define TRIE_STORE_REVCHAR \
1126 SV *tmp = newSVpvs(""); \
1127 if (UTF) SvUTF8_on(tmp); \
1128 Perl_sv_catpvf( aTHX_ tmp, "%c", (int)uvc ); \
1129 av_push( TRIE_REVCHARMAP(trie), tmp ); \
1132 #define TRIE_READ_CHAR STMT_START { \
1136 if ( foldlen > 0 ) { \
1137 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1142 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1143 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1144 foldlen -= UNISKIP( uvc ); \
1145 scan = foldbuf + UNISKIP( uvc ); \
1148 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1158 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1159 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1160 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1161 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1163 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1164 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1165 TRIE_LIST_CUR( state )++; \
1168 #define TRIE_LIST_NEW(state) STMT_START { \
1169 Newxz( trie->states[ state ].trans.list, \
1170 4, reg_trie_trans_le ); \
1171 TRIE_LIST_CUR( state ) = 1; \
1172 TRIE_LIST_LEN( state ) = 4; \
1175 #define TRIE_HANDLE_WORD(state) STMT_START { \
1176 U16 dupe= trie->states[ state ].wordnum; \
1177 regnode * const noper_next = regnext( noper ); \
1179 if (trie->wordlen) \
1180 trie->wordlen[ curword ] = wordlen; \
1182 /* store the word for dumping */ \
1184 if (OP(noper) != NOTHING) \
1185 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
1187 tmp = newSVpvn( "", 0 ); \
1188 if ( UTF ) SvUTF8_on( tmp ); \
1189 av_push( trie->words, tmp ); \
1194 if ( noper_next < tail ) { \
1196 Newxz( trie->jump, word_count + 1, U16); \
1197 trie->jump[curword] = (U16)(noper_next - convert); \
1199 jumper = noper_next; \
1201 nextbranch= regnext(cur); \
1205 /* So it's a dupe. This means we need to maintain a */\
1206 /* linked-list from the first to the next. */\
1207 /* we only allocate the nextword buffer when there */\
1208 /* a dupe, so first time we have to do the allocation */\
1209 if (!trie->nextword) \
1210 Newxz( trie->nextword, word_count + 1, U16); \
1211 while ( trie->nextword[dupe] ) \
1212 dupe= trie->nextword[dupe]; \
1213 trie->nextword[dupe]= curword; \
1215 /* we haven't inserted this word yet. */ \
1216 trie->states[ state ].wordnum = curword; \
1221 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1222 ( ( base + charid >= ucharcount \
1223 && base + charid < ubound \
1224 && state == trie->trans[ base - ucharcount + charid ].check \
1225 && trie->trans[ base - ucharcount + charid ].next ) \
1226 ? trie->trans[ base - ucharcount + charid ].next \
1227 : ( state==1 ? special : 0 ) \
1231 #define MADE_JUMP_TRIE 2
1232 #define MADE_EXACT_TRIE 4
1235 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1238 /* first pass, loop through and scan words */
1239 reg_trie_data *trie;
1241 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1246 regnode *jumper = NULL;
1247 regnode *nextbranch = NULL;
1248 regnode *convert = NULL;
1249 /* we just use folder as a flag in utf8 */
1250 const U8 * const folder = ( flags == EXACTF
1252 : ( flags == EXACTFL
1258 const U32 data_slot = add_data( pRExC_state, 1, "t" );
1259 SV *re_trie_maxbuff;
1261 /* these are only used during construction but are useful during
1262 * debugging so we store them in the struct when debugging.
1264 STRLEN trie_charcount=0;
1265 AV *trie_revcharmap;
1267 GET_RE_DEBUG_FLAGS_DECL;
1269 PERL_UNUSED_ARG(depth);
1272 Newxz( trie, 1, reg_trie_data );
1274 trie->startstate = 1;
1275 trie->wordcount = word_count;
1276 RExC_rx->data->data[ data_slot ] = (void*)trie;
1277 Newxz( trie->charmap, 256, U16 );
1278 if (!(UTF && folder))
1279 Newxz( trie->bitmap, ANYOF_BITMAP_SIZE, char );
1281 trie->words = newAV();
1283 TRIE_REVCHARMAP(trie) = newAV();
1285 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1286 if (!SvIOK(re_trie_maxbuff)) {
1287 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1290 PerlIO_printf( Perl_debug_log,
1291 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1292 (int)depth * 2 + 2, "",
1293 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1294 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1298 /* Find the node we are going to overwrite */
1299 if ( first == startbranch && OP( last ) != BRANCH ) {
1300 /* whole branch chain */
1303 /* branch sub-chain */
1304 convert = NEXTOPER( first );
1307 /* -- First loop and Setup --
1309 We first traverse the branches and scan each word to determine if it
1310 contains widechars, and how many unique chars there are, this is
1311 important as we have to build a table with at least as many columns as we
1314 We use an array of integers to represent the character codes 0..255
1315 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1316 native representation of the character value as the key and IV's for the
1319 *TODO* If we keep track of how many times each character is used we can
1320 remap the columns so that the table compression later on is more
1321 efficient in terms of memory by ensuring most common value is in the
1322 middle and the least common are on the outside. IMO this would be better
1323 than a most to least common mapping as theres a decent chance the most
1324 common letter will share a node with the least common, meaning the node
1325 will not be compressable. With a middle is most common approach the worst
1326 case is when we have the least common nodes twice.
1330 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1331 regnode * const noper = NEXTOPER( cur );
1332 const U8 *uc = (U8*)STRING( noper );
1333 const U8 * const e = uc + STR_LEN( noper );
1335 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1336 const U8 *scan = (U8*)NULL;
1337 U32 wordlen = 0; /* required init */
1340 if (OP(noper) == NOTHING) {
1345 TRIE_BITMAP_SET(trie,*uc);
1346 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1348 for ( ; uc < e ; uc += len ) {
1349 TRIE_CHARCOUNT(trie)++;
1353 if ( !trie->charmap[ uvc ] ) {
1354 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1356 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1361 if ( !trie->widecharmap )
1362 trie->widecharmap = newHV();
1364 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1367 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1369 if ( !SvTRUE( *svpp ) ) {
1370 sv_setiv( *svpp, ++trie->uniquecharcount );
1375 if( cur == first ) {
1378 } else if (chars < trie->minlen) {
1380 } else if (chars > trie->maxlen) {
1384 } /* end first pass */
1385 DEBUG_TRIE_COMPILE_r(
1386 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1387 (int)depth * 2 + 2,"",
1388 ( trie->widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1389 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1390 (int)trie->minlen, (int)trie->maxlen )
1392 Newxz( trie->wordlen, word_count, U32 );
1395 We now know what we are dealing with in terms of unique chars and
1396 string sizes so we can calculate how much memory a naive
1397 representation using a flat table will take. If it's over a reasonable
1398 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1399 conservative but potentially much slower representation using an array
1402 At the end we convert both representations into the same compressed
1403 form that will be used in regexec.c for matching with. The latter
1404 is a form that cannot be used to construct with but has memory
1405 properties similar to the list form and access properties similar
1406 to the table form making it both suitable for fast searches and
1407 small enough that its feasable to store for the duration of a program.
1409 See the comment in the code where the compressed table is produced
1410 inplace from the flat tabe representation for an explanation of how
1411 the compression works.
1416 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1418 Second Pass -- Array Of Lists Representation
1420 Each state will be represented by a list of charid:state records
1421 (reg_trie_trans_le) the first such element holds the CUR and LEN
1422 points of the allocated array. (See defines above).
1424 We build the initial structure using the lists, and then convert
1425 it into the compressed table form which allows faster lookups
1426 (but cant be modified once converted).
1429 STRLEN transcount = 1;
1431 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1432 "%*sCompiling trie using list compiler\n",
1433 (int)depth * 2 + 2, ""));
1435 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
1439 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1441 regnode * const noper = NEXTOPER( cur );
1442 U8 *uc = (U8*)STRING( noper );
1443 const U8 * const e = uc + STR_LEN( noper );
1444 U32 state = 1; /* required init */
1445 U16 charid = 0; /* sanity init */
1446 U8 *scan = (U8*)NULL; /* sanity init */
1447 STRLEN foldlen = 0; /* required init */
1448 U32 wordlen = 0; /* required init */
1449 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1451 if (OP(noper) != NOTHING) {
1452 for ( ; uc < e ; uc += len ) {
1457 charid = trie->charmap[ uvc ];
1459 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1463 charid=(U16)SvIV( *svpp );
1466 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1473 if ( !trie->states[ state ].trans.list ) {
1474 TRIE_LIST_NEW( state );
1476 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1477 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1478 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1483 newstate = next_alloc++;
1484 TRIE_LIST_PUSH( state, charid, newstate );
1489 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1493 TRIE_HANDLE_WORD(state);
1495 } /* end second pass */
1497 /* next alloc is the NEXT state to be allocated */
1498 trie->statecount = next_alloc;
1499 Renew( trie->states, next_alloc, reg_trie_state );
1501 /* and now dump it out before we compress it */
1502 DEBUG_TRIE_COMPILE_MORE_r(
1503 dump_trie_interim_list(trie,next_alloc,depth+1)
1506 Newxz( trie->trans, transcount ,reg_trie_trans );
1513 for( state=1 ; state < next_alloc ; state ++ ) {
1517 DEBUG_TRIE_COMPILE_MORE_r(
1518 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1522 if (trie->states[state].trans.list) {
1523 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1527 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1528 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1529 if ( forid < minid ) {
1531 } else if ( forid > maxid ) {
1535 if ( transcount < tp + maxid - minid + 1) {
1537 Renew( trie->trans, transcount, reg_trie_trans );
1538 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1540 base = trie->uniquecharcount + tp - minid;
1541 if ( maxid == minid ) {
1543 for ( ; zp < tp ; zp++ ) {
1544 if ( ! trie->trans[ zp ].next ) {
1545 base = trie->uniquecharcount + zp - minid;
1546 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1547 trie->trans[ zp ].check = state;
1553 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1554 trie->trans[ tp ].check = state;
1559 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1560 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1561 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1562 trie->trans[ tid ].check = state;
1564 tp += ( maxid - minid + 1 );
1566 Safefree(trie->states[ state ].trans.list);
1569 DEBUG_TRIE_COMPILE_MORE_r(
1570 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1573 trie->states[ state ].trans.base=base;
1575 trie->lasttrans = tp + 1;
1579 Second Pass -- Flat Table Representation.
1581 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1582 We know that we will need Charcount+1 trans at most to store the data
1583 (one row per char at worst case) So we preallocate both structures
1584 assuming worst case.
1586 We then construct the trie using only the .next slots of the entry
1589 We use the .check field of the first entry of the node temporarily to
1590 make compression both faster and easier by keeping track of how many non
1591 zero fields are in the node.
1593 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1596 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1597 number representing the first entry of the node, and state as a
1598 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1599 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1600 are 2 entrys per node. eg:
1608 The table is internally in the right hand, idx form. However as we also
1609 have to deal with the states array which is indexed by nodenum we have to
1610 use TRIE_NODENUM() to convert.
1613 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1614 "%*sCompiling trie using table compiler\n",
1615 (int)depth * 2 + 2, ""));
1617 Newxz( trie->trans, ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1,
1619 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
1620 next_alloc = trie->uniquecharcount + 1;
1623 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1625 regnode * const noper = NEXTOPER( cur );
1626 const U8 *uc = (U8*)STRING( noper );
1627 const U8 * const e = uc + STR_LEN( noper );
1629 U32 state = 1; /* required init */
1631 U16 charid = 0; /* sanity init */
1632 U32 accept_state = 0; /* sanity init */
1633 U8 *scan = (U8*)NULL; /* sanity init */
1635 STRLEN foldlen = 0; /* required init */
1636 U32 wordlen = 0; /* required init */
1637 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1639 if ( OP(noper) != NOTHING ) {
1640 for ( ; uc < e ; uc += len ) {
1645 charid = trie->charmap[ uvc ];
1647 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1648 charid = svpp ? (U16)SvIV(*svpp) : 0;
1652 if ( !trie->trans[ state + charid ].next ) {
1653 trie->trans[ state + charid ].next = next_alloc;
1654 trie->trans[ state ].check++;
1655 next_alloc += trie->uniquecharcount;
1657 state = trie->trans[ state + charid ].next;
1659 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1661 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1664 accept_state = TRIE_NODENUM( state );
1665 TRIE_HANDLE_WORD(accept_state);
1667 } /* end second pass */
1669 /* and now dump it out before we compress it */
1670 DEBUG_TRIE_COMPILE_MORE_r(
1671 dump_trie_interim_table(trie,next_alloc,depth+1)
1676 * Inplace compress the table.*
1678 For sparse data sets the table constructed by the trie algorithm will
1679 be mostly 0/FAIL transitions or to put it another way mostly empty.
1680 (Note that leaf nodes will not contain any transitions.)
1682 This algorithm compresses the tables by eliminating most such
1683 transitions, at the cost of a modest bit of extra work during lookup:
1685 - Each states[] entry contains a .base field which indicates the
1686 index in the state[] array wheres its transition data is stored.
1688 - If .base is 0 there are no valid transitions from that node.
1690 - If .base is nonzero then charid is added to it to find an entry in
1693 -If trans[states[state].base+charid].check!=state then the
1694 transition is taken to be a 0/Fail transition. Thus if there are fail
1695 transitions at the front of the node then the .base offset will point
1696 somewhere inside the previous nodes data (or maybe even into a node
1697 even earlier), but the .check field determines if the transition is
1701 The following process inplace converts the table to the compressed
1702 table: We first do not compress the root node 1,and mark its all its
1703 .check pointers as 1 and set its .base pointer as 1 as well. This
1704 allows to do a DFA construction from the compressed table later, and
1705 ensures that any .base pointers we calculate later are greater than
1708 - We set 'pos' to indicate the first entry of the second node.
1710 - We then iterate over the columns of the node, finding the first and
1711 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1712 and set the .check pointers accordingly, and advance pos
1713 appropriately and repreat for the next node. Note that when we copy
1714 the next pointers we have to convert them from the original
1715 NODEIDX form to NODENUM form as the former is not valid post
1718 - If a node has no transitions used we mark its base as 0 and do not
1719 advance the pos pointer.
1721 - If a node only has one transition we use a second pointer into the
1722 structure to fill in allocated fail transitions from other states.
1723 This pointer is independent of the main pointer and scans forward
1724 looking for null transitions that are allocated to a state. When it
1725 finds one it writes the single transition into the "hole". If the
1726 pointer doesnt find one the single transition is appended as normal.
1728 - Once compressed we can Renew/realloc the structures to release the
1731 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1732 specifically Fig 3.47 and the associated pseudocode.
1736 const U32 laststate = TRIE_NODENUM( next_alloc );
1739 trie->statecount = laststate;
1741 for ( state = 1 ; state < laststate ; state++ ) {
1743 const U32 stateidx = TRIE_NODEIDX( state );
1744 const U32 o_used = trie->trans[ stateidx ].check;
1745 U32 used = trie->trans[ stateidx ].check;
1746 trie->trans[ stateidx ].check = 0;
1748 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1749 if ( flag || trie->trans[ stateidx + charid ].next ) {
1750 if ( trie->trans[ stateidx + charid ].next ) {
1752 for ( ; zp < pos ; zp++ ) {
1753 if ( ! trie->trans[ zp ].next ) {
1757 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1758 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1759 trie->trans[ zp ].check = state;
1760 if ( ++zp > pos ) pos = zp;
1767 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1769 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1770 trie->trans[ pos ].check = state;
1775 trie->lasttrans = pos + 1;
1776 Renew( trie->states, laststate, reg_trie_state);
1777 DEBUG_TRIE_COMPILE_MORE_r(
1778 PerlIO_printf( Perl_debug_log,
1779 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1780 (int)depth * 2 + 2,"",
1781 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1784 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1787 } /* end table compress */
1789 DEBUG_TRIE_COMPILE_MORE_r(
1790 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1791 (int)depth * 2 + 2, "",
1792 (UV)trie->statecount,
1793 (UV)trie->lasttrans)
1795 /* resize the trans array to remove unused space */
1796 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
1798 /* and now dump out the compressed format */
1799 DEBUG_TRIE_COMPILE_r(
1800 dump_trie(trie,depth+1)
1803 { /* Modify the program and insert the new TRIE node*/
1804 U8 nodetype =(U8)(flags & 0xFF);
1808 regnode *optimize = NULL;
1810 U32 mjd_nodelen = 0;
1813 This means we convert either the first branch or the first Exact,
1814 depending on whether the thing following (in 'last') is a branch
1815 or not and whther first is the startbranch (ie is it a sub part of
1816 the alternation or is it the whole thing.)
1817 Assuming its a sub part we conver the EXACT otherwise we convert
1818 the whole branch sequence, including the first.
1820 /* Find the node we are going to overwrite */
1821 if ( first != startbranch || OP( last ) == BRANCH ) {
1822 /* branch sub-chain */
1823 NEXT_OFF( first ) = (U16)(last - first);
1825 mjd_offset= Node_Offset((convert));
1826 mjd_nodelen= Node_Length((convert));
1828 /* whole branch chain */
1831 const regnode *nop = NEXTOPER( convert );
1832 mjd_offset= Node_Offset((nop));
1833 mjd_nodelen= Node_Length((nop));
1838 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1839 (int)depth * 2 + 2, "",
1840 (UV)mjd_offset, (UV)mjd_nodelen)
1843 /* But first we check to see if there is a common prefix we can
1844 split out as an EXACT and put in front of the TRIE node. */
1845 trie->startstate= 1;
1846 if ( trie->bitmap && !trie->widecharmap && !trie->jump ) {
1848 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
1852 const U32 base = trie->states[ state ].trans.base;
1854 if ( trie->states[state].wordnum )
1857 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1858 if ( ( base + ofs >= trie->uniquecharcount ) &&
1859 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1860 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1862 if ( ++count > 1 ) {
1863 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), ofs, 0);
1864 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
1865 if ( state == 1 ) break;
1867 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1869 PerlIO_printf(Perl_debug_log,
1870 "%*sNew Start State=%"UVuf" Class: [",
1871 (int)depth * 2 + 2, "",
1874 SV ** const tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1875 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
1877 TRIE_BITMAP_SET(trie,*ch);
1879 TRIE_BITMAP_SET(trie, folder[ *ch ]);
1881 PerlIO_printf(Perl_debug_log, (char*)ch)
1885 TRIE_BITMAP_SET(trie,*ch);
1887 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1888 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1894 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1895 const char *ch = SvPV_nolen_const( *tmp );
1897 PerlIO_printf( Perl_debug_log,
1898 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1899 (int)depth * 2 + 2, "",
1900 (UV)state, (UV)idx, ch)
1903 OP( convert ) = nodetype;
1904 str=STRING(convert);
1913 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
1919 regnode *n = convert+NODE_SZ_STR(convert);
1920 NEXT_OFF(convert) = NODE_SZ_STR(convert);
1921 trie->startstate = state;
1922 trie->minlen -= (state - 1);
1923 trie->maxlen -= (state - 1);
1925 regnode *fix = convert;
1927 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1928 while( ++fix < n ) {
1929 Set_Node_Offset_Length(fix, 0, 0);
1935 NEXT_OFF(convert) = (U16)(tail - convert);
1936 DEBUG_r(optimize= n);
1942 if ( trie->maxlen ) {
1943 NEXT_OFF( convert ) = (U16)(tail - convert);
1944 ARG_SET( convert, data_slot );
1945 /* Store the offset to the first unabsorbed branch in
1946 jump[0], which is otherwise unused by the jump logic.
1947 We use this when dumping a trie and during optimisation. */
1949 trie->jump[0] = (U16)(nextbranch - convert);
1952 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1953 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
1955 OP( convert ) = TRIEC;
1956 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
1957 Safefree(trie->bitmap);
1960 OP( convert ) = TRIE;
1962 /* store the type in the flags */
1963 convert->flags = nodetype;
1967 + regarglen[ OP( convert ) ];
1969 /* XXX We really should free up the resource in trie now,
1970 as we won't use them - (which resources?) dmq */
1972 /* needed for dumping*/
1973 DEBUG_r(if (optimize) {
1974 regnode *opt = convert;
1975 while ( ++opt < optimize) {
1976 Set_Node_Offset_Length(opt,0,0);
1979 Try to clean up some of the debris left after the
1982 while( optimize < jumper ) {
1983 mjd_nodelen += Node_Length((optimize));
1984 OP( optimize ) = OPTIMIZED;
1985 Set_Node_Offset_Length(optimize,0,0);
1988 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
1990 } /* end node insert */
1992 SvREFCNT_dec(TRIE_REVCHARMAP(trie));
1996 : trie->startstate>1
2002 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2004 /* The Trie is constructed and compressed now so we can build a fail array now if its needed
2006 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2007 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2010 We find the fail state for each state in the trie, this state is the longest proper
2011 suffix of the current states 'word' that is also a proper prefix of another word in our
2012 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2013 the DFA not to have to restart after its tried and failed a word at a given point, it
2014 simply continues as though it had been matching the other word in the first place.
2016 'abcdgu'=~/abcdefg|cdgu/
2017 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2018 fail, which would bring use to the state representing 'd' in the second word where we would
2019 try 'g' and succeed, prodceding to match 'cdgu'.
2021 /* add a fail transition */
2022 reg_trie_data *trie=(reg_trie_data *)RExC_rx->data->data[ARG(source)];
2024 const U32 ucharcount = trie->uniquecharcount;
2025 const U32 numstates = trie->statecount;
2026 const U32 ubound = trie->lasttrans + ucharcount;
2030 U32 base = trie->states[ 1 ].trans.base;
2033 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2034 GET_RE_DEBUG_FLAGS_DECL;
2036 PERL_UNUSED_ARG(depth);
2040 ARG_SET( stclass, data_slot );
2041 Newxz( aho, 1, reg_ac_data );
2042 RExC_rx->data->data[ data_slot ] = (void*)aho;
2044 aho->states=(reg_trie_state *)savepvn((const char*)trie->states,
2045 numstates * sizeof(reg_trie_state));
2046 Newxz( q, numstates, U32);
2047 Newxz( aho->fail, numstates, U32 );
2050 /* initialize fail[0..1] to be 1 so that we always have
2051 a valid final fail state */
2052 fail[ 0 ] = fail[ 1 ] = 1;
2054 for ( charid = 0; charid < ucharcount ; charid++ ) {
2055 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2057 q[ q_write ] = newstate;
2058 /* set to point at the root */
2059 fail[ q[ q_write++ ] ]=1;
2062 while ( q_read < q_write) {
2063 const U32 cur = q[ q_read++ % numstates ];
2064 base = trie->states[ cur ].trans.base;
2066 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2067 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2069 U32 fail_state = cur;
2072 fail_state = fail[ fail_state ];
2073 fail_base = aho->states[ fail_state ].trans.base;
2074 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2076 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2077 fail[ ch_state ] = fail_state;
2078 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2080 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2082 q[ q_write++ % numstates] = ch_state;
2086 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2087 when we fail in state 1, this allows us to use the
2088 charclass scan to find a valid start char. This is based on the principle
2089 that theres a good chance the string being searched contains lots of stuff
2090 that cant be a start char.
2092 fail[ 0 ] = fail[ 1 ] = 0;
2093 DEBUG_TRIE_COMPILE_r({
2094 PerlIO_printf(Perl_debug_log,
2095 "%*sStclass Failtable (%"UVuf" states): 0",
2096 (int)(depth * 2), "", (UV)numstates
2098 for( q_read=1; q_read<numstates; q_read++ ) {
2099 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2101 PerlIO_printf(Perl_debug_log, "\n");
2104 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2109 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2110 * These need to be revisited when a newer toolchain becomes available.
2112 #if defined(__sparc64__) && defined(__GNUC__)
2113 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2114 # undef SPARC64_GCC_WORKAROUND
2115 # define SPARC64_GCC_WORKAROUND 1
2119 #define DEBUG_PEEP(str,scan,depth) \
2120 DEBUG_OPTIMISE_r({ \
2121 SV * const mysv=sv_newmortal(); \
2122 regnode *Next = regnext(scan); \
2123 regprop(RExC_rx, mysv, scan); \
2124 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2125 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2126 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2133 #define JOIN_EXACT(scan,min,flags) \
2134 if (PL_regkind[OP(scan)] == EXACT) \
2135 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2138 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2139 /* Merge several consecutive EXACTish nodes into one. */
2140 regnode *n = regnext(scan);
2142 regnode *next = scan + NODE_SZ_STR(scan);
2146 regnode *stop = scan;
2147 GET_RE_DEBUG_FLAGS_DECL;
2149 PERL_UNUSED_ARG(depth);
2151 #ifndef EXPERIMENTAL_INPLACESCAN
2152 PERL_UNUSED_ARG(flags);
2153 PERL_UNUSED_ARG(val);
2155 DEBUG_PEEP("join",scan,depth);
2157 /* Skip NOTHING, merge EXACT*. */
2159 ( PL_regkind[OP(n)] == NOTHING ||
2160 (stringok && (OP(n) == OP(scan))))
2162 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2164 if (OP(n) == TAIL || n > next)
2166 if (PL_regkind[OP(n)] == NOTHING) {
2167 DEBUG_PEEP("skip:",n,depth);
2168 NEXT_OFF(scan) += NEXT_OFF(n);
2169 next = n + NODE_STEP_REGNODE;
2176 else if (stringok) {
2177 const unsigned int oldl = STR_LEN(scan);
2178 regnode * const nnext = regnext(n);
2180 DEBUG_PEEP("merg",n,depth);
2183 if (oldl + STR_LEN(n) > U8_MAX)
2185 NEXT_OFF(scan) += NEXT_OFF(n);
2186 STR_LEN(scan) += STR_LEN(n);
2187 next = n + NODE_SZ_STR(n);
2188 /* Now we can overwrite *n : */
2189 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2197 #ifdef EXPERIMENTAL_INPLACESCAN
2198 if (flags && !NEXT_OFF(n)) {
2199 DEBUG_PEEP("atch", val, depth);
2200 if (reg_off_by_arg[OP(n)]) {
2201 ARG_SET(n, val - n);
2204 NEXT_OFF(n) = val - n;
2211 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2213 Two problematic code points in Unicode casefolding of EXACT nodes:
2215 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2216 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2222 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2223 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2225 This means that in case-insensitive matching (or "loose matching",
2226 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2227 length of the above casefolded versions) can match a target string
2228 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2229 This would rather mess up the minimum length computation.
2231 What we'll do is to look for the tail four bytes, and then peek
2232 at the preceding two bytes to see whether we need to decrease
2233 the minimum length by four (six minus two).
2235 Thanks to the design of UTF-8, there cannot be false matches:
2236 A sequence of valid UTF-8 bytes cannot be a subsequence of
2237 another valid sequence of UTF-8 bytes.
2240 char * const s0 = STRING(scan), *s, *t;
2241 char * const s1 = s0 + STR_LEN(scan) - 1;
2242 char * const s2 = s1 - 4;
2243 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2244 const char t0[] = "\xaf\x49\xaf\x42";
2246 const char t0[] = "\xcc\x88\xcc\x81";
2248 const char * const t1 = t0 + 3;
2251 s < s2 && (t = ninstr(s, s1, t0, t1));
2254 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2255 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2257 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2258 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2266 n = scan + NODE_SZ_STR(scan);
2268 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2275 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2279 /* REx optimizer. Converts nodes into quickier variants "in place".
2280 Finds fixed substrings. */
2282 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2283 to the position after last scanned or to NULL. */
2285 #define INIT_AND_WITHP \
2286 assert(!and_withp); \
2287 Newx(and_withp,1,struct regnode_charclass_class); \
2288 SAVEFREEPV(and_withp)
2291 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2292 I32 *minlenp, I32 *deltap,
2297 struct regnode_charclass_class *and_withp,
2298 U32 flags, U32 depth)
2299 /* scanp: Start here (read-write). */
2300 /* deltap: Write maxlen-minlen here. */
2301 /* last: Stop before this one. */
2302 /* data: string data about the pattern */
2303 /* stopparen: treat close N as END */
2304 /* recursed: which subroutines have we recursed into */
2305 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2308 I32 min = 0, pars = 0, code;
2309 regnode *scan = *scanp, *next;
2311 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2312 int is_inf_internal = 0; /* The studied chunk is infinite */
2313 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2314 scan_data_t data_fake;
2315 SV *re_trie_maxbuff = NULL;
2316 regnode *first_non_open = scan;
2317 I32 stopmin = I32_MAX;
2318 GET_RE_DEBUG_FLAGS_DECL;
2320 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2324 while (first_non_open && OP(first_non_open) == OPEN)
2325 first_non_open=regnext(first_non_open);
2329 while (scan && OP(scan) != END && scan < last) {
2330 /* Peephole optimizer: */
2331 DEBUG_STUDYDATA(data,depth);
2332 DEBUG_PEEP("Peep",scan,depth);
2333 JOIN_EXACT(scan,&min,0);
2335 /* Follow the next-chain of the current node and optimize
2336 away all the NOTHINGs from it. */
2337 if (OP(scan) != CURLYX) {
2338 const int max = (reg_off_by_arg[OP(scan)]
2340 /* I32 may be smaller than U16 on CRAYs! */
2341 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2342 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2346 /* Skip NOTHING and LONGJMP. */
2347 while ((n = regnext(n))
2348 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2349 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2350 && off + noff < max)
2352 if (reg_off_by_arg[OP(scan)])
2355 NEXT_OFF(scan) = off;
2360 /* The principal pseudo-switch. Cannot be a switch, since we
2361 look into several different things. */
2362 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2363 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
2364 next = regnext(scan);
2366 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2368 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
2369 /* NOTE - There is similar code to this block below for handling
2370 TRIE nodes on a re-study. If you change stuff here check there
2372 I32 max1 = 0, min1 = I32_MAX, num = 0;
2373 struct regnode_charclass_class accum;
2374 regnode * const startbranch=scan;
2376 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
2377 scan_commit(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2378 if (flags & SCF_DO_STCLASS)
2379 cl_init_zero(pRExC_state, &accum);
2381 while (OP(scan) == code) {
2382 I32 deltanext, minnext, f = 0, fake;
2383 struct regnode_charclass_class this_class;
2386 data_fake.flags = 0;
2388 data_fake.whilem_c = data->whilem_c;
2389 data_fake.last_closep = data->last_closep;
2392 data_fake.last_closep = &fake;
2393 next = regnext(scan);
2394 scan = NEXTOPER(scan);
2396 scan = NEXTOPER(scan);
2397 if (flags & SCF_DO_STCLASS) {
2398 cl_init(pRExC_state, &this_class);
2399 data_fake.start_class = &this_class;
2400 f = SCF_DO_STCLASS_AND;
2402 if (flags & SCF_WHILEM_VISITED_POS)
2403 f |= SCF_WHILEM_VISITED_POS;
2405 /* we suppose the run is continuous, last=next...*/
2406 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2408 stopparen, recursed, NULL, f,depth+1);
2411 if (max1 < minnext + deltanext)
2412 max1 = minnext + deltanext;
2413 if (deltanext == I32_MAX)
2414 is_inf = is_inf_internal = 1;
2416 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2418 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2419 if ( stopmin > minnext)
2420 stopmin = min + min1;
2421 flags &= ~SCF_DO_SUBSTR;
2423 data->flags |= SCF_SEEN_ACCEPT;
2426 if (data_fake.flags & SF_HAS_EVAL)
2427 data->flags |= SF_HAS_EVAL;
2428 data->whilem_c = data_fake.whilem_c;
2430 if (flags & SCF_DO_STCLASS)
2431 cl_or(pRExC_state, &accum, &this_class);
2432 if (code == SUSPEND)
2435 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2437 if (flags & SCF_DO_SUBSTR) {
2438 data->pos_min += min1;
2439 data->pos_delta += max1 - min1;
2440 if (max1 != min1 || is_inf)
2441 data->longest = &(data->longest_float);
2444 delta += max1 - min1;
2445 if (flags & SCF_DO_STCLASS_OR) {
2446 cl_or(pRExC_state, data->start_class, &accum);
2448 cl_and(data->start_class, and_withp);
2449 flags &= ~SCF_DO_STCLASS;
2452 else if (flags & SCF_DO_STCLASS_AND) {
2454 cl_and(data->start_class, &accum);
2455 flags &= ~SCF_DO_STCLASS;
2458 /* Switch to OR mode: cache the old value of
2459 * data->start_class */
2461 StructCopy(data->start_class, and_withp,
2462 struct regnode_charclass_class);
2463 flags &= ~SCF_DO_STCLASS_AND;
2464 StructCopy(&accum, data->start_class,
2465 struct regnode_charclass_class);
2466 flags |= SCF_DO_STCLASS_OR;
2467 data->start_class->flags |= ANYOF_EOS;
2471 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2474 Assuming this was/is a branch we are dealing with: 'scan' now
2475 points at the item that follows the branch sequence, whatever
2476 it is. We now start at the beginning of the sequence and look
2483 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2485 If we can find such a subseqence we need to turn the first
2486 element into a trie and then add the subsequent branch exact
2487 strings to the trie.
2491 1. patterns where the whole set of branch can be converted.
2493 2. patterns where only a subset can be converted.
2495 In case 1 we can replace the whole set with a single regop
2496 for the trie. In case 2 we need to keep the start and end
2499 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2500 becomes BRANCH TRIE; BRANCH X;
2502 There is an additional case, that being where there is a
2503 common prefix, which gets split out into an EXACT like node
2504 preceding the TRIE node.
2506 If x(1..n)==tail then we can do a simple trie, if not we make
2507 a "jump" trie, such that when we match the appropriate word
2508 we "jump" to the appopriate tail node. Essentailly we turn
2509 a nested if into a case structure of sorts.
2514 if (!re_trie_maxbuff) {
2515 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2516 if (!SvIOK(re_trie_maxbuff))
2517 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2519 if ( SvIV(re_trie_maxbuff)>=0 ) {
2521 regnode *first = (regnode *)NULL;
2522 regnode *last = (regnode *)NULL;
2523 regnode *tail = scan;
2528 SV * const mysv = sv_newmortal(); /* for dumping */
2530 /* var tail is used because there may be a TAIL
2531 regop in the way. Ie, the exacts will point to the
2532 thing following the TAIL, but the last branch will
2533 point at the TAIL. So we advance tail. If we
2534 have nested (?:) we may have to move through several
2538 while ( OP( tail ) == TAIL ) {
2539 /* this is the TAIL generated by (?:) */
2540 tail = regnext( tail );
2545 regprop(RExC_rx, mysv, tail );
2546 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2547 (int)depth * 2 + 2, "",
2548 "Looking for TRIE'able sequences. Tail node is: ",
2549 SvPV_nolen_const( mysv )
2555 step through the branches, cur represents each
2556 branch, noper is the first thing to be matched
2557 as part of that branch and noper_next is the
2558 regnext() of that node. if noper is an EXACT
2559 and noper_next is the same as scan (our current
2560 position in the regex) then the EXACT branch is
2561 a possible optimization target. Once we have
2562 two or more consequetive such branches we can
2563 create a trie of the EXACT's contents and stich
2564 it in place. If the sequence represents all of
2565 the branches we eliminate the whole thing and
2566 replace it with a single TRIE. If it is a
2567 subsequence then we need to stitch it in. This
2568 means the first branch has to remain, and needs
2569 to be repointed at the item on the branch chain
2570 following the last branch optimized. This could
2571 be either a BRANCH, in which case the
2572 subsequence is internal, or it could be the
2573 item following the branch sequence in which
2574 case the subsequence is at the end.
2578 /* dont use tail as the end marker for this traverse */
2579 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2580 regnode * const noper = NEXTOPER( cur );
2581 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
2582 regnode * const noper_next = regnext( noper );
2586 regprop(RExC_rx, mysv, cur);
2587 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2588 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2590 regprop(RExC_rx, mysv, noper);
2591 PerlIO_printf( Perl_debug_log, " -> %s",
2592 SvPV_nolen_const(mysv));
2595 regprop(RExC_rx, mysv, noper_next );
2596 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2597 SvPV_nolen_const(mysv));
2599 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2600 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2602 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2603 : PL_regkind[ OP( noper ) ] == EXACT )
2604 || OP(noper) == NOTHING )
2606 && noper_next == tail
2611 if ( !first || optype == NOTHING ) {
2612 if (!first) first = cur;
2613 optype = OP( noper );
2619 make_trie( pRExC_state,
2620 startbranch, first, cur, tail, count,
2623 if ( PL_regkind[ OP( noper ) ] == EXACT
2625 && noper_next == tail
2630 optype = OP( noper );
2640 regprop(RExC_rx, mysv, cur);
2641 PerlIO_printf( Perl_debug_log,
2642 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2643 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2647 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
2648 #ifdef TRIE_STUDY_OPT
2649 if ( ((made == MADE_EXACT_TRIE &&
2650 startbranch == first)
2651 || ( first_non_open == first )) &&
2653 flags |= SCF_TRIE_RESTUDY;
2661 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2662 scan = NEXTOPER(NEXTOPER(scan));
2663 } else /* single branch is optimized. */
2664 scan = NEXTOPER(scan);
2667 else if (OP(scan) == EXACT) {
2668 I32 l = STR_LEN(scan);
2671 const U8 * const s = (U8*)STRING(scan);
2672 l = utf8_length(s, s + l);
2673 uc = utf8_to_uvchr(s, NULL);
2675 uc = *((U8*)STRING(scan));
2678 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2679 /* The code below prefers earlier match for fixed
2680 offset, later match for variable offset. */
2681 if (data->last_end == -1) { /* Update the start info. */
2682 data->last_start_min = data->pos_min;
2683 data->last_start_max = is_inf
2684 ? I32_MAX : data->pos_min + data->pos_delta;
2686 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2688 SvUTF8_on(data->last_found);
2690 SV * const sv = data->last_found;
2691 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2692 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2693 if (mg && mg->mg_len >= 0)
2694 mg->mg_len += utf8_length((U8*)STRING(scan),
2695 (U8*)STRING(scan)+STR_LEN(scan));
2697 data->last_end = data->pos_min + l;
2698 data->pos_min += l; /* As in the first entry. */
2699 data->flags &= ~SF_BEFORE_EOL;
2701 if (flags & SCF_DO_STCLASS_AND) {
2702 /* Check whether it is compatible with what we know already! */
2706 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2707 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2708 && (!(data->start_class->flags & ANYOF_FOLD)
2709 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2712 ANYOF_CLASS_ZERO(data->start_class);
2713 ANYOF_BITMAP_ZERO(data->start_class);
2715 ANYOF_BITMAP_SET(data->start_class, uc);
2716 data->start_class->flags &= ~ANYOF_EOS;
2718 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2720 else if (flags & SCF_DO_STCLASS_OR) {
2721 /* false positive possible if the class is case-folded */
2723 ANYOF_BITMAP_SET(data->start_class, uc);
2725 data->start_class->flags |= ANYOF_UNICODE_ALL;
2726 data->start_class->flags &= ~ANYOF_EOS;
2727 cl_and(data->start_class, and_withp);
2729 flags &= ~SCF_DO_STCLASS;
2731 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
2732 I32 l = STR_LEN(scan);
2733 UV uc = *((U8*)STRING(scan));
2735 /* Search for fixed substrings supports EXACT only. */
2736 if (flags & SCF_DO_SUBSTR) {
2738 scan_commit(pRExC_state, data, minlenp);
2741 const U8 * const s = (U8 *)STRING(scan);
2742 l = utf8_length(s, s + l);
2743 uc = utf8_to_uvchr(s, NULL);
2746 if (flags & SCF_DO_SUBSTR)
2748 if (flags & SCF_DO_STCLASS_AND) {
2749 /* Check whether it is compatible with what we know already! */
2753 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2754 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2755 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2757 ANYOF_CLASS_ZERO(data->start_class);
2758 ANYOF_BITMAP_ZERO(data->start_class);
2760 ANYOF_BITMAP_SET(data->start_class, uc);
2761 data->start_class->flags &= ~ANYOF_EOS;
2762 data->start_class->flags |= ANYOF_FOLD;
2763 if (OP(scan) == EXACTFL)
2764 data->start_class->flags |= ANYOF_LOCALE;
2767 else if (flags & SCF_DO_STCLASS_OR) {
2768 if (data->start_class->flags & ANYOF_FOLD) {
2769 /* false positive possible if the class is case-folded.
2770 Assume that the locale settings are the same... */
2772 ANYOF_BITMAP_SET(data->start_class, uc);
2773 data->start_class->flags &= ~ANYOF_EOS;
2775 cl_and(data->start_class, and_withp);
2777 flags &= ~SCF_DO_STCLASS;
2779 else if (strchr((const char*)PL_varies,OP(scan))) {
2780 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2781 I32 f = flags, pos_before = 0;
2782 regnode * const oscan = scan;
2783 struct regnode_charclass_class this_class;
2784 struct regnode_charclass_class *oclass = NULL;
2785 I32 next_is_eval = 0;
2787 switch (PL_regkind[OP(scan)]) {
2788 case WHILEM: /* End of (?:...)* . */
2789 scan = NEXTOPER(scan);
2792 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2793 next = NEXTOPER(scan);
2794 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2796 maxcount = REG_INFTY;
2797 next = regnext(scan);
2798 scan = NEXTOPER(scan);
2802 if (flags & SCF_DO_SUBSTR)
2807 if (flags & SCF_DO_STCLASS) {
2809 maxcount = REG_INFTY;
2810 next = regnext(scan);
2811 scan = NEXTOPER(scan);
2814 is_inf = is_inf_internal = 1;
2815 scan = regnext(scan);
2816 if (flags & SCF_DO_SUBSTR) {
2817 scan_commit(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
2818 data->longest = &(data->longest_float);
2820 goto optimize_curly_tail;
2822 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
2823 && (scan->flags == stopparen))
2828 mincount = ARG1(scan);
2829 maxcount = ARG2(scan);
2831 next = regnext(scan);
2832 if (OP(scan) == CURLYX) {
2833 I32 lp = (data ? *(data->last_closep) : 0);
2834 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
2836 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2837 next_is_eval = (OP(scan) == EVAL);
2839 if (flags & SCF_DO_SUBSTR) {
2840 if (mincount == 0) scan_commit(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
2841 pos_before = data->pos_min;
2845 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2847 data->flags |= SF_IS_INF;
2849 if (flags & SCF_DO_STCLASS) {
2850 cl_init(pRExC_state, &this_class);
2851 oclass = data->start_class;
2852 data->start_class = &this_class;
2853 f |= SCF_DO_STCLASS_AND;
2854 f &= ~SCF_DO_STCLASS_OR;
2856 /* These are the cases when once a subexpression
2857 fails at a particular position, it cannot succeed
2858 even after backtracking at the enclosing scope.
2860 XXXX what if minimal match and we are at the
2861 initial run of {n,m}? */
2862 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2863 f &= ~SCF_WHILEM_VISITED_POS;
2865 /* This will finish on WHILEM, setting scan, or on NULL: */
2866 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2867 last, data, stopparen, recursed, NULL,
2869 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
2871 if (flags & SCF_DO_STCLASS)
2872 data->start_class = oclass;
2873 if (mincount == 0 || minnext == 0) {
2874 if (flags & SCF_DO_STCLASS_OR) {
2875 cl_or(pRExC_state, data->start_class, &this_class);
2877 else if (flags & SCF_DO_STCLASS_AND) {
2878 /* Switch to OR mode: cache the old value of
2879 * data->start_class */
2881 StructCopy(data->start_class, and_withp,
2882 struct regnode_charclass_class);
2883 flags &= ~SCF_DO_STCLASS_AND;
2884 StructCopy(&this_class, data->start_class,
2885 struct regnode_charclass_class);
2886 flags |= SCF_DO_STCLASS_OR;
2887 data->start_class->flags |= ANYOF_EOS;
2889 } else { /* Non-zero len */
2890 if (flags & SCF_DO_STCLASS_OR) {
2891 cl_or(pRExC_state, data->start_class, &this_class);
2892 cl_and(data->start_class, and_withp);
2894 else if (flags & SCF_DO_STCLASS_AND)
2895 cl_and(data->start_class, &this_class);
2896 flags &= ~SCF_DO_STCLASS;
2898 if (!scan) /* It was not CURLYX, but CURLY. */
2900 if ( /* ? quantifier ok, except for (?{ ... }) */
2901 (next_is_eval || !(mincount == 0 && maxcount == 1))
2902 && (minnext == 0) && (deltanext == 0)
2903 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
2904 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2905 && ckWARN(WARN_REGEXP))
2908 "Quantifier unexpected on zero-length expression");
2911 min += minnext * mincount;
2912 is_inf_internal |= ((maxcount == REG_INFTY
2913 && (minnext + deltanext) > 0)
2914 || deltanext == I32_MAX);
2915 is_inf |= is_inf_internal;
2916 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2918 /* Try powerful optimization CURLYX => CURLYN. */
2919 if ( OP(oscan) == CURLYX && data
2920 && data->flags & SF_IN_PAR
2921 && !(data->flags & SF_HAS_EVAL)
2922 && !deltanext && minnext == 1 ) {
2923 /* Try to optimize to CURLYN. */
2924 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
2925 regnode * const nxt1 = nxt;
2932 if (!strchr((const char*)PL_simple,OP(nxt))
2933 && !(PL_regkind[OP(nxt)] == EXACT
2934 && STR_LEN(nxt) == 1))
2940 if (OP(nxt) != CLOSE)
2942 if (RExC_open_parens) {
2943 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
2944 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
2946 /* Now we know that nxt2 is the only contents: */
2947 oscan->flags = (U8)ARG(nxt);
2949 OP(nxt1) = NOTHING; /* was OPEN. */
2952 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2953 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2954 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2955 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2956 OP(nxt + 1) = OPTIMIZED; /* was count. */
2957 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
2962 /* Try optimization CURLYX => CURLYM. */
2963 if ( OP(oscan) == CURLYX && data
2964 && !(data->flags & SF_HAS_PAR)
2965 && !(data->flags & SF_HAS_EVAL)
2966 && !deltanext /* atom is fixed width */
2967 && minnext != 0 /* CURLYM can't handle zero width */
2969 /* XXXX How to optimize if data == 0? */
2970 /* Optimize to a simpler form. */
2971 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2975 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
2976 && (OP(nxt2) != WHILEM))
2978 OP(nxt2) = SUCCEED; /* Whas WHILEM */
2979 /* Need to optimize away parenths. */
2980 if (data->flags & SF_IN_PAR) {
2981 /* Set the parenth number. */
2982 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2984 if (OP(nxt) != CLOSE)
2985 FAIL("Panic opt close");
2986 oscan->flags = (U8)ARG(nxt);
2987 if (RExC_open_parens) {
2988 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
2989 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
2991 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2992 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2995 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2996 OP(nxt + 1) = OPTIMIZED; /* was count. */
2997 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2998 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
3001 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3002 regnode *nnxt = regnext(nxt1);
3005 if (reg_off_by_arg[OP(nxt1)])
3006 ARG_SET(nxt1, nxt2 - nxt1);
3007 else if (nxt2 - nxt1 < U16_MAX)
3008 NEXT_OFF(nxt1) = nxt2 - nxt1;
3010 OP(nxt) = NOTHING; /* Cannot beautify */
3015 /* Optimize again: */
3016 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3017 NULL, stopparen, recursed, NULL, 0,depth+1);
3022 else if ((OP(oscan) == CURLYX)
3023 && (flags & SCF_WHILEM_VISITED_POS)
3024 /* See the comment on a similar expression above.
3025 However, this time it not a subexpression
3026 we care about, but the expression itself. */
3027 && (maxcount == REG_INFTY)
3028 && data && ++data->whilem_c < 16) {
3029 /* This stays as CURLYX, we can put the count/of pair. */
3030 /* Find WHILEM (as in regexec.c) */
3031 regnode *nxt = oscan + NEXT_OFF(oscan);
3033 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3035 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3036 | (RExC_whilem_seen << 4)); /* On WHILEM */
3038 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3040 if (flags & SCF_DO_SUBSTR) {
3041 SV *last_str = NULL;
3042 int counted = mincount != 0;
3044 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3045 #if defined(SPARC64_GCC_WORKAROUND)
3048 const char *s = NULL;
3051 if (pos_before >= data->last_start_min)
3054 b = data->last_start_min;
3057 s = SvPV_const(data->last_found, l);
3058 old = b - data->last_start_min;
3061 I32 b = pos_before >= data->last_start_min
3062 ? pos_before : data->last_start_min;
3064 const char * const s = SvPV_const(data->last_found, l);
3065 I32 old = b - data->last_start_min;
3069 old = utf8_hop((U8*)s, old) - (U8*)s;
3072 /* Get the added string: */
3073 last_str = newSVpvn(s + old, l);
3075 SvUTF8_on(last_str);
3076 if (deltanext == 0 && pos_before == b) {
3077 /* What was added is a constant string */
3079 SvGROW(last_str, (mincount * l) + 1);
3080 repeatcpy(SvPVX(last_str) + l,
3081 SvPVX_const(last_str), l, mincount - 1);
3082 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3083 /* Add additional parts. */
3084 SvCUR_set(data->last_found,
3085 SvCUR(data->last_found) - l);
3086 sv_catsv(data->last_found, last_str);
3088 SV * sv = data->last_found;
3090 SvUTF8(sv) && SvMAGICAL(sv) ?
3091 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3092 if (mg && mg->mg_len >= 0)
3093 mg->mg_len += CHR_SVLEN(last_str);
3095 data->last_end += l * (mincount - 1);
3098 /* start offset must point into the last copy */
3099 data->last_start_min += minnext * (mincount - 1);
3100 data->last_start_max += is_inf ? I32_MAX
3101 : (maxcount - 1) * (minnext + data->pos_delta);
3104 /* It is counted once already... */
3105 data->pos_min += minnext * (mincount - counted);
3106 data->pos_delta += - counted * deltanext +
3107 (minnext + deltanext) * maxcount - minnext * mincount;
3108 if (mincount != maxcount) {
3109 /* Cannot extend fixed substrings found inside
3111 scan_commit(pRExC_state,data,minlenp);
3112 if (mincount && last_str) {
3113 SV * const sv = data->last_found;
3114 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3115 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3119 sv_setsv(sv, last_str);
3120 data->last_end = data->pos_min;
3121 data->last_start_min =
3122 data->pos_min - CHR_SVLEN(last_str);
3123 data->last_start_max = is_inf
3125 : data->pos_min + data->pos_delta
3126 - CHR_SVLEN(last_str);
3128 data->longest = &(data->longest_float);
3130 SvREFCNT_dec(last_str);
3132 if (data && (fl & SF_HAS_EVAL))
3133 data->flags |= SF_HAS_EVAL;
3134 optimize_curly_tail:
3135 if (OP(oscan) != CURLYX) {
3136 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3138 NEXT_OFF(oscan) += NEXT_OFF(next);
3141 default: /* REF and CLUMP only? */
3142 if (flags & SCF_DO_SUBSTR) {
3143 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
3144 data->longest = &(data->longest_float);
3146 is_inf = is_inf_internal = 1;
3147 if (flags & SCF_DO_STCLASS_OR)
3148 cl_anything(pRExC_state, data->start_class);
3149 flags &= ~SCF_DO_STCLASS;
3153 else if (strchr((const char*)PL_simple,OP(scan))) {
3156 if (flags & SCF_DO_SUBSTR) {
3157 scan_commit(pRExC_state,data,minlenp);
3161 if (flags & SCF_DO_STCLASS) {
3162 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3164 /* Some of the logic below assumes that switching
3165 locale on will only add false positives. */
3166 switch (PL_regkind[OP(scan)]) {
3170 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3171 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3172 cl_anything(pRExC_state, data->start_class);
3175 if (OP(scan) == SANY)
3177 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3178 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3179 || (data->start_class->flags & ANYOF_CLASS));
3180 cl_anything(pRExC_state, data->start_class);
3182 if (flags & SCF_DO_STCLASS_AND || !value)
3183 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3186 if (flags & SCF_DO_STCLASS_AND)
3187 cl_and(data->start_class,
3188 (struct regnode_charclass_class*)scan);
3190 cl_or(pRExC_state, data->start_class,
3191 (struct regnode_charclass_class*)scan);
3194 if (flags & SCF_DO_STCLASS_AND) {
3195 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3196 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3197 for (value = 0; value < 256; value++)
3198 if (!isALNUM(value))
3199 ANYOF_BITMAP_CLEAR(data->start_class, value);
3203 if (data->start_class->flags & ANYOF_LOCALE)
3204 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3206 for (value = 0; value < 256; value++)
3208 ANYOF_BITMAP_SET(data->start_class, value);
3213 if (flags & SCF_DO_STCLASS_AND) {
3214 if (data->start_class->flags & ANYOF_LOCALE)
3215 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3218 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3219 data->start_class->flags |= ANYOF_LOCALE;
3223 if (flags & SCF_DO_STCLASS_AND) {
3224 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3225 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3226 for (value = 0; value < 256; value++)
3228 ANYOF_BITMAP_CLEAR(data->start_class, value);
3232 if (data->start_class->flags & ANYOF_LOCALE)
3233 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3235 for (value = 0; value < 256; value++)
3236 if (!isALNUM(value))
3237 ANYOF_BITMAP_SET(data->start_class, value);
3242 if (flags & SCF_DO_STCLASS_AND) {
3243 if (data->start_class->flags & ANYOF_LOCALE)
3244 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3247 data->start_class->flags |= ANYOF_LOCALE;
3248 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3252 if (flags & SCF_DO_STCLASS_AND) {
3253 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3254 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3255 for (value = 0; value < 256; value++)
3256 if (!isSPACE(value))
3257 ANYOF_BITMAP_CLEAR(data->start_class, value);
3261 if (data->start_class->flags & ANYOF_LOCALE)
3262 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3264 for (value = 0; value < 256; value++)
3266 ANYOF_BITMAP_SET(data->start_class, value);
3271 if (flags & SCF_DO_STCLASS_AND) {
3272 if (data->start_class->flags & ANYOF_LOCALE)
3273 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3276 data->start_class->flags |= ANYOF_LOCALE;
3277 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3281 if (flags & SCF_DO_STCLASS_AND) {
3282 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3283 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3284 for (value = 0; value < 256; value++)
3286 ANYOF_BITMAP_CLEAR(data->start_class, value);
3290 if (data->start_class->flags & ANYOF_LOCALE)
3291 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3293 for (value = 0; value < 256; value++)
3294 if (!isSPACE(value))
3295 ANYOF_BITMAP_SET(data->start_class, value);
3300 if (flags & SCF_DO_STCLASS_AND) {
3301 if (data->start_class->flags & ANYOF_LOCALE) {
3302 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3303 for (value = 0; value < 256; value++)
3304 if (!isSPACE(value))
3305 ANYOF_BITMAP_CLEAR(data->start_class, value);
3309 data->start_class->flags |= ANYOF_LOCALE;
3310 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3314 if (flags & SCF_DO_STCLASS_AND) {
3315 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3316 for (value = 0; value < 256; value++)
3317 if (!isDIGIT(value))
3318 ANYOF_BITMAP_CLEAR(data->start_class, value);
3321 if (data->start_class->flags & ANYOF_LOCALE)
3322 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3324 for (value = 0; value < 256; value++)
3326 ANYOF_BITMAP_SET(data->start_class, value);
3331 if (flags & SCF_DO_STCLASS_AND) {
3332 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3333 for (value = 0; value < 256; value++)
3335 ANYOF_BITMAP_CLEAR(data->start_class, value);
3338 if (data->start_class->flags & ANYOF_LOCALE)
3339 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3341 for (value = 0; value < 256; value++)
3342 if (!isDIGIT(value))
3343 ANYOF_BITMAP_SET(data->start_class, value);
3348 if (flags & SCF_DO_STCLASS_OR)
3349 cl_and(data->start_class, and_withp);
3350 flags &= ~SCF_DO_STCLASS;
3353 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3354 data->flags |= (OP(scan) == MEOL
3358 else if ( PL_regkind[OP(scan)] == BRANCHJ
3359 /* Lookbehind, or need to calculate parens/evals/stclass: */
3360 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3361 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3362 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3363 || OP(scan) == UNLESSM )
3365 /* Negative Lookahead/lookbehind
3366 In this case we can't do fixed string optimisation.
3369 I32 deltanext, minnext, fake = 0;
3371 struct regnode_charclass_class intrnl;
3374 data_fake.flags = 0;
3376 data_fake.whilem_c = data->whilem_c;
3377 data_fake.last_closep = data->last_closep;
3380 data_fake.last_closep = &fake;
3381 if ( flags & SCF_DO_STCLASS && !scan->flags
3382 && OP(scan) == IFMATCH ) { /* Lookahead */
3383 cl_init(pRExC_state, &intrnl);
3384 data_fake.start_class = &intrnl;
3385 f |= SCF_DO_STCLASS_AND;
3387 if (flags & SCF_WHILEM_VISITED_POS)
3388 f |= SCF_WHILEM_VISITED_POS;
3389 next = regnext(scan);
3390 nscan = NEXTOPER(NEXTOPER(scan));
3391 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3392 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3395 vFAIL("Variable length lookbehind not implemented");
3397 else if (minnext > (I32)U8_MAX) {
3398 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3400 scan->flags = (U8)minnext;
3403 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3405 if (data_fake.flags & SF_HAS_EVAL)
3406 data->flags |= SF_HAS_EVAL;
3407 data->whilem_c = data_fake.whilem_c;
3409 if (f & SCF_DO_STCLASS_AND) {
3410 const int was = (data->start_class->flags & ANYOF_EOS);
3412 cl_and(data->start_class, &intrnl);
3414 data->start_class->flags |= ANYOF_EOS;
3417 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3419 /* Positive Lookahead/lookbehind
3420 In this case we can do fixed string optimisation,
3421 but we must be careful about it. Note in the case of
3422 lookbehind the positions will be offset by the minimum
3423 length of the pattern, something we won't know about
3424 until after the recurse.
3426 I32 deltanext, fake = 0;
3428 struct regnode_charclass_class intrnl;
3430 /* We use SAVEFREEPV so that when the full compile
3431 is finished perl will clean up the allocated
3432 minlens when its all done. This was we don't
3433 have to worry about freeing them when we know
3434 they wont be used, which would be a pain.
3437 Newx( minnextp, 1, I32 );
3438 SAVEFREEPV(minnextp);
3441 StructCopy(data, &data_fake, scan_data_t);
3442 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3445 scan_commit(pRExC_state, &data_fake,minlenp);
3446 data_fake.last_found=newSVsv(data->last_found);
3450 data_fake.last_closep = &fake;
3451 data_fake.flags = 0;
3453 data_fake.flags |= SF_IS_INF;
3454 if ( flags & SCF_DO_STCLASS && !scan->flags
3455 && OP(scan) == IFMATCH ) { /* Lookahead */
3456 cl_init(pRExC_state, &intrnl);
3457 data_fake.start_class = &intrnl;
3458 f |= SCF_DO_STCLASS_AND;
3460 if (flags & SCF_WHILEM_VISITED_POS)
3461 f |= SCF_WHILEM_VISITED_POS;
3462 next = regnext(scan);
3463 nscan = NEXTOPER(NEXTOPER(scan));
3465 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3466 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3469 vFAIL("Variable length lookbehind not implemented");
3471 else if (*minnextp > (I32)U8_MAX) {
3472 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3474 scan->flags = (U8)*minnextp;
3479 if (f & SCF_DO_STCLASS_AND) {
3480 const int was = (data->start_class->flags & ANYOF_EOS);
3482 cl_and(data->start_class, &intrnl);
3484 data->start_class->flags |= ANYOF_EOS;
3487 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3489 if (data_fake.flags & SF_HAS_EVAL)
3490 data->flags |= SF_HAS_EVAL;
3491 data->whilem_c = data_fake.whilem_c;
3492 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3493 if (RExC_rx->minlen<*minnextp)
3494 RExC_rx->minlen=*minnextp;
3495 scan_commit(pRExC_state, &data_fake, minnextp);
3496 SvREFCNT_dec(data_fake.last_found);
3498 if ( data_fake.minlen_fixed != minlenp )
3500 data->offset_fixed= data_fake.offset_fixed;
3501 data->minlen_fixed= data_fake.minlen_fixed;
3502 data->lookbehind_fixed+= scan->flags;
3504 if ( data_fake.minlen_float != minlenp )
3506 data->minlen_float= data_fake.minlen_float;
3507 data->offset_float_min=data_fake.offset_float_min;
3508 data->offset_float_max=data_fake.offset_float_max;
3509 data->lookbehind_float+= scan->flags;
3518 else if (OP(scan) == OPEN) {
3519 if (stopparen != (I32)ARG(scan))
3522 else if (OP(scan) == CLOSE) {
3523 if (stopparen == (I32)ARG(scan)) {
3526 if ((I32)ARG(scan) == is_par) {
3527 next = regnext(scan);
3529 if ( next && (OP(next) != WHILEM) && next < last)
3530 is_par = 0; /* Disable optimization */
3533 *(data->last_closep) = ARG(scan);
3535 else if (OP(scan) == GOSUB || OP(scan) == GOSTART) {
3536 /* set the pointer */
3540 if (OP(scan) == GOSUB) {
3542 RExC_recurse[ARG2L(scan)] = scan;
3543 start = RExC_open_parens[paren-1];
3544 end = RExC_close_parens[paren-1];
3547 start = RExC_rx->program + 1;
3553 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3554 SAVEFREEPV(recursed);
3556 if (!PAREN_TEST(recursed,paren+1)) {
3558 PAREN_SET(recursed,paren+1);
3560 DEBUG_PEEP("goto",start,depth);
3573 if (deltanext == I32_MAX) {
3574 is_inf = is_inf_internal = 1;
3577 DEBUG_PEEP("rtrn",end,depth);
3578 PAREN_UNSET(recursed,paren+1);
3580 if (flags & SCF_DO_SUBSTR) {
3581 scan_commit(pRExC_state,data,minlenp);
3582 data->longest = &(data->longest_float);
3584 is_inf = is_inf_internal = 1;
3585 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3586 cl_anything(pRExC_state, data->start_class);
3587 flags &= ~SCF_DO_STCLASS;
3590 else if (OP(scan) == EVAL) {
3592 data->flags |= SF_HAS_EVAL;
3594 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3595 if (flags & SCF_DO_SUBSTR) {
3596 scan_commit(pRExC_state,data,minlenp);
3597 flags &= ~SCF_DO_SUBSTR;
3599 if (data && OP(scan)==ACCEPT) {
3600 data->flags |= SCF_SEEN_ACCEPT;
3605 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3607 if (flags & SCF_DO_SUBSTR) {
3608 scan_commit(pRExC_state,data,minlenp);
3609 data->longest = &(data->longest_float);
3611 is_inf = is_inf_internal = 1;
3612 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3613 cl_anything(pRExC_state, data->start_class);
3614 flags &= ~SCF_DO_STCLASS;
3616 #ifdef TRIE_STUDY_OPT
3617 #ifdef FULL_TRIE_STUDY
3618 else if (PL_regkind[OP(scan)] == TRIE) {
3619 /* NOTE - There is similar code to this block above for handling
3620 BRANCH nodes on the initial study. If you change stuff here
3622 regnode *trie_node= scan;
3623 regnode *tail= regnext(scan);
3624 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3625 I32 max1 = 0, min1 = I32_MAX;
3626 struct regnode_charclass_class accum;
3628 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
3629 scan_commit(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
3630 if (flags & SCF_DO_STCLASS)
3631 cl_init_zero(pRExC_state, &accum);
3637 const regnode *nextbranch= NULL;
3640 for ( word=1 ; word <= trie->wordcount ; word++)
3642 I32 deltanext=0, minnext=0, f = 0, fake;
3643 struct regnode_charclass_class this_class;
3645 data_fake.flags = 0;
3647 data_fake.whilem_c = data->whilem_c;
3648 data_fake.last_closep = data->last_closep;
3651 data_fake.last_closep = &fake;
3653 if (flags & SCF_DO_STCLASS) {
3654 cl_init(pRExC_state, &this_class);
3655 data_fake.start_class = &this_class;
3656 f = SCF_DO_STCLASS_AND;
3658 if (flags & SCF_WHILEM_VISITED_POS)
3659 f |= SCF_WHILEM_VISITED_POS;
3661 if (trie->jump[word]) {
3663 nextbranch = trie_node + trie->jump[0];
3664 scan= trie_node + trie->jump[word];
3665 /* We go from the jump point to the branch that follows
3666 it. Note this means we need the vestigal unused branches
3667 even though they arent otherwise used.
3669 minnext = study_chunk(pRExC_state, &scan, minlenp,
3670 &deltanext, (regnode *)nextbranch, &data_fake,
3671 stopparen, recursed, NULL, f,depth+1);
3673 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3674 nextbranch= regnext((regnode*)nextbranch);
3676 if (min1 > (I32)(minnext + trie->minlen))
3677 min1 = minnext + trie->minlen;
3678 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3679 max1 = minnext + deltanext + trie->maxlen;
3680 if (deltanext == I32_MAX)
3681 is_inf = is_inf_internal = 1;
3683 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3685 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3686 if ( stopmin > min + min1)
3687 stopmin = min + min1;
3688 flags &= ~SCF_DO_SUBSTR;
3690 data->flags |= SCF_SEEN_ACCEPT;
3693 if (data_fake.flags & SF_HAS_EVAL)
3694 data->flags |= SF_HAS_EVAL;
3695 data->whilem_c = data_fake.whilem_c;
3697 if (flags & SCF_DO_STCLASS)
3698 cl_or(pRExC_state, &accum, &this_class);
3701 if (flags & SCF_DO_SUBSTR) {
3702 data->pos_min += min1;
3703 data->pos_delta += max1 - min1;
3704 if (max1 != min1 || is_inf)
3705 data->longest = &(data->longest_float);
3708 delta += max1 - min1;
3709 if (flags & SCF_DO_STCLASS_OR) {
3710 cl_or(pRExC_state, data->start_class, &accum);
3712 cl_and(data->start_class, and_withp);
3713 flags &= ~SCF_DO_STCLASS;
3716 else if (flags & SCF_DO_STCLASS_AND) {
3718 cl_and(data->start_class, &accum);
3719 flags &= ~SCF_DO_STCLASS;
3722 /* Switch to OR mode: cache the old value of
3723 * data->start_class */
3725 StructCopy(data->start_class, and_withp,
3726 struct regnode_charclass_class);
3727 flags &= ~SCF_DO_STCLASS_AND;
3728 StructCopy(&accum, data->start_class,
3729 struct regnode_charclass_class);
3730 flags |= SCF_DO_STCLASS_OR;
3731 data->start_class->flags |= ANYOF_EOS;
3738 else if (PL_regkind[OP(scan)] == TRIE) {
3739 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3742 min += trie->minlen;
3743 delta += (trie->maxlen - trie->minlen);
3744 flags &= ~SCF_DO_STCLASS; /* xxx */
3745 if (flags & SCF_DO_SUBSTR) {
3746 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
3747 data->pos_min += trie->minlen;
3748 data->pos_delta += (trie->maxlen - trie->minlen);
3749 if (trie->maxlen != trie->minlen)
3750 data->longest = &(data->longest_float);
3752 if (trie->jump) /* no more substrings -- for now /grr*/
3753 flags &= ~SCF_DO_SUBSTR;
3755 #endif /* old or new */
3756 #endif /* TRIE_STUDY_OPT */
3757 /* Else: zero-length, ignore. */
3758 scan = regnext(scan);
3763 *deltap = is_inf_internal ? I32_MAX : delta;
3764 if (flags & SCF_DO_SUBSTR && is_inf)
3765 data->pos_delta = I32_MAX - data->pos_min;
3766 if (is_par > (I32)U8_MAX)
3768 if (is_par && pars==1 && data) {
3769 data->flags |= SF_IN_PAR;
3770 data->flags &= ~SF_HAS_PAR;
3772 else if (pars && data) {
3773 data->flags |= SF_HAS_PAR;
3774 data->flags &= ~SF_IN_PAR;
3776 if (flags & SCF_DO_STCLASS_OR)
3777 cl_and(data->start_class, and_withp);
3778 if (flags & SCF_TRIE_RESTUDY)
3779 data->flags |= SCF_TRIE_RESTUDY;
3781 DEBUG_STUDYDATA(data,depth);
3783 return min < stopmin ? min : stopmin;
3787 S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
3789 if (RExC_rx->data) {
3790 const U32 count = RExC_rx->data->count;
3791 Renewc(RExC_rx->data,
3792 sizeof(*RExC_rx->data) + sizeof(void*) * (count + n - 1),
3793 char, struct reg_data);
3794 Renew(RExC_rx->data->what, count + n, U8);
3795 RExC_rx->data->count += n;
3798 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
3799 char, struct reg_data);
3800 Newx(RExC_rx->data->what, n, U8);
3801 RExC_rx->data->count = n;
3803 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
3804 return RExC_rx->data->count - n;
3807 #ifndef PERL_IN_XSUB_RE
3809 Perl_reginitcolors(pTHX)
3812 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
3814 char *t = savepv(s);
3818 t = strchr(t, '\t');
3824 PL_colors[i] = t = (char *)"";
3829 PL_colors[i++] = (char *)"";
3836 #ifdef TRIE_STUDY_OPT
3837 #define CHECK_RESTUDY_GOTO \
3839 (data.flags & SCF_TRIE_RESTUDY) \
3843 #define CHECK_RESTUDY_GOTO
3847 - pregcomp - compile a regular expression into internal code
3849 * We can't allocate space until we know how big the compiled form will be,
3850 * but we can't compile it (and thus know how big it is) until we've got a
3851 * place to put the code. So we cheat: we compile it twice, once with code
3852 * generation turned off and size counting turned on, and once "for real".
3853 * This also means that we don't allocate space until we are sure that the
3854 * thing really will compile successfully, and we never have to move the
3855 * code and thus invalidate pointers into it. (Note that it has to be in
3856 * one piece because free() must be able to free it all.) [NB: not true in perl]
3858 * Beware that the optimization-preparation code in here knows about some
3859 * of the structure of the compiled regexp. [I'll say.]
3864 #ifndef PERL_IN_XSUB_RE
3865 #define RE_ENGINE_PTR &PL_core_reg_engine
3867 extern const struct regexp_engine my_reg_engine;
3868 #define RE_ENGINE_PTR &my_reg_engine
3870 /* these make a few things look better, to avoid indentation */
3871 #define BEGIN_BLOCK {
3875 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
3878 GET_RE_DEBUG_FLAGS_DECL;
3879 DEBUG_r(if (!PL_colorset) reginitcolors());
3880 #ifndef PERL_IN_XSUB_RE
3882 /* Dispatch a request to compile a regexp to correct
3884 HV * const table = GvHV(PL_hintgv);
3886 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
3887 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
3888 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
3890 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
3893 return CALLREGCOMP_ENG(eng, exp, xend, pm);
3907 RExC_state_t RExC_state;
3908 RExC_state_t * const pRExC_state = &RExC_state;
3909 #ifdef TRIE_STUDY_OPT
3911 RExC_state_t copyRExC_state;
3914 FAIL("NULL regexp argument");
3916 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
3920 SV *dsv= sv_newmortal();
3921 RE_PV_QUOTED_DECL(s, RExC_utf8,
3922 dsv, RExC_precomp, (xend - exp), 60);
3923 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
3924 PL_colors[4],PL_colors[5],s);
3926 RExC_flags = pm->op_pmflags;
3930 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
3931 RExC_seen_evals = 0;
3934 /* First pass: determine size, legality. */
3942 RExC_emit = &PL_regdummy;
3943 RExC_whilem_seen = 0;
3944 RExC_charnames = NULL;
3945 RExC_open_parens = NULL;
3946 RExC_close_parens = NULL;
3948 RExC_paren_names = NULL;
3949 RExC_recurse = NULL;
3950 RExC_recurse_count = 0;
3952 #if 0 /* REGC() is (currently) a NOP at the first pass.
3953 * Clever compilers notice this and complain. --jhi */
3954 REGC((U8)REG_MAGIC, (char*)RExC_emit);
3956 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
3957 if (reg(pRExC_state, 0, &flags,1) == NULL) {
3958 RExC_precomp = NULL;
3962 PerlIO_printf(Perl_debug_log,
3963 "Required size %"IVdf" nodes\n"
3964 "Starting second pass (creation)\n",
3967 RExC_lastparse=NULL;
3969 /* Small enough for pointer-storage convention?
3970 If extralen==0, this means that we will not need long jumps. */
3971 if (RExC_size >= 0x10000L && RExC_extralen)
3972 RExC_size += RExC_extralen;
3975 if (RExC_whilem_seen > 15)
3976 RExC_whilem_seen = 15;
3979 /* Make room for a sentinel value at the end of the program */
3983 /* Allocate space and zero-initialize. Note, the two step process
3984 of zeroing when in debug mode, thus anything assigned has to
3985 happen after that */
3986 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
3989 FAIL("Regexp out of space");
3991 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
3992 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
3994 /* initialization begins here */
3995 r->engine= RE_ENGINE_PTR;
3997 r->prelen = xend - exp;
3998 r->precomp = savepvn(RExC_precomp, r->prelen);
4000 #ifdef PERL_OLD_COPY_ON_WRITE
4001 r->saved_copy = NULL;
4003 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
4004 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4005 r->lastparen = 0; /* mg.c reads this. */
4007 r->substrs = 0; /* Useful during FAIL. */
4008 r->startp = 0; /* Useful during FAIL. */
4012 if (RExC_seen & REG_SEEN_RECURSE) {
4013 Newxz(RExC_open_parens, RExC_npar,regnode *);
4014 SAVEFREEPV(RExC_open_parens);
4015 Newxz(RExC_close_parens,RExC_npar,regnode *);
4016 SAVEFREEPV(RExC_close_parens);
4019 /* Useful during FAIL. */
4020 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4022 r->offsets[0] = RExC_size;
4024 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
4025 "%s %"UVuf" bytes for offset annotations.\n",
4026 r->offsets ? "Got" : "Couldn't get",
4027 (UV)((2*RExC_size+1) * sizeof(U32))));
4031 /* Second pass: emit code. */
4032 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
4037 RExC_emit_start = r->program;
4038 RExC_emit = r->program;
4040 /* put a sentinal on the end of the program so we can check for
4042 r->program[RExC_size].type = 255;
4044 /* Store the count of eval-groups for security checks: */
4045 RExC_emit->next_off = (RExC_seen_evals > (I32)U16_MAX) ? U16_MAX : (U16)RExC_seen_evals;
4046 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
4048 if (reg(pRExC_state, 0, &flags,1) == NULL)
4051 /* XXXX To minimize changes to RE engine we always allocate
4052 3-units-long substrs field. */
4053 Newx(r->substrs, 1, struct reg_substr_data);
4054 if (RExC_recurse_count) {
4055 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4056 SAVEFREEPV(RExC_recurse);
4060 r->minlen = minlen = sawplus = sawopen = 0;
4061 Zero(r->substrs, 1, struct reg_substr_data);
4063 #ifdef TRIE_STUDY_OPT
4065 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
4066 RExC_state=copyRExC_state;
4067 if (data.last_found) {
4068 SvREFCNT_dec(data.longest_fixed);
4069 SvREFCNT_dec(data.longest_float);
4070 SvREFCNT_dec(data.last_found);
4072 StructCopy(&zero_scan_data, &data, scan_data_t);
4074 StructCopy(&zero_scan_data, &data, scan_data_t);
4075 copyRExC_state=RExC_state;
4078 StructCopy(&zero_scan_data, &data, scan_data_t);
4081 /* Dig out information for optimizations. */
4082 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
4083 pm->op_pmflags = RExC_flags;
4085 r->reganch |= ROPT_UTF8; /* Unicode in it? */
4086 r->regstclass = NULL;
4087 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
4088 r->reganch |= ROPT_NAUGHTY;
4089 scan = r->program + 1; /* First BRANCH. */
4091 /* testing for BRANCH here tells us whether there is "must appear"
4092 data in the pattern. If there is then we can use it for optimisations */
4093 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
4095 STRLEN longest_float_length, longest_fixed_length;
4096 struct regnode_charclass_class ch_class; /* pointed to by data */
4098 I32 last_close = 0; /* pointed to by data */
4101 /* Skip introductions and multiplicators >= 1. */
4102 while ((OP(first) == OPEN && (sawopen = 1)) ||
4103 /* An OR of *one* alternative - should not happen now. */
4104 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
4105 /* for now we can't handle lookbehind IFMATCH*/
4106 (OP(first) == IFMATCH && !first->flags) ||
4107 (OP(first) == PLUS) ||
4108 (OP(first) == MINMOD) ||
4109 /* An {n,m} with n>0 */
4110 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) )
4113 if (OP(first) == PLUS)
4116 first += regarglen[OP(first)];
4117 if (OP(first) == IFMATCH) {
4118 first = NEXTOPER(first);
4119 first += EXTRA_STEP_2ARGS;
4120 } else /* XXX possible optimisation for /(?=)/ */
4121 first = NEXTOPER(first);
4124 /* Starting-point info. */
4126 DEBUG_PEEP("first:",first,0);
4127 /* Ignore EXACT as we deal with it later. */
4128 if (PL_regkind[OP(first)] == EXACT) {
4129 if (OP(first) == EXACT)
4130 NOOP; /* Empty, get anchored substr later. */
4131 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
4132 r->regstclass = first;
4135 else if (PL_regkind[OP(first)] == TRIE &&
4136 ((reg_trie_data *)r->data->data[ ARG(first) ])->minlen>0)
4139 /* this can happen only on restudy */
4140 if ( OP(first) == TRIE ) {
4141 struct regnode_1 *trieop;
4142 Newxz(trieop,1,struct regnode_1);
4143 StructCopy(first,trieop,struct regnode_1);
4144 trie_op=(regnode *)trieop;
4146 struct regnode_charclass *trieop;
4147 Newxz(trieop,1,struct regnode_charclass);
4148 StructCopy(first,trieop,struct regnode_charclass);
4149 trie_op=(regnode *)trieop;
4152 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4153 r->regstclass = trie_op;
4156 else if (strchr((const char*)PL_simple,OP(first)))
4157 r->regstclass = first;
4158 else if (PL_regkind[OP(first)] == BOUND ||
4159 PL_regkind[OP(first)] == NBOUND)
4160 r->regstclass = first;
4161 else if (PL_regkind[OP(first)] == BOL) {
4162 r->reganch |= (OP(first) == MBOL
4164 : (OP(first) == SBOL
4167 first = NEXTOPER(first);
4170 else if (OP(first) == GPOS) {
4171 r->reganch |= ROPT_ANCH_GPOS;
4172 first = NEXTOPER(first);
4175 else if (!sawopen && (OP(first) == STAR &&
4176 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
4177 !(r->reganch & ROPT_ANCH) )
4179 /* turn .* into ^.* with an implied $*=1 */
4181 (OP(NEXTOPER(first)) == REG_ANY)
4184 r->reganch |= type | ROPT_IMPLICIT;
4185 first = NEXTOPER(first);
4188 if (sawplus && (!sawopen || !RExC_sawback)
4189 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
4190 /* x+ must match at the 1st pos of run of x's */
4191 r->reganch |= ROPT_SKIP;
4193 /* Scan is after the zeroth branch, first is atomic matcher. */
4194 #ifdef TRIE_STUDY_OPT
4197 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4198 (IV)(first - scan + 1))
4202 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4203 (IV)(first - scan + 1))
4209 * If there's something expensive in the r.e., find the
4210 * longest literal string that must appear and make it the
4211 * regmust. Resolve ties in favor of later strings, since
4212 * the regstart check works with the beginning of the r.e.
4213 * and avoiding duplication strengthens checking. Not a
4214 * strong reason, but sufficient in the absence of others.
4215 * [Now we resolve ties in favor of the earlier string if
4216 * it happens that c_offset_min has been invalidated, since the
4217 * earlier string may buy us something the later one won't.]
4220 data.longest_fixed = newSVpvs("");
4221 data.longest_float = newSVpvs("");
4222 data.last_found = newSVpvs("");
4223 data.longest = &(data.longest_fixed);
4225 if (!r->regstclass) {
4226 cl_init(pRExC_state, &ch_class);
4227 data.start_class = &ch_class;
4228 stclass_flag = SCF_DO_STCLASS_AND;
4229 } else /* XXXX Check for BOUND? */
4231 data.last_closep = &last_close;
4233 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
4234 &data, -1, NULL, NULL,
4235 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
4241 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
4242 && data.last_start_min == 0 && data.last_end > 0
4243 && !RExC_seen_zerolen
4244 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
4245 r->reganch |= ROPT_CHECK_ALL;
4246 scan_commit(pRExC_state, &data,&minlen);
4247 SvREFCNT_dec(data.last_found);
4249 /* Note that code very similar to this but for anchored string
4250 follows immediately below, changes may need to be made to both.
4253 longest_float_length = CHR_SVLEN(data.longest_float);
4254 if (longest_float_length
4255 || (data.flags & SF_FL_BEFORE_EOL
4256 && (!(data.flags & SF_FL_BEFORE_MEOL)
4257 || (RExC_flags & PMf_MULTILINE))))
4261 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
4262 && data.offset_fixed == data.offset_float_min
4263 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4264 goto remove_float; /* As in (a)+. */
4266 /* copy the information about the longest float from the reg_scan_data
4267 over to the program. */
4268 if (SvUTF8(data.longest_float)) {
4269 r->float_utf8 = data.longest_float;
4270 r->float_substr = NULL;
4272 r->float_substr = data.longest_float;
4273 r->float_utf8 = NULL;
4275 /* float_end_shift is how many chars that must be matched that
4276 follow this item. We calculate it ahead of time as once the
4277 lookbehind offset is added in we lose the ability to correctly
4279 ml = data.minlen_float ? *(data.minlen_float)
4280 : (I32)longest_float_length;
4281 r->float_end_shift = ml - data.offset_float_min
4282 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4283 + data.lookbehind_float;
4284 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
4285 r->float_max_offset = data.offset_float_max;
4286 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
4287 r->float_max_offset -= data.lookbehind_float;
4289 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4290 && (!(data.flags & SF_FL_BEFORE_MEOL)
4291 || (RExC_flags & PMf_MULTILINE)));
4292 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
4296 r->float_substr = r->float_utf8 = NULL;
4297 SvREFCNT_dec(data.longest_float);
4298 longest_float_length = 0;
4301 /* Note that code very similar to this but for floating string
4302 is immediately above, changes may need to be made to both.
4305 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
4306 if (longest_fixed_length
4307 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4308 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4309 || (RExC_flags & PMf_MULTILINE))))
4313 /* copy the information about the longest fixed
4314 from the reg_scan_data over to the program. */
4315 if (SvUTF8(data.longest_fixed)) {
4316 r->anchored_utf8 = data.longest_fixed;
4317 r->anchored_substr = NULL;
4319 r->anchored_substr = data.longest_fixed;
4320 r->anchored_utf8 = NULL;
4322 /* fixed_end_shift is how many chars that must be matched that
4323 follow this item. We calculate it ahead of time as once the
4324 lookbehind offset is added in we lose the ability to correctly
4326 ml = data.minlen_fixed ? *(data.minlen_fixed)
4327 : (I32)longest_fixed_length;
4328 r->anchored_end_shift = ml - data.offset_fixed
4329 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4330 + data.lookbehind_fixed;
4331 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4333 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4334 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4335 || (RExC_flags & PMf_MULTILINE)));
4336 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
4339 r->anchored_substr = r->anchored_utf8 = NULL;
4340 SvREFCNT_dec(data.longest_fixed);
4341 longest_fixed_length = 0;
4344 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
4345 r->regstclass = NULL;
4346 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4348 && !(data.start_class->flags & ANYOF_EOS)
4349 && !cl_is_anything(data.start_class))
4351 const I32 n = add_data(pRExC_state, 1, "f");
4353 Newx(RExC_rx->data->data[n], 1,
4354 struct regnode_charclass_class);
4355 StructCopy(data.start_class,
4356 (struct regnode_charclass_class*)RExC_rx->data->data[n],
4357 struct regnode_charclass_class);
4358 r->regstclass = (regnode*)RExC_rx->data->data[n];
4359 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
4360 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
4361 regprop(r, sv, (regnode*)data.start_class);
4362 PerlIO_printf(Perl_debug_log,
4363 "synthetic stclass \"%s\".\n",
4364 SvPVX_const(sv));});
4367 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
4368 if (longest_fixed_length > longest_float_length) {
4369 r->check_end_shift = r->anchored_end_shift;
4370 r->check_substr = r->anchored_substr;
4371 r->check_utf8 = r->anchored_utf8;
4372 r->check_offset_min = r->check_offset_max = r->anchored_offset;
4373 if (r->reganch & ROPT_ANCH_SINGLE)
4374 r->reganch |= ROPT_NOSCAN;
4377 r->check_end_shift = r->float_end_shift;
4378 r->check_substr = r->float_substr;
4379 r->check_utf8 = r->float_utf8;
4380 r->check_offset_min = r->float_min_offset;
4381 r->check_offset_max = r->float_max_offset;
4383 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4384 This should be changed ASAP! */
4385 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
4386 r->reganch |= RE_USE_INTUIT;
4387 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
4388 r->reganch |= RE_INTUIT_TAIL;
4390 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4391 if ( (STRLEN)minlen < longest_float_length )
4392 minlen= longest_float_length;
4393 if ( (STRLEN)minlen < longest_fixed_length )
4394 minlen= longest_fixed_length;
4398 /* Several toplevels. Best we can is to set minlen. */
4400 struct regnode_charclass_class ch_class;
4403 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
4405 scan = r->program + 1;
4406 cl_init(pRExC_state, &ch_class);
4407 data.start_class = &ch_class;
4408 data.last_closep = &last_close;
4411 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
4412 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
4416 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
4417 = r->float_substr = r->float_utf8 = NULL;
4418 if (!(data.start_class->flags & ANYOF_EOS)
4419 && !cl_is_anything(data.start_class))
4421 const I32 n = add_data(pRExC_state, 1, "f");
4423 Newx(RExC_rx->data->data[n], 1,
4424 struct regnode_charclass_class);
4425 StructCopy(data.start_class,
4426 (struct regnode_charclass_class*)RExC_rx->data->data[n],
4427 struct regnode_charclass_class);
4428 r->regstclass = (regnode*)RExC_rx->data->data[n];
4429 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
4430 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
4431 regprop(r, sv, (regnode*)data.start_class);
4432 PerlIO_printf(Perl_debug_log,
4433 "synthetic stclass \"%s\".\n",
4434 SvPVX_const(sv));});
4438 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4439 the "real" pattern. */
4441 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
4444 r->minlenret = minlen;
4445 if (r->minlen < minlen)
4448 if (RExC_seen & REG_SEEN_GPOS)
4449 r->reganch |= ROPT_GPOS_SEEN;
4450 if (RExC_seen & REG_SEEN_LOOKBEHIND)
4451 r->reganch |= ROPT_LOOKBEHIND_SEEN;
4452 if (RExC_seen & REG_SEEN_EVAL)
4453 r->reganch |= ROPT_EVAL_SEEN;
4454 if (RExC_seen & REG_SEEN_CANY)
4455 r->reganch |= ROPT_CANY_SEEN;
4456 if (RExC_seen & REG_SEEN_VERBARG)
4457 r->reganch |= ROPT_VERBARG_SEEN;
4458 if (RExC_paren_names)
4459 r->paren_names = (HV*)SvREFCNT_inc(RExC_paren_names);
4461 r->paren_names = NULL;
4463 if (RExC_recurse_count) {
4464 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
4465 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
4466 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
4469 Newxz(r->startp, RExC_npar, I32);
4470 Newxz(r->endp, RExC_npar, I32);
4472 DEBUG_r( RX_DEBUG_on(r) );
4474 PerlIO_printf(Perl_debug_log,"Final program:\n");
4477 DEBUG_OFFSETS_r(if (r->offsets) {
4478 const U32 len = r->offsets[0];
4480 GET_RE_DEBUG_FLAGS_DECL;
4481 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
4482 for (i = 1; i <= len; i++) {
4483 if (r->offsets[i*2-1] || r->offsets[i*2])
4484 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
4485 (UV)i, (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
4487 PerlIO_printf(Perl_debug_log, "\n");
4493 #undef CORE_ONLY_BLOCK
4495 #undef RE_ENGINE_PTR
4497 #ifndef PERL_IN_XSUB_RE
4499 Perl_reg_named_buff_sv(pTHX_ SV* namesv)
4501 I32 parno = 0; /* no match */
4503 const REGEXP * const rx = PM_GETRE(PL_curpm);
4504 if (rx && rx->paren_names) {
4505 HE *he_str = hv_fetch_ent( rx->paren_names, namesv, 0, 0 );
4508 SV* sv_dat=HeVAL(he_str);
4509 I32 *nums=(I32*)SvPVX(sv_dat);
4510 for ( i=0; i<SvIVX(sv_dat); i++ ) {
4511 if ((I32)(rx->lastparen) >= nums[i] &&
4512 rx->endp[nums[i]] != -1)
4525 SV *sv= sv_newmortal();
4526 Perl_sv_setpvf(aTHX_ sv, "%"IVdf,(IV)parno);
4527 gv_paren= Perl_gv_fetchsv(aTHX_ sv, GV_ADD, SVt_PVGV);
4528 return GvSVn(gv_paren);
4533 /* Scans the name of a named buffer from the pattern.
4534 * If flags is REG_RSN_RETURN_NULL returns null.
4535 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
4536 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
4537 * to the parsed name as looked up in the RExC_paren_names hash.
4538 * If there is an error throws a vFAIL().. type exception.
4541 #define REG_RSN_RETURN_NULL 0
4542 #define REG_RSN_RETURN_NAME 1
4543 #define REG_RSN_RETURN_DATA 2
4546 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) {
4547 char *name_start = RExC_parse;
4550 while( isIDFIRST_uni(utf8n_to_uvchr((U8*)RExC_parse,
4551 RExC_end - RExC_parse, &numlen, UTF8_ALLOW_DEFAULT)))
4553 RExC_parse += numlen;
4556 while( isIDFIRST(*RExC_parse) )
4560 SV* sv_name = sv_2mortal(Perl_newSVpvn(aTHX_ name_start,
4561 (int)(RExC_parse - name_start)));
4564 if ( flags == REG_RSN_RETURN_NAME)
4566 else if (flags==REG_RSN_RETURN_DATA) {
4569 if ( ! sv_name ) /* should not happen*/
4570 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
4571 if (RExC_paren_names)
4572 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
4574 sv_dat = HeVAL(he_str);
4576 vFAIL("Reference to nonexistent named group");
4580 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
4587 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
4588 int rem=(int)(RExC_end - RExC_parse); \
4597 if (RExC_lastparse!=RExC_parse) \
4598 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
4601 iscut ? "..." : "<" \
4604 PerlIO_printf(Perl_debug_log,"%16s",""); \
4609 num=REG_NODE_NUM(RExC_emit); \
4610 if (RExC_lastnum!=num) \
4611 PerlIO_printf(Perl_debug_log,"|%4d",num); \
4613 PerlIO_printf(Perl_debug_log,"|%4s",""); \
4614 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
4615 (int)((depth*2)), "", \
4619 RExC_lastparse=RExC_parse; \
4624 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
4625 DEBUG_PARSE_MSG((funcname)); \
4626 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
4628 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
4629 DEBUG_PARSE_MSG((funcname)); \
4630 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
4633 - reg - regular expression, i.e. main body or parenthesized thing
4635 * Caller must absorb opening parenthesis.
4637 * Combining parenthesis handling with the base level of regular expression
4638 * is a trifle forced, but the need to tie the tails of the branches to what
4639 * follows makes it hard to avoid.
4641 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
4643 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
4645 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
4648 /* this idea is borrowed from STR_WITH_LEN in handy.h */
4649 #define CHECK_WORD(s,v,l) \
4650 (((sizeof(s)-1)==(l)) && (strnEQ(start_verb, (s ""), (sizeof(s)-1))))
4653 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
4654 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
4657 register regnode *ret; /* Will be the head of the group. */
4658 register regnode *br;
4659 register regnode *lastbr;
4660 register regnode *ender = NULL;
4661 register I32 parno = 0;
4663 const I32 oregflags = RExC_flags;
4664 bool have_branch = 0;
4667 /* for (?g), (?gc), and (?o) warnings; warning
4668 about (?c) will warn about (?g) -- japhy */
4670 #define WASTED_O 0x01
4671 #define WASTED_G 0x02
4672 #define WASTED_C 0x04
4673 #define WASTED_GC (0x02|0x04)
4674 I32 wastedflags = 0x00;
4676 char * parse_start = RExC_parse; /* MJD */
4677 char * const oregcomp_parse = RExC_parse;
4679 GET_RE_DEBUG_FLAGS_DECL;
4680 DEBUG_PARSE("reg ");
4683 *flagp = 0; /* Tentatively. */
4686 /* Make an OPEN node, if parenthesized. */
4688 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
4689 char *start_verb = RExC_parse;
4690 STRLEN verb_len = 0;
4691 char *start_arg = NULL;
4692 unsigned char op = 0;
4694 int internal_argval = 0; /* internal_argval is only useful if !argok */
4695 while ( *RExC_parse && *RExC_parse != ')' ) {
4696 if ( *RExC_parse == ':' ) {
4697 start_arg = RExC_parse + 1;
4703 verb_len = RExC_parse - start_verb;
4706 while ( *RExC_parse && *RExC_parse != ')' )
4708 if ( *RExC_parse != ')' )
4709 vFAIL("Unterminated verb pattern argument");
4710 if ( RExC_parse == start_arg )
4713 if ( *RExC_parse != ')' )
4714 vFAIL("Unterminated verb pattern");
4716 switch ( *start_verb ) {
4717 case 'A': /* (*ACCEPT) */
4718 if ( CHECK_WORD("ACCEPT",start_verb,verb_len) ) {
4720 internal_argval = RExC_nestroot;
4723 case 'C': /* (*COMMIT) */
4724 if ( CHECK_WORD("COMMIT",start_verb,verb_len) )
4726 else if ( CHECK_WORD("CUT",start_verb,verb_len) )
4729 case 'F': /* (*FAIL) */
4730 if ( verb_len==1 || CHECK_WORD("FAIL",start_verb,verb_len) ) {
4736 if ( CHECK_WORD("MARK",start_verb,verb_len) )
4739 case 'N': /* (*NOMATCH) */
4740 if ( CHECK_WORD("NOMATCH",start_verb,verb_len) )
4746 vFAIL3("Unknown verb pattern '%.*s'",
4747 verb_len, start_verb);
4750 if ( start_arg && internal_argval ) {
4751 vFAIL3("Verb pattern '%.*s' may not have an argument",
4752 verb_len, start_verb);
4753 } else if ( argok < 0 && !start_arg ) {
4754 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
4755 verb_len, start_verb);
4757 ret = reganode(pRExC_state, op, internal_argval);
4758 if ( ! internal_argval && ! SIZE_ONLY ) {
4760 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
4761 ARG(ret) = add_data( pRExC_state, 1, "S" );
4762 RExC_rx->data->data[ARG(ret)]=(void*)sv;
4769 if (!internal_argval)
4770 RExC_seen |= REG_SEEN_VERBARG;
4771 } else if ( start_arg ) {
4772 vFAIL3("Verb pattern '%.*s' may not have an argument",
4773 verb_len, start_verb);
4775 ret = reg_node(pRExC_state, op);
4777 nextchar(pRExC_state);
4780 if (*RExC_parse == '?') { /* (?...) */
4781 U32 posflags = 0, negflags = 0;
4782 U32 *flagsp = &posflags;
4783 bool is_logical = 0;
4784 const char * const seqstart = RExC_parse;
4787 paren = *RExC_parse++;
4788 ret = NULL; /* For look-ahead/behind. */
4791 case '<': /* (?<...) */
4792 if (*RExC_parse == '!')
4794 else if (*RExC_parse != '=')
4799 case '\'': /* (?'...') */
4800 name_start= RExC_parse;
4801 svname = reg_scan_name(pRExC_state,
4802 SIZE_ONLY ? /* reverse test from the others */
4803 REG_RSN_RETURN_NAME :
4804 REG_RSN_RETURN_NULL);
4805 if (RExC_parse == name_start)
4807 if (*RExC_parse != paren)
4808 vFAIL2("Sequence (?%c... not terminated",
4809 paren=='>' ? '<' : paren);
4813 if (!svname) /* shouldnt happen */
4815 "panic: reg_scan_name returned NULL");
4816 if (!RExC_paren_names) {
4817 RExC_paren_names= newHV();
4818 sv_2mortal((SV*)RExC_paren_names);
4820 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
4822 sv_dat = HeVAL(he_str);
4824 /* croak baby croak */
4826 "panic: paren_name hash element allocation failed");
4827 } else if ( SvPOK(sv_dat) ) {
4828 IV count=SvIV(sv_dat);
4829 I32 *pv=(I32*)SvGROW(sv_dat,SvCUR(sv_dat)+sizeof(I32)+1);
4830 SvCUR_set(sv_dat,SvCUR(sv_dat)+sizeof(I32));
4831 pv[count]=RExC_npar;
4834 (void)SvUPGRADE(sv_dat,SVt_PVNV);
4835 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
4840 /*sv_dump(sv_dat);*/
4842 nextchar(pRExC_state);
4844 goto capturing_parens;
4846 RExC_seen |= REG_SEEN_LOOKBEHIND;
4848 case '=': /* (?=...) */
4849 case '!': /* (?!...) */
4850 RExC_seen_zerolen++;
4851 if (*RExC_parse == ')') {
4852 ret=reg_node(pRExC_state, OPFAIL);
4853 nextchar(pRExC_state);
4856 case ':': /* (?:...) */
4857 case '>': /* (?>...) */
4859 case '$': /* (?$...) */
4860 case '@': /* (?@...) */
4861 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
4863 case '#': /* (?#...) */
4864 while (*RExC_parse && *RExC_parse != ')')
4866 if (*RExC_parse != ')')
4867 FAIL("Sequence (?#... not terminated");
4868 nextchar(pRExC_state);
4871 case '0' : /* (?0) */
4872 case 'R' : /* (?R) */
4873 if (*RExC_parse != ')')
4874 FAIL("Sequence (?R) not terminated");
4875 ret = reg_node(pRExC_state, GOSTART);
4876 nextchar(pRExC_state);
4879 { /* named and numeric backreferences */
4882 case '&': /* (?&NAME) */
4883 parse_start = RExC_parse - 1;
4885 SV *sv_dat = reg_scan_name(pRExC_state,
4886 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
4887 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
4889 goto gen_recurse_regop;
4891 case '1': case '2': case '3': case '4': /* (?1) */
4892 case '5': case '6': case '7': case '8': case '9':
4894 num = atoi(RExC_parse);
4895 parse_start = RExC_parse - 1; /* MJD */
4896 while (isDIGIT(*RExC_parse))
4898 if (*RExC_parse!=')')
4899 vFAIL("Expecting close bracket");
4902 ret = reganode(pRExC_state, GOSUB, num);
4904 if (num > (I32)RExC_rx->nparens) {
4906 vFAIL("Reference to nonexistent group");
4908 ARG2L_SET( ret, RExC_recurse_count++);
4910 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
4911 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
4915 RExC_seen |= REG_SEEN_RECURSE;
4916 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
4917 Set_Node_Offset(ret, parse_start); /* MJD */
4919 nextchar(pRExC_state);
4921 } /* named and numeric backreferences */
4924 case 'p': /* (?p...) */
4925 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
4926 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
4928 case '?': /* (??...) */
4930 if (*RExC_parse != '{')
4932 paren = *RExC_parse++;
4934 case '{': /* (?{...}) */
4936 I32 count = 1, n = 0;
4938 char *s = RExC_parse;
4940 RExC_seen_zerolen++;
4941 RExC_seen |= REG_SEEN_EVAL;
4942 while (count && (c = *RExC_parse)) {
4953 if (*RExC_parse != ')') {
4955 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
4959 OP_4tree *sop, *rop;
4960 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
4963 Perl_save_re_context(aTHX);
4964 rop = sv_compile_2op(sv, &sop, "re", &pad);
4965 sop->op_private |= OPpREFCOUNTED;
4966 /* re_dup will OpREFCNT_inc */
4967 OpREFCNT_set(sop, 1);
4970 n = add_data(pRExC_state, 3, "nop");
4971 RExC_rx->data->data[n] = (void*)rop;
4972 RExC_rx->data->data[n+1] = (void*)sop;
4973 RExC_rx->data->data[n+2] = (void*)pad;
4976 else { /* First pass */
4977 if (PL_reginterp_cnt < ++RExC_seen_evals
4979 /* No compiled RE interpolated, has runtime
4980 components ===> unsafe. */
4981 FAIL("Eval-group not allowed at runtime, use re 'eval'");
4982 if (PL_tainting && PL_tainted)
4983 FAIL("Eval-group in insecure regular expression");
4984 #if PERL_VERSION > 8
4985 if (IN_PERL_COMPILETIME)
4990 nextchar(pRExC_state);
4992 ret = reg_node(pRExC_state, LOGICAL);
4995 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
4996 /* deal with the length of this later - MJD */
4999 ret = reganode(pRExC_state, EVAL, n);
5000 Set_Node_Length(ret, RExC_parse - parse_start + 1);
5001 Set_Node_Offset(ret, parse_start);
5004 case '(': /* (?(?{...})...) and (?(?=...)...) */
5007 if (RExC_parse[0] == '?') { /* (?(?...)) */
5008 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
5009 || RExC_parse[1] == '<'
5010 || RExC_parse[1] == '{') { /* Lookahead or eval. */
5013 ret = reg_node(pRExC_state, LOGICAL);
5016 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
5020 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
5021 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
5023 char ch = RExC_parse[0] == '<' ? '>' : '\'';
5024 char *name_start= RExC_parse++;
5026 SV *sv_dat=reg_scan_name(pRExC_state,
5027 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5028 if (RExC_parse == name_start || *RExC_parse != ch)
5029 vFAIL2("Sequence (?(%c... not terminated",
5030 (ch == '>' ? '<' : ch));
5033 num = add_data( pRExC_state, 1, "S" );
5034 RExC_rx->data->data[num]=(void*)sv_dat;
5035 SvREFCNT_inc(sv_dat);
5037 ret = reganode(pRExC_state,NGROUPP,num);
5038 goto insert_if_check_paren;
5040 else if (RExC_parse[0] == 'D' &&
5041 RExC_parse[1] == 'E' &&
5042 RExC_parse[2] == 'F' &&
5043 RExC_parse[3] == 'I' &&
5044 RExC_parse[4] == 'N' &&
5045 RExC_parse[5] == 'E')
5047 ret = reganode(pRExC_state,DEFINEP,0);
5050 goto insert_if_check_paren;
5052 else if (RExC_parse[0] == 'R') {
5055 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5056 parno = atoi(RExC_parse++);
5057 while (isDIGIT(*RExC_parse))
5059 } else if (RExC_parse[0] == '&') {
5062 sv_dat = reg_scan_name(pRExC_state,
5063 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5064 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5066 ret = reganode(pRExC_state,INSUBP,parno);
5067 goto insert_if_check_paren;
5069 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5072 parno = atoi(RExC_parse++);
5074 while (isDIGIT(*RExC_parse))
5076 ret = reganode(pRExC_state, GROUPP, parno);
5078 insert_if_check_paren:
5079 if ((c = *nextchar(pRExC_state)) != ')')
5080 vFAIL("Switch condition not recognized");
5082 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
5083 br = regbranch(pRExC_state, &flags, 1,depth+1);
5085 br = reganode(pRExC_state, LONGJMP, 0);
5087 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
5088 c = *nextchar(pRExC_state);
5093 vFAIL("(?(DEFINE)....) does not allow branches");
5094 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
5095 regbranch(pRExC_state, &flags, 1,depth+1);
5096 REGTAIL(pRExC_state, ret, lastbr);
5099 c = *nextchar(pRExC_state);
5104 vFAIL("Switch (?(condition)... contains too many branches");
5105 ender = reg_node(pRExC_state, TAIL);
5106 REGTAIL(pRExC_state, br, ender);
5108 REGTAIL(pRExC_state, lastbr, ender);
5109 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
5112 REGTAIL(pRExC_state, ret, ender);
5116 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
5120 RExC_parse--; /* for vFAIL to print correctly */
5121 vFAIL("Sequence (? incomplete");
5125 parse_flags: /* (?i) */
5126 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
5127 /* (?g), (?gc) and (?o) are useless here
5128 and must be globally applied -- japhy */
5130 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
5131 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5132 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
5133 if (! (wastedflags & wflagbit) ) {
5134 wastedflags |= wflagbit;
5137 "Useless (%s%c) - %suse /%c modifier",
5138 flagsp == &negflags ? "?-" : "?",
5140 flagsp == &negflags ? "don't " : "",
5146 else if (*RExC_parse == 'c') {
5147 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5148 if (! (wastedflags & WASTED_C) ) {
5149 wastedflags |= WASTED_GC;
5152 "Useless (%sc) - %suse /gc modifier",
5153 flagsp == &negflags ? "?-" : "?",
5154 flagsp == &negflags ? "don't " : ""
5159 else { pmflag(flagsp, *RExC_parse); }
5163 if (*RExC_parse == '-') {
5165 wastedflags = 0; /* reset so (?g-c) warns twice */
5169 RExC_flags |= posflags;
5170 RExC_flags &= ~negflags;
5171 if (*RExC_parse == ':') {
5177 if (*RExC_parse != ')') {
5179 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5181 nextchar(pRExC_state);
5191 ret = reganode(pRExC_state, OPEN, parno);
5194 RExC_nestroot = parno;
5195 if (RExC_seen & REG_SEEN_RECURSE) {
5196 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5197 "Setting open paren #%"IVdf" to %d\n",
5198 (IV)parno, REG_NODE_NUM(ret)));
5199 RExC_open_parens[parno-1]= ret;
5202 Set_Node_Length(ret, 1); /* MJD */
5203 Set_Node_Offset(ret, RExC_parse); /* MJD */
5210 /* Pick up the branches, linking them together. */
5211 parse_start = RExC_parse; /* MJD */
5212 br = regbranch(pRExC_state, &flags, 1,depth+1);
5213 /* branch_len = (paren != 0); */
5217 if (*RExC_parse == '|') {
5218 if (!SIZE_ONLY && RExC_extralen) {
5219 reginsert(pRExC_state, BRANCHJ, br, depth+1);
5222 reginsert(pRExC_state, BRANCH, br, depth+1);
5223 Set_Node_Length(br, paren != 0);
5224 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
5228 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
5230 else if (paren == ':') {
5231 *flagp |= flags&SIMPLE;
5233 if (is_open) { /* Starts with OPEN. */
5234 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
5236 else if (paren != '?') /* Not Conditional */
5238 *flagp |= flags & (SPSTART | HASWIDTH);
5240 while (*RExC_parse == '|') {
5241 if (!SIZE_ONLY && RExC_extralen) {
5242 ender = reganode(pRExC_state, LONGJMP,0);
5243 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
5246 RExC_extralen += 2; /* Account for LONGJMP. */
5247 nextchar(pRExC_state);
5248 br = regbranch(pRExC_state, &flags, 0, depth+1);
5252 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
5256 *flagp |= flags&SPSTART;
5259 if (have_branch || paren != ':') {
5260 /* Make a closing node, and hook it on the end. */
5263 ender = reg_node(pRExC_state, TAIL);
5266 ender = reganode(pRExC_state, CLOSE, parno);
5267 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
5268 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5269 "Setting close paren #%"IVdf" to %d\n",
5270 (IV)parno, REG_NODE_NUM(ender)));
5271 RExC_close_parens[parno-1]= ender;
5272 if (RExC_nestroot == parno)
5275 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
5276 Set_Node_Length(ender,1); /* MJD */
5282 *flagp &= ~HASWIDTH;
5285 ender = reg_node(pRExC_state, SUCCEED);
5288 ender = reg_node(pRExC_state, END);
5290 assert(!RExC_opend); /* there can only be one! */
5295 REGTAIL(pRExC_state, lastbr, ender);
5297 if (have_branch && !SIZE_ONLY) {
5299 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
5301 /* Hook the tails of the branches to the closing node. */
5302 for (br = ret; br; br = regnext(br)) {
5303 const U8 op = PL_regkind[OP(br)];
5305 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
5307 else if (op == BRANCHJ) {
5308 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
5316 static const char parens[] = "=!<,>";
5318 if (paren && (p = strchr(parens, paren))) {
5319 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
5320 int flag = (p - parens) > 1;
5323 node = SUSPEND, flag = 0;
5324 reginsert(pRExC_state, node,ret, depth+1);
5325 Set_Node_Cur_Length(ret);
5326 Set_Node_Offset(ret, parse_start + 1);
5328 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
5332 /* Check for proper termination. */
5334 RExC_flags = oregflags;
5335 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
5336 RExC_parse = oregcomp_parse;
5337 vFAIL("Unmatched (");
5340 else if (!paren && RExC_parse < RExC_end) {
5341 if (*RExC_parse == ')') {
5343 vFAIL("Unmatched )");
5346 FAIL("Junk on end of regexp"); /* "Can't happen". */
5354 - regbranch - one alternative of an | operator
5356 * Implements the concatenation operator.
5359 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
5362 register regnode *ret;
5363 register regnode *chain = NULL;
5364 register regnode *latest;
5365 I32 flags = 0, c = 0;
5366 GET_RE_DEBUG_FLAGS_DECL;
5367 DEBUG_PARSE("brnc");
5371 if (!SIZE_ONLY && RExC_extralen)
5372 ret = reganode(pRExC_state, BRANCHJ,0);
5374 ret = reg_node(pRExC_state, BRANCH);
5375 Set_Node_Length(ret, 1);
5379 if (!first && SIZE_ONLY)
5380 RExC_extralen += 1; /* BRANCHJ */
5382 *flagp = WORST; /* Tentatively. */
5385 nextchar(pRExC_state);
5386 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
5388 latest = regpiece(pRExC_state, &flags,depth+1);
5389 if (latest == NULL) {
5390 if (flags & TRYAGAIN)
5394 else if (ret == NULL)
5396 *flagp |= flags&HASWIDTH;
5397 if (chain == NULL) /* First piece. */
5398 *flagp |= flags&SPSTART;
5401 REGTAIL(pRExC_state, chain, latest);
5406 if (chain == NULL) { /* Loop ran zero times. */
5407 chain = reg_node(pRExC_state, NOTHING);
5412 *flagp |= flags&SIMPLE;
5419 - regpiece - something followed by possible [*+?]
5421 * Note that the branching code sequences used for ? and the general cases
5422 * of * and + are somewhat optimized: they use the same NOTHING node as
5423 * both the endmarker for their branch list and the body of the last branch.
5424 * It might seem that this node could be dispensed with entirely, but the
5425 * endmarker role is not redundant.
5428 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
5431 register regnode *ret;
5433 register char *next;
5435 const char * const origparse = RExC_parse;
5437 I32 max = REG_INFTY;
5439 const char *maxpos = NULL;
5440 GET_RE_DEBUG_FLAGS_DECL;
5441 DEBUG_PARSE("piec");
5443 ret = regatom(pRExC_state, &flags,depth+1);
5445 if (flags & TRYAGAIN)
5452 if (op == '{' && regcurly(RExC_parse)) {
5454 parse_start = RExC_parse; /* MJD */
5455 next = RExC_parse + 1;
5456 while (isDIGIT(*next) || *next == ',') {
5465 if (*next == '}') { /* got one */
5469 min = atoi(RExC_parse);
5473 maxpos = RExC_parse;
5475 if (!max && *maxpos != '0')
5476 max = REG_INFTY; /* meaning "infinity" */
5477 else if (max >= REG_INFTY)
5478 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
5480 nextchar(pRExC_state);
5483 if ((flags&SIMPLE)) {
5484 RExC_naughty += 2 + RExC_naughty / 2;
5485 reginsert(pRExC_state, CURLY, ret, depth+1);
5486 Set_Node_Offset(ret, parse_start+1); /* MJD */
5487 Set_Node_Cur_Length(ret);
5490 regnode * const w = reg_node(pRExC_state, WHILEM);
5493 REGTAIL(pRExC_state, ret, w);
5494 if (!SIZE_ONLY && RExC_extralen) {
5495 reginsert(pRExC_state, LONGJMP,ret, depth+1);
5496 reginsert(pRExC_state, NOTHING,ret, depth+1);
5497 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
5499 reginsert(pRExC_state, CURLYX,ret, depth+1);
5501 Set_Node_Offset(ret, parse_start+1);
5502 Set_Node_Length(ret,
5503 op == '{' ? (RExC_parse - parse_start) : 1);
5505 if (!SIZE_ONLY && RExC_extralen)
5506 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
5507 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
5509 RExC_whilem_seen++, RExC_extralen += 3;
5510 RExC_naughty += 4 + RExC_naughty; /* compound interest */
5518 if (max && max < min)
5519 vFAIL("Can't do {n,m} with n > m");
5521 ARG1_SET(ret, (U16)min);
5522 ARG2_SET(ret, (U16)max);
5534 #if 0 /* Now runtime fix should be reliable. */
5536 /* if this is reinstated, don't forget to put this back into perldiag:
5538 =item Regexp *+ operand could be empty at {#} in regex m/%s/
5540 (F) The part of the regexp subject to either the * or + quantifier
5541 could match an empty string. The {#} shows in the regular
5542 expression about where the problem was discovered.
5546 if (!(flags&HASWIDTH) && op != '?')
5547 vFAIL("Regexp *+ operand could be empty");
5550 parse_start = RExC_parse;
5551 nextchar(pRExC_state);
5553 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
5555 if (op == '*' && (flags&SIMPLE)) {
5556 reginsert(pRExC_state, STAR, ret, depth+1);
5560 else if (op == '*') {
5564 else if (op == '+' && (flags&SIMPLE)) {
5565 reginsert(pRExC_state, PLUS, ret, depth+1);
5569 else if (op == '+') {
5573 else if (op == '?') {
5578 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
5580 "%.*s matches null string many times",
5581 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
5585 if (RExC_parse < RExC_end && *RExC_parse == '?') {
5586 nextchar(pRExC_state);
5587 reginsert(pRExC_state, MINMOD, ret, depth+1);
5588 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
5590 #ifndef REG_ALLOW_MINMOD_SUSPEND
5593 if (RExC_parse < RExC_end && *RExC_parse == '+') {
5595 nextchar(pRExC_state);
5596 ender = reg_node(pRExC_state, SUCCEED);
5597 REGTAIL(pRExC_state, ret, ender);
5598 reginsert(pRExC_state, SUSPEND, ret, depth+1);
5600 ender = reg_node(pRExC_state, TAIL);
5601 REGTAIL(pRExC_state, ret, ender);
5605 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
5607 vFAIL("Nested quantifiers");
5614 /* reg_namedseq(pRExC_state,UVp)
5616 This is expected to be called by a parser routine that has
5617 recognized'\N' and needs to handle the rest. RExC_parse is
5618 expected to point at the first char following the N at the time
5621 If valuep is non-null then it is assumed that we are parsing inside
5622 of a charclass definition and the first codepoint in the resolved
5623 string is returned via *valuep and the routine will return NULL.
5624 In this mode if a multichar string is returned from the charnames
5625 handler a warning will be issued, and only the first char in the
5626 sequence will be examined. If the string returned is zero length
5627 then the value of *valuep is undefined and NON-NULL will
5628 be returned to indicate failure. (This will NOT be a valid pointer
5631 If value is null then it is assumed that we are parsing normal text
5632 and inserts a new EXACT node into the program containing the resolved
5633 string and returns a pointer to the new node. If the string is
5634 zerolength a NOTHING node is emitted.
5636 On success RExC_parse is set to the char following the endbrace.
5637 Parsing failures will generate a fatal errorvia vFAIL(...)
5639 NOTE: We cache all results from the charnames handler locally in
5640 the RExC_charnames hash (created on first use) to prevent a charnames
5641 handler from playing silly-buggers and returning a short string and
5642 then a long string for a given pattern. Since the regexp program
5643 size is calculated during an initial parse this would result
5644 in a buffer overrun so we cache to prevent the charname result from
5645 changing during the course of the parse.
5649 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep)
5651 char * name; /* start of the content of the name */
5652 char * endbrace; /* endbrace following the name */
5655 STRLEN len; /* this has various purposes throughout the code */
5656 bool cached = 0; /* if this is true then we shouldn't refcount dev sv_str */
5657 regnode *ret = NULL;
5659 if (*RExC_parse != '{') {
5660 vFAIL("Missing braces on \\N{}");
5662 name = RExC_parse+1;
5663 endbrace = strchr(RExC_parse, '}');
5666 vFAIL("Missing right brace on \\N{}");
5668 RExC_parse = endbrace + 1;
5671 /* RExC_parse points at the beginning brace,
5672 endbrace points at the last */
5673 if ( name[0]=='U' && name[1]=='+' ) {
5674 /* its a "unicode hex" notation {U+89AB} */
5675 I32 fl = PERL_SCAN_ALLOW_UNDERSCORES
5676 | PERL_SCAN_DISALLOW_PREFIX
5677 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
5679 len = (STRLEN)(endbrace - name - 2);
5680 cp = grok_hex(name + 2, &len, &fl, NULL);
5681 if ( len != (STRLEN)(endbrace - name - 2) ) {
5690 sv_str= Perl_newSVpvf_nocontext("%c",(int)cp);
5692 /* fetch the charnames handler for this scope */
5693 HV * const table = GvHV(PL_hintgv);
5695 hv_fetchs(table, "charnames", FALSE) :
5697 SV *cv= cvp ? *cvp : NULL;
5700 /* create an SV with the name as argument */
5701 sv_name = newSVpvn(name, endbrace - name);
5703 if (!table || !(PL_hints & HINT_LOCALIZE_HH)) {
5704 vFAIL2("Constant(\\N{%s}) unknown: "
5705 "(possibly a missing \"use charnames ...\")",
5708 if (!cvp || !SvOK(*cvp)) { /* when $^H{charnames} = undef; */
5709 vFAIL2("Constant(\\N{%s}): "
5710 "$^H{charnames} is not defined",SvPVX(sv_name));
5715 if (!RExC_charnames) {
5716 /* make sure our cache is allocated */
5717 RExC_charnames = newHV();
5718 sv_2mortal((SV*)RExC_charnames);
5720 /* see if we have looked this one up before */
5721 he_str = hv_fetch_ent( RExC_charnames, sv_name, 0, 0 );
5723 sv_str = HeVAL(he_str);
5736 count= call_sv(cv, G_SCALAR);
5738 if (count == 1) { /* XXXX is this right? dmq */
5740 SvREFCNT_inc_simple_void(sv_str);
5748 if ( !sv_str || !SvOK(sv_str) ) {
5749 vFAIL2("Constant(\\N{%s}): Call to &{$^H{charnames}} "
5750 "did not return a defined value",SvPVX(sv_name));
5752 if (hv_store_ent( RExC_charnames, sv_name, sv_str, 0))
5757 char *p = SvPV(sv_str, len);
5760 if ( SvUTF8(sv_str) ) {
5761 *valuep = utf8_to_uvchr((U8*)p, &numlen);
5765 We have to turn on utf8 for high bit chars otherwise
5766 we get failures with
5768 "ss" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
5769 "SS" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
5771 This is different from what \x{} would do with the same
5772 codepoint, where the condition is > 0xFF.
5779 /* warn if we havent used the whole string? */
5781 if (numlen<len && SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5783 "Ignoring excess chars from \\N{%s} in character class",
5787 } else if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5789 "Ignoring zero length \\N{%s} in character class",
5794 SvREFCNT_dec(sv_name);
5796 SvREFCNT_dec(sv_str);
5797 return len ? NULL : (regnode *)&len;
5798 } else if(SvCUR(sv_str)) {
5803 char * parse_start = name-3; /* needed for the offsets */
5804 GET_RE_DEBUG_FLAGS_DECL; /* needed for the offsets */
5806 ret = reg_node(pRExC_state,
5807 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
5810 if ( RExC_utf8 && !SvUTF8(sv_str) ) {
5811 sv_utf8_upgrade(sv_str);
5812 } else if ( !RExC_utf8 && SvUTF8(sv_str) ) {
5816 p = SvPV(sv_str, len);
5818 /* len is the length written, charlen is the size the char read */
5819 for ( len = 0; p < pend; p += charlen ) {
5821 UV uvc = utf8_to_uvchr((U8*)p, &charlen);
5823 STRLEN foldlen,numlen;
5824 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
5825 uvc = toFOLD_uni(uvc, tmpbuf, &foldlen);
5826 /* Emit all the Unicode characters. */
5828 for (foldbuf = tmpbuf;
5832 uvc = utf8_to_uvchr(foldbuf, &numlen);
5834 const STRLEN unilen = reguni(pRExC_state, uvc, s);
5837 /* In EBCDIC the numlen
5838 * and unilen can differ. */
5840 if (numlen >= foldlen)
5844 break; /* "Can't happen." */
5847 const STRLEN unilen = reguni(pRExC_state, uvc, s);
5859 RExC_size += STR_SZ(len);
5862 RExC_emit += STR_SZ(len);
5864 Set_Node_Cur_Length(ret); /* MJD */
5866 nextchar(pRExC_state);
5868 ret = reg_node(pRExC_state,NOTHING);
5871 SvREFCNT_dec(sv_str);
5874 SvREFCNT_dec(sv_name);
5884 * It returns the code point in utf8 for the value in *encp.
5885 * value: a code value in the source encoding
5886 * encp: a pointer to an Encode object
5888 * If the result from Encode is not a single character,
5889 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
5892 S_reg_recode(pTHX_ const char value, SV **encp)
5895 SV * const sv = sv_2mortal(newSVpvn(&value, numlen));
5896 const char * const s = encp && *encp ? sv_recode_to_utf8(sv, *encp)
5898 const STRLEN newlen = SvCUR(sv);
5899 UV uv = UNICODE_REPLACEMENT;
5903 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
5906 if (!newlen || numlen != newlen) {
5907 uv = UNICODE_REPLACEMENT;
5916 - regatom - the lowest level
5918 * Optimization: gobbles an entire sequence of ordinary characters so that
5919 * it can turn them into a single node, which is smaller to store and
5920 * faster to run. Backslashed characters are exceptions, each becoming a
5921 * separate node; the code is simpler that way and it's not worth fixing.
5923 * [Yes, it is worth fixing, some scripts can run twice the speed.]
5924 * [It looks like its ok, as in S_study_chunk we merge adjacent EXACT nodes]
5927 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
5930 register regnode *ret = NULL;
5932 char *parse_start = RExC_parse;
5933 GET_RE_DEBUG_FLAGS_DECL;
5934 DEBUG_PARSE("atom");
5935 *flagp = WORST; /* Tentatively. */
5938 switch (*RExC_parse) {
5940 RExC_seen_zerolen++;
5941 nextchar(pRExC_state);
5942 if (RExC_flags & PMf_MULTILINE)
5943 ret = reg_node(pRExC_state, MBOL);
5944 else if (RExC_flags & PMf_SINGLELINE)
5945 ret = reg_node(pRExC_state, SBOL);
5947 ret = reg_node(pRExC_state, BOL);
5948 Set_Node_Length(ret, 1); /* MJD */
5951 nextchar(pRExC_state);
5953 RExC_seen_zerolen++;
5954 if (RExC_flags & PMf_MULTILINE)
5955 ret = reg_node(pRExC_state, MEOL);
5956 else if (RExC_flags & PMf_SINGLELINE)
5957 ret = reg_node(pRExC_state, SEOL);
5959 ret = reg_node(pRExC_state, EOL);
5960 Set_Node_Length(ret, 1); /* MJD */
5963 nextchar(pRExC_state);
5964 if (RExC_flags & PMf_SINGLELINE)
5965 ret = reg_node(pRExC_state, SANY);
5967 ret = reg_node(pRExC_state, REG_ANY);
5968 *flagp |= HASWIDTH|SIMPLE;
5970 Set_Node_Length(ret, 1); /* MJD */
5974 char * const oregcomp_parse = ++RExC_parse;
5975 ret = regclass(pRExC_state,depth+1);
5976 if (*RExC_parse != ']') {
5977 RExC_parse = oregcomp_parse;
5978 vFAIL("Unmatched [");
5980 nextchar(pRExC_state);
5981 *flagp |= HASWIDTH|SIMPLE;
5982 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
5986 nextchar(pRExC_state);
5987 ret = reg(pRExC_state, 1, &flags,depth+1);
5989 if (flags & TRYAGAIN) {
5990 if (RExC_parse == RExC_end) {
5991 /* Make parent create an empty node if needed. */
5999 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
6003 if (flags & TRYAGAIN) {
6007 vFAIL("Internal urp");
6008 /* Supposed to be caught earlier. */
6011 if (!regcurly(RExC_parse)) {
6020 vFAIL("Quantifier follows nothing");
6023 switch (*++RExC_parse) {
6025 RExC_seen_zerolen++;
6026 ret = reg_node(pRExC_state, SBOL);
6028 nextchar(pRExC_state);
6029 Set_Node_Length(ret, 2); /* MJD */
6032 ret = reg_node(pRExC_state, GPOS);
6033 RExC_seen |= REG_SEEN_GPOS;
6035 nextchar(pRExC_state);
6036 Set_Node_Length(ret, 2); /* MJD */
6039 ret = reg_node(pRExC_state, SEOL);
6041 RExC_seen_zerolen++; /* Do not optimize RE away */
6042 nextchar(pRExC_state);
6045 ret = reg_node(pRExC_state, EOS);
6047 RExC_seen_zerolen++; /* Do not optimize RE away */
6048 nextchar(pRExC_state);
6049 Set_Node_Length(ret, 2); /* MJD */
6052 ret = reg_node(pRExC_state, CANY);
6053 RExC_seen |= REG_SEEN_CANY;
6054 *flagp |= HASWIDTH|SIMPLE;
6055 nextchar(pRExC_state);
6056 Set_Node_Length(ret, 2); /* MJD */
6059 ret = reg_node(pRExC_state, CLUMP);
6061 nextchar(pRExC_state);
6062 Set_Node_Length(ret, 2); /* MJD */
6065 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
6066 *flagp |= HASWIDTH|SIMPLE;
6067 nextchar(pRExC_state);
6068 Set_Node_Length(ret, 2); /* MJD */
6071 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
6072 *flagp |= HASWIDTH|SIMPLE;
6073 nextchar(pRExC_state);
6074 Set_Node_Length(ret, 2); /* MJD */
6077 RExC_seen_zerolen++;
6078 RExC_seen |= REG_SEEN_LOOKBEHIND;
6079 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
6081 nextchar(pRExC_state);
6082 Set_Node_Length(ret, 2); /* MJD */
6085 RExC_seen_zerolen++;
6086 RExC_seen |= REG_SEEN_LOOKBEHIND;
6087 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
6089 nextchar(pRExC_state);
6090 Set_Node_Length(ret, 2); /* MJD */
6093 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
6094 *flagp |= HASWIDTH|SIMPLE;
6095 nextchar(pRExC_state);
6096 Set_Node_Length(ret, 2); /* MJD */
6099 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
6100 *flagp |= HASWIDTH|SIMPLE;
6101 nextchar(pRExC_state);
6102 Set_Node_Length(ret, 2); /* MJD */
6105 ret = reg_node(pRExC_state, DIGIT);
6106 *flagp |= HASWIDTH|SIMPLE;
6107 nextchar(pRExC_state);
6108 Set_Node_Length(ret, 2); /* MJD */
6111 ret = reg_node(pRExC_state, NDIGIT);
6112 *flagp |= HASWIDTH|SIMPLE;
6113 nextchar(pRExC_state);
6114 Set_Node_Length(ret, 2); /* MJD */
6119 char* const oldregxend = RExC_end;
6120 char* parse_start = RExC_parse - 2;
6122 if (RExC_parse[1] == '{') {
6123 /* a lovely hack--pretend we saw [\pX] instead */
6124 RExC_end = strchr(RExC_parse, '}');
6126 const U8 c = (U8)*RExC_parse;
6128 RExC_end = oldregxend;
6129 vFAIL2("Missing right brace on \\%c{}", c);
6134 RExC_end = RExC_parse + 2;
6135 if (RExC_end > oldregxend)
6136 RExC_end = oldregxend;
6140 ret = regclass(pRExC_state,depth+1);
6142 RExC_end = oldregxend;
6145 Set_Node_Offset(ret, parse_start + 2);
6146 Set_Node_Cur_Length(ret);
6147 nextchar(pRExC_state);
6148 *flagp |= HASWIDTH|SIMPLE;
6152 /* Handle \N{NAME} here and not below because it can be
6153 multicharacter. join_exact() will join them up later on.
6154 Also this makes sure that things like /\N{BLAH}+/ and
6155 \N{BLAH} being multi char Just Happen. dmq*/
6157 ret= reg_namedseq(pRExC_state, NULL);
6159 case 'k': /* Handle \k<NAME> and \k'NAME' */
6161 char ch= RExC_parse[1];
6162 if (ch != '<' && ch != '\'') {
6164 vWARN( RExC_parse + 1,
6165 "Possible broken named back reference treated as literal k");
6169 char* name_start = (RExC_parse += 2);
6171 SV *sv_dat = reg_scan_name(pRExC_state,
6172 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6173 ch= (ch == '<') ? '>' : '\'';
6175 if (RExC_parse == name_start || *RExC_parse != ch)
6176 vFAIL2("Sequence \\k%c... not terminated",
6177 (ch == '>' ? '<' : ch));
6180 ret = reganode(pRExC_state,
6181 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
6187 num = add_data( pRExC_state, 1, "S" );
6189 RExC_rx->data->data[num]=(void*)sv_dat;
6190 SvREFCNT_inc(sv_dat);
6192 /* override incorrect value set in reganode MJD */
6193 Set_Node_Offset(ret, parse_start+1);
6194 Set_Node_Cur_Length(ret); /* MJD */
6195 nextchar(pRExC_state);
6210 case '1': case '2': case '3': case '4':
6211 case '5': case '6': case '7': case '8': case '9':
6213 const I32 num = atoi(RExC_parse);
6215 if (num > 9 && num >= RExC_npar)
6218 char * const parse_start = RExC_parse - 1; /* MJD */
6219 while (isDIGIT(*RExC_parse))
6222 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
6223 vFAIL("Reference to nonexistent group");
6225 ret = reganode(pRExC_state,
6226 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
6230 /* override incorrect value set in reganode MJD */
6231 Set_Node_Offset(ret, parse_start+1);
6232 Set_Node_Cur_Length(ret); /* MJD */
6234 nextchar(pRExC_state);
6239 if (RExC_parse >= RExC_end)
6240 FAIL("Trailing \\");
6243 /* Do not generate "unrecognized" warnings here, we fall
6244 back into the quick-grab loop below */
6251 if (RExC_flags & PMf_EXTENDED) {
6252 while (RExC_parse < RExC_end && *RExC_parse != '\n')
6254 if (RExC_parse < RExC_end)
6260 register STRLEN len;
6265 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
6267 parse_start = RExC_parse - 1;
6273 ret = reg_node(pRExC_state,
6274 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
6276 for (len = 0, p = RExC_parse - 1;
6277 len < 127 && p < RExC_end;
6280 char * const oldp = p;
6282 if (RExC_flags & PMf_EXTENDED)
6283 p = regwhite(p, RExC_end);
6331 ender = ASCII_TO_NATIVE('\033');
6335 ender = ASCII_TO_NATIVE('\007');
6340 char* const e = strchr(p, '}');
6344 vFAIL("Missing right brace on \\x{}");
6347 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6348 | PERL_SCAN_DISALLOW_PREFIX;
6349 STRLEN numlen = e - p - 1;
6350 ender = grok_hex(p + 1, &numlen, &flags, NULL);
6357 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
6359 ender = grok_hex(p, &numlen, &flags, NULL);
6362 if (PL_encoding && ender < 0x100)
6363 goto recode_encoding;
6367 ender = UCHARAT(p++);
6368 ender = toCTRL(ender);
6370 case '0': case '1': case '2': case '3':case '4':
6371 case '5': case '6': case '7': case '8':case '9':
6373 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
6376 ender = grok_oct(p, &numlen, &flags, NULL);
6383 if (PL_encoding && ender < 0x100)
6384 goto recode_encoding;
6388 SV* enc = PL_encoding;
6389 ender = reg_recode((const char)(U8)ender, &enc);
6390 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
6391 vWARN(p, "Invalid escape in the specified encoding");
6397 FAIL("Trailing \\");
6400 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
6401 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
6402 goto normal_default;
6407 if (UTF8_IS_START(*p) && UTF) {
6409 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
6410 &numlen, UTF8_ALLOW_DEFAULT);
6417 if (RExC_flags & PMf_EXTENDED)
6418 p = regwhite(p, RExC_end);
6420 /* Prime the casefolded buffer. */
6421 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
6423 if (ISMULT2(p)) { /* Back off on ?+*. */
6428 /* Emit all the Unicode characters. */
6430 for (foldbuf = tmpbuf;
6432 foldlen -= numlen) {
6433 ender = utf8_to_uvchr(foldbuf, &numlen);
6435 const STRLEN unilen = reguni(pRExC_state, ender, s);
6438 /* In EBCDIC the numlen
6439 * and unilen can differ. */
6441 if (numlen >= foldlen)
6445 break; /* "Can't happen." */
6449 const STRLEN unilen = reguni(pRExC_state, ender, s);
6458 REGC((char)ender, s++);
6464 /* Emit all the Unicode characters. */
6466 for (foldbuf = tmpbuf;
6468 foldlen -= numlen) {
6469 ender = utf8_to_uvchr(foldbuf, &numlen);
6471 const STRLEN unilen = reguni(pRExC_state, ender, s);
6474 /* In EBCDIC the numlen
6475 * and unilen can differ. */
6477 if (numlen >= foldlen)
6485 const STRLEN unilen = reguni(pRExC_state, ender, s);
6494 REGC((char)ender, s++);
6498 Set_Node_Cur_Length(ret); /* MJD */
6499 nextchar(pRExC_state);
6501 /* len is STRLEN which is unsigned, need to copy to signed */
6504 vFAIL("Internal disaster");
6508 if (len == 1 && UNI_IS_INVARIANT(ender))
6512 RExC_size += STR_SZ(len);
6515 RExC_emit += STR_SZ(len);
6525 S_regwhite(char *p, const char *e)
6530 else if (*p == '#') {
6533 } while (p < e && *p != '\n');
6541 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
6542 Character classes ([:foo:]) can also be negated ([:^foo:]).
6543 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
6544 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
6545 but trigger failures because they are currently unimplemented. */
6547 #define POSIXCC_DONE(c) ((c) == ':')
6548 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
6549 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
6552 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
6555 I32 namedclass = OOB_NAMEDCLASS;
6557 if (value == '[' && RExC_parse + 1 < RExC_end &&
6558 /* I smell either [: or [= or [. -- POSIX has been here, right? */
6559 POSIXCC(UCHARAT(RExC_parse))) {
6560 const char c = UCHARAT(RExC_parse);
6561 char* const s = RExC_parse++;
6563 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
6565 if (RExC_parse == RExC_end)
6566 /* Grandfather lone [:, [=, [. */
6569 const char* const t = RExC_parse++; /* skip over the c */
6572 if (UCHARAT(RExC_parse) == ']') {
6573 const char *posixcc = s + 1;
6574 RExC_parse++; /* skip over the ending ] */
6577 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
6578 const I32 skip = t - posixcc;
6580 /* Initially switch on the length of the name. */
6583 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
6584 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
6587 /* Names all of length 5. */
6588 /* alnum alpha ascii blank cntrl digit graph lower
6589 print punct space upper */
6590 /* Offset 4 gives the best switch position. */
6591 switch (posixcc[4]) {
6593 if (memEQ(posixcc, "alph", 4)) /* alpha */
6594 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
6597 if (memEQ(posixcc, "spac", 4)) /* space */
6598 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
6601 if (memEQ(posixcc, "grap", 4)) /* graph */
6602 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
6605 if (memEQ(posixcc, "asci", 4)) /* ascii */
6606 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
6609 if (memEQ(posixcc, "blan", 4)) /* blank */
6610 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
6613 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
6614 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
6617 if (memEQ(posixcc, "alnu", 4)) /* alnum */
6618 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
6621 if (memEQ(posixcc, "lowe", 4)) /* lower */
6622 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
6623 else if (memEQ(posixcc, "uppe", 4)) /* upper */
6624 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
6627 if (memEQ(posixcc, "digi", 4)) /* digit */
6628 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
6629 else if (memEQ(posixcc, "prin", 4)) /* print */
6630 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
6631 else if (memEQ(posixcc, "punc", 4)) /* punct */
6632 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
6637 if (memEQ(posixcc, "xdigit", 6))
6638 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
6642 if (namedclass == OOB_NAMEDCLASS)
6643 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
6645 assert (posixcc[skip] == ':');
6646 assert (posixcc[skip+1] == ']');
6647 } else if (!SIZE_ONLY) {
6648 /* [[=foo=]] and [[.foo.]] are still future. */
6650 /* adjust RExC_parse so the warning shows after
6652 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
6654 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
6657 /* Maternal grandfather:
6658 * "[:" ending in ":" but not in ":]" */
6668 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
6671 if (POSIXCC(UCHARAT(RExC_parse))) {
6672 const char *s = RExC_parse;
6673 const char c = *s++;
6677 if (*s && c == *s && s[1] == ']') {
6678 if (ckWARN(WARN_REGEXP))
6680 "POSIX syntax [%c %c] belongs inside character classes",
6683 /* [[=foo=]] and [[.foo.]] are still future. */
6684 if (POSIXCC_NOTYET(c)) {
6685 /* adjust RExC_parse so the error shows after
6687 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
6689 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
6697 parse a class specification and produce either an ANYOF node that
6698 matches the pattern. If the pattern matches a single char only and
6699 that char is < 256 then we produce an EXACT node instead.
6702 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
6705 register UV value = 0;
6706 register UV nextvalue;
6707 register IV prevvalue = OOB_UNICODE;
6708 register IV range = 0;
6709 register regnode *ret;
6712 char *rangebegin = NULL;
6713 bool need_class = 0;
6716 bool optimize_invert = TRUE;
6717 AV* unicode_alternate = NULL;
6719 UV literal_endpoint = 0;
6721 UV stored = 0; /* number of chars stored in the class */
6723 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
6724 case we need to change the emitted regop to an EXACT. */
6725 const char * orig_parse = RExC_parse;
6726 GET_RE_DEBUG_FLAGS_DECL;
6728 PERL_UNUSED_ARG(depth);
6731 DEBUG_PARSE("clas");
6733 /* Assume we are going to generate an ANYOF node. */
6734 ret = reganode(pRExC_state, ANYOF, 0);
6737 ANYOF_FLAGS(ret) = 0;
6739 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
6743 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
6747 RExC_size += ANYOF_SKIP;
6748 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
6751 RExC_emit += ANYOF_SKIP;
6753 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
6755 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
6756 ANYOF_BITMAP_ZERO(ret);
6757 listsv = newSVpvs("# comment\n");
6760 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
6762 if (!SIZE_ONLY && POSIXCC(nextvalue))
6763 checkposixcc(pRExC_state);
6765 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
6766 if (UCHARAT(RExC_parse) == ']')
6770 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
6774 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
6777 rangebegin = RExC_parse;
6779 value = utf8n_to_uvchr((U8*)RExC_parse,
6780 RExC_end - RExC_parse,
6781 &numlen, UTF8_ALLOW_DEFAULT);
6782 RExC_parse += numlen;
6785 value = UCHARAT(RExC_parse++);
6787 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
6788 if (value == '[' && POSIXCC(nextvalue))
6789 namedclass = regpposixcc(pRExC_state, value);
6790 else if (value == '\\') {
6792 value = utf8n_to_uvchr((U8*)RExC_parse,
6793 RExC_end - RExC_parse,
6794 &numlen, UTF8_ALLOW_DEFAULT);
6795 RExC_parse += numlen;
6798 value = UCHARAT(RExC_parse++);
6799 /* Some compilers cannot handle switching on 64-bit integer
6800 * values, therefore value cannot be an UV. Yes, this will
6801 * be a problem later if we want switch on Unicode.
6802 * A similar issue a little bit later when switching on
6803 * namedclass. --jhi */
6804 switch ((I32)value) {
6805 case 'w': namedclass = ANYOF_ALNUM; break;
6806 case 'W': namedclass = ANYOF_NALNUM; break;
6807 case 's': namedclass = ANYOF_SPACE; break;
6808 case 'S': namedclass = ANYOF_NSPACE; break;
6809 case 'd': namedclass = ANYOF_DIGIT; break;
6810 case 'D': namedclass = ANYOF_NDIGIT; break;
6811 case 'N': /* Handle \N{NAME} in class */
6813 /* We only pay attention to the first char of
6814 multichar strings being returned. I kinda wonder
6815 if this makes sense as it does change the behaviour
6816 from earlier versions, OTOH that behaviour was broken
6818 UV v; /* value is register so we cant & it /grrr */
6819 if (reg_namedseq(pRExC_state, &v)) {
6829 if (RExC_parse >= RExC_end)
6830 vFAIL2("Empty \\%c{}", (U8)value);
6831 if (*RExC_parse == '{') {
6832 const U8 c = (U8)value;
6833 e = strchr(RExC_parse++, '}');
6835 vFAIL2("Missing right brace on \\%c{}", c);
6836 while (isSPACE(UCHARAT(RExC_parse)))
6838 if (e == RExC_parse)
6839 vFAIL2("Empty \\%c{}", c);
6841 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
6849 if (UCHARAT(RExC_parse) == '^') {
6852 value = value == 'p' ? 'P' : 'p'; /* toggle */
6853 while (isSPACE(UCHARAT(RExC_parse))) {
6858 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
6859 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
6862 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
6863 namedclass = ANYOF_MAX; /* no official name, but it's named */
6866 case 'n': value = '\n'; break;
6867 case 'r': value = '\r'; break;
6868 case 't': value = '\t'; break;
6869 case 'f': value = '\f'; break;
6870 case 'b': value = '\b'; break;
6871 case 'e': value = ASCII_TO_NATIVE('\033');break;
6872 case 'a': value = ASCII_TO_NATIVE('\007');break;
6874 if (*RExC_parse == '{') {
6875 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6876 | PERL_SCAN_DISALLOW_PREFIX;
6877 char * const e = strchr(RExC_parse++, '}');
6879 vFAIL("Missing right brace on \\x{}");
6881 numlen = e - RExC_parse;
6882 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
6886 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
6888 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
6889 RExC_parse += numlen;
6891 if (PL_encoding && value < 0x100)
6892 goto recode_encoding;
6895 value = UCHARAT(RExC_parse++);
6896 value = toCTRL(value);
6898 case '0': case '1': case '2': case '3': case '4':
6899 case '5': case '6': case '7': case '8': case '9':
6903 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
6904 RExC_parse += numlen;
6905 if (PL_encoding && value < 0x100)
6906 goto recode_encoding;
6911 SV* enc = PL_encoding;
6912 value = reg_recode((const char)(U8)value, &enc);
6913 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
6915 "Invalid escape in the specified encoding");
6919 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
6921 "Unrecognized escape \\%c in character class passed through",
6925 } /* end of \blah */
6931 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
6933 if (!SIZE_ONLY && !need_class)
6934 ANYOF_CLASS_ZERO(ret);
6938 /* a bad range like a-\d, a-[:digit:] ? */
6941 if (ckWARN(WARN_REGEXP)) {
6943 RExC_parse >= rangebegin ?
6944 RExC_parse - rangebegin : 0;
6946 "False [] range \"%*.*s\"",
6949 if (prevvalue < 256) {
6950 ANYOF_BITMAP_SET(ret, prevvalue);
6951 ANYOF_BITMAP_SET(ret, '-');
6954 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
6955 Perl_sv_catpvf(aTHX_ listsv,
6956 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
6960 range = 0; /* this was not a true range */
6964 const char *what = NULL;
6967 if (namedclass > OOB_NAMEDCLASS)
6968 optimize_invert = FALSE;
6969 /* Possible truncation here but in some 64-bit environments
6970 * the compiler gets heartburn about switch on 64-bit values.
6971 * A similar issue a little earlier when switching on value.
6973 switch ((I32)namedclass) {
6976 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
6978 for (value = 0; value < 256; value++)
6980 ANYOF_BITMAP_SET(ret, value);
6987 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
6989 for (value = 0; value < 256; value++)
6990 if (!isALNUM(value))
6991 ANYOF_BITMAP_SET(ret, value);
6998 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
7000 for (value = 0; value < 256; value++)
7001 if (isALNUMC(value))
7002 ANYOF_BITMAP_SET(ret, value);
7009 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
7011 for (value = 0; value < 256; value++)
7012 if (!isALNUMC(value))
7013 ANYOF_BITMAP_SET(ret, value);
7020 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
7022 for (value = 0; value < 256; value++)
7024 ANYOF_BITMAP_SET(ret, value);
7031 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
7033 for (value = 0; value < 256; value++)
7034 if (!isALPHA(value))
7035 ANYOF_BITMAP_SET(ret, value);
7042 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
7045 for (value = 0; value < 128; value++)
7046 ANYOF_BITMAP_SET(ret, value);
7048 for (value = 0; value < 256; value++) {
7050 ANYOF_BITMAP_SET(ret, value);
7059 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
7062 for (value = 128; value < 256; value++)
7063 ANYOF_BITMAP_SET(ret, value);
7065 for (value = 0; value < 256; value++) {
7066 if (!isASCII(value))
7067 ANYOF_BITMAP_SET(ret, value);
7076 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
7078 for (value = 0; value < 256; value++)
7080 ANYOF_BITMAP_SET(ret, value);
7087 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
7089 for (value = 0; value < 256; value++)
7090 if (!isBLANK(value))
7091 ANYOF_BITMAP_SET(ret, value);
7098 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
7100 for (value = 0; value < 256; value++)
7102 ANYOF_BITMAP_SET(ret, value);
7109 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
7111 for (value = 0; value < 256; value++)
7112 if (!isCNTRL(value))
7113 ANYOF_BITMAP_SET(ret, value);
7120 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
7122 /* consecutive digits assumed */
7123 for (value = '0'; value <= '9'; value++)
7124 ANYOF_BITMAP_SET(ret, value);
7131 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
7133 /* consecutive digits assumed */
7134 for (value = 0; value < '0'; value++)
7135 ANYOF_BITMAP_SET(ret, value);
7136 for (value = '9' + 1; value < 256; value++)
7137 ANYOF_BITMAP_SET(ret, value);
7144 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
7146 for (value = 0; value < 256; value++)
7148 ANYOF_BITMAP_SET(ret, value);
7155 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
7157 for (value = 0; value < 256; value++)
7158 if (!isGRAPH(value))
7159 ANYOF_BITMAP_SET(ret, value);
7166 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
7168 for (value = 0; value < 256; value++)
7170 ANYOF_BITMAP_SET(ret, value);
7177 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
7179 for (value = 0; value < 256; value++)
7180 if (!isLOWER(value))
7181 ANYOF_BITMAP_SET(ret, value);
7188 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
7190 for (value = 0; value < 256; value++)
7192 ANYOF_BITMAP_SET(ret, value);
7199 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
7201 for (value = 0; value < 256; value++)
7202 if (!isPRINT(value))
7203 ANYOF_BITMAP_SET(ret, value);
7210 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
7212 for (value = 0; value < 256; value++)
7213 if (isPSXSPC(value))
7214 ANYOF_BITMAP_SET(ret, value);
7221 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
7223 for (value = 0; value < 256; value++)
7224 if (!isPSXSPC(value))
7225 ANYOF_BITMAP_SET(ret, value);
7232 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
7234 for (value = 0; value < 256; value++)
7236 ANYOF_BITMAP_SET(ret, value);
7243 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
7245 for (value = 0; value < 256; value++)
7246 if (!isPUNCT(value))
7247 ANYOF_BITMAP_SET(ret, value);
7254 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
7256 for (value = 0; value < 256; value++)
7258 ANYOF_BITMAP_SET(ret, value);
7265 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
7267 for (value = 0; value < 256; value++)
7268 if (!isSPACE(value))
7269 ANYOF_BITMAP_SET(ret, value);
7276 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
7278 for (value = 0; value < 256; value++)
7280 ANYOF_BITMAP_SET(ret, value);
7287 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
7289 for (value = 0; value < 256; value++)
7290 if (!isUPPER(value))
7291 ANYOF_BITMAP_SET(ret, value);
7298 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
7300 for (value = 0; value < 256; value++)
7301 if (isXDIGIT(value))
7302 ANYOF_BITMAP_SET(ret, value);
7309 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
7311 for (value = 0; value < 256; value++)
7312 if (!isXDIGIT(value))
7313 ANYOF_BITMAP_SET(ret, value);
7319 /* this is to handle \p and \P */
7322 vFAIL("Invalid [::] class");
7326 /* Strings such as "+utf8::isWord\n" */
7327 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
7330 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
7333 } /* end of namedclass \blah */
7336 if (prevvalue > (IV)value) /* b-a */ {
7337 const int w = RExC_parse - rangebegin;
7338 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
7339 range = 0; /* not a valid range */
7343 prevvalue = value; /* save the beginning of the range */
7344 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
7345 RExC_parse[1] != ']') {
7348 /* a bad range like \w-, [:word:]- ? */
7349 if (namedclass > OOB_NAMEDCLASS) {
7350 if (ckWARN(WARN_REGEXP)) {
7352 RExC_parse >= rangebegin ?
7353 RExC_parse - rangebegin : 0;
7355 "False [] range \"%*.*s\"",
7359 ANYOF_BITMAP_SET(ret, '-');
7361 range = 1; /* yeah, it's a range! */
7362 continue; /* but do it the next time */
7366 /* now is the next time */
7367 /*stored += (value - prevvalue + 1);*/
7369 if (prevvalue < 256) {
7370 const IV ceilvalue = value < 256 ? value : 255;
7373 /* In EBCDIC [\x89-\x91] should include
7374 * the \x8e but [i-j] should not. */
7375 if (literal_endpoint == 2 &&
7376 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
7377 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
7379 if (isLOWER(prevvalue)) {
7380 for (i = prevvalue; i <= ceilvalue; i++)
7382 ANYOF_BITMAP_SET(ret, i);
7384 for (i = prevvalue; i <= ceilvalue; i++)
7386 ANYOF_BITMAP_SET(ret, i);
7391 for (i = prevvalue; i <= ceilvalue; i++) {
7392 if (!ANYOF_BITMAP_TEST(ret,i)) {
7394 ANYOF_BITMAP_SET(ret, i);
7398 if (value > 255 || UTF) {
7399 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
7400 const UV natvalue = NATIVE_TO_UNI(value);
7401 stored+=2; /* can't optimize this class */
7402 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
7403 if (prevnatvalue < natvalue) { /* what about > ? */
7404 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
7405 prevnatvalue, natvalue);
7407 else if (prevnatvalue == natvalue) {
7408 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
7410 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
7412 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
7414 #ifdef EBCDIC /* RD t/uni/fold ff and 6b */
7415 if (RExC_precomp[0] == ':' &&
7416 RExC_precomp[1] == '[' &&
7417 (f == 0xDF || f == 0x92)) {
7418 f = NATIVE_TO_UNI(f);
7421 /* If folding and foldable and a single
7422 * character, insert also the folded version
7423 * to the charclass. */
7425 #ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
7426 if ((RExC_precomp[0] == ':' &&
7427 RExC_precomp[1] == '[' &&
7429 (value == 0xFB05 || value == 0xFB06))) ?
7430 foldlen == ((STRLEN)UNISKIP(f) - 1) :
7431 foldlen == (STRLEN)UNISKIP(f) )
7433 if (foldlen == (STRLEN)UNISKIP(f))
7435 Perl_sv_catpvf(aTHX_ listsv,
7438 /* Any multicharacter foldings
7439 * require the following transform:
7440 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
7441 * where E folds into "pq" and F folds
7442 * into "rst", all other characters
7443 * fold to single characters. We save
7444 * away these multicharacter foldings,
7445 * to be later saved as part of the
7446 * additional "s" data. */
7449 if (!unicode_alternate)
7450 unicode_alternate = newAV();
7451 sv = newSVpvn((char*)foldbuf, foldlen);
7453 av_push(unicode_alternate, sv);
7457 /* If folding and the value is one of the Greek
7458 * sigmas insert a few more sigmas to make the
7459 * folding rules of the sigmas to work right.
7460 * Note that not all the possible combinations
7461 * are handled here: some of them are handled
7462 * by the standard folding rules, and some of
7463 * them (literal or EXACTF cases) are handled
7464 * during runtime in regexec.c:S_find_byclass(). */
7465 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
7466 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
7467 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
7468 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
7469 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
7471 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
7472 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
7473 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
7478 literal_endpoint = 0;
7482 range = 0; /* this range (if it was one) is done now */
7486 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
7488 RExC_size += ANYOF_CLASS_ADD_SKIP;
7490 RExC_emit += ANYOF_CLASS_ADD_SKIP;
7496 /****** !SIZE_ONLY AFTER HERE *********/
7498 if( stored == 1 && value < 256
7499 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
7501 /* optimize single char class to an EXACT node
7502 but *only* when its not a UTF/high char */
7503 const char * cur_parse= RExC_parse;
7504 RExC_emit = (regnode *)orig_emit;
7505 RExC_parse = (char *)orig_parse;
7506 ret = reg_node(pRExC_state,
7507 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
7508 RExC_parse = (char *)cur_parse;
7509 *STRING(ret)= (char)value;
7511 RExC_emit += STR_SZ(1);
7514 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
7515 if ( /* If the only flag is folding (plus possibly inversion). */
7516 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
7518 for (value = 0; value < 256; ++value) {
7519 if (ANYOF_BITMAP_TEST(ret, value)) {
7520 UV fold = PL_fold[value];
7523 ANYOF_BITMAP_SET(ret, fold);
7526 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
7529 /* optimize inverted simple patterns (e.g. [^a-z]) */
7530 if (optimize_invert &&
7531 /* If the only flag is inversion. */
7532 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
7533 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
7534 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
7535 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
7538 AV * const av = newAV();
7540 /* The 0th element stores the character class description
7541 * in its textual form: used later (regexec.c:Perl_regclass_swash())
7542 * to initialize the appropriate swash (which gets stored in
7543 * the 1st element), and also useful for dumping the regnode.
7544 * The 2nd element stores the multicharacter foldings,
7545 * used later (regexec.c:S_reginclass()). */
7546 av_store(av, 0, listsv);
7547 av_store(av, 1, NULL);
7548 av_store(av, 2, (SV*)unicode_alternate);
7549 rv = newRV_noinc((SV*)av);
7550 n = add_data(pRExC_state, 1, "s");
7551 RExC_rx->data->data[n] = (void*)rv;
7558 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
7560 char* const retval = RExC_parse++;
7563 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
7564 RExC_parse[2] == '#') {
7565 while (*RExC_parse != ')') {
7566 if (RExC_parse == RExC_end)
7567 FAIL("Sequence (?#... not terminated");
7573 if (RExC_flags & PMf_EXTENDED) {
7574 if (isSPACE(*RExC_parse)) {
7578 else if (*RExC_parse == '#') {
7579 while (RExC_parse < RExC_end)
7580 if (*RExC_parse++ == '\n') break;
7589 - reg_node - emit a node
7591 STATIC regnode * /* Location. */
7592 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
7595 register regnode *ptr;
7596 regnode * const ret = RExC_emit;
7597 GET_RE_DEBUG_FLAGS_DECL;
7600 SIZE_ALIGN(RExC_size);
7605 if (OP(RExC_emit) == 255)
7606 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %s: %d ",
7607 reg_name[op], OP(RExC_emit));
7609 NODE_ALIGN_FILL(ret);
7611 FILL_ADVANCE_NODE(ptr, op);
7612 if (RExC_offsets) { /* MJD */
7613 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
7614 "reg_node", __LINE__,
7616 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
7617 ? "Overwriting end of array!\n" : "OK",
7618 (UV)(RExC_emit - RExC_emit_start),
7619 (UV)(RExC_parse - RExC_start),
7620 (UV)RExC_offsets[0]));
7621 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
7629 - reganode - emit a node with an argument
7631 STATIC regnode * /* Location. */
7632 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
7635 register regnode *ptr;
7636 regnode * const ret = RExC_emit;
7637 GET_RE_DEBUG_FLAGS_DECL;
7640 SIZE_ALIGN(RExC_size);
7645 assert(2==regarglen[op]+1);
7647 Anything larger than this has to allocate the extra amount.
7648 If we changed this to be:
7650 RExC_size += (1 + regarglen[op]);
7652 then it wouldn't matter. Its not clear what side effect
7653 might come from that so its not done so far.
7659 if (OP(RExC_emit) == 255)
7660 Perl_croak(aTHX_ "panic: reganode overwriting end of allocated program space");
7662 NODE_ALIGN_FILL(ret);
7664 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
7665 if (RExC_offsets) { /* MJD */
7666 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
7670 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
7671 "Overwriting end of array!\n" : "OK",
7672 (UV)(RExC_emit - RExC_emit_start),
7673 (UV)(RExC_parse - RExC_start),
7674 (UV)RExC_offsets[0]));
7675 Set_Cur_Node_Offset;
7683 - reguni - emit (if appropriate) a Unicode character
7686 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
7689 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
7693 - reginsert - insert an operator in front of already-emitted operand
7695 * Means relocating the operand.
7698 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
7701 register regnode *src;
7702 register regnode *dst;
7703 register regnode *place;
7704 const int offset = regarglen[(U8)op];
7705 const int size = NODE_STEP_REGNODE + offset;
7706 GET_RE_DEBUG_FLAGS_DECL;
7707 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
7708 DEBUG_PARSE_FMT("inst"," - %s",reg_name[op]);
7717 if (RExC_open_parens) {
7719 DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);
7720 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
7721 if ( RExC_open_parens[paren] >= opnd ) {
7722 DEBUG_PARSE_FMT("open"," - %d",size);
7723 RExC_open_parens[paren] += size;
7725 DEBUG_PARSE_FMT("open"," - %s","ok");
7727 if ( RExC_close_parens[paren] >= opnd ) {
7728 DEBUG_PARSE_FMT("close"," - %d",size);
7729 RExC_close_parens[paren] += size;
7731 DEBUG_PARSE_FMT("close"," - %s","ok");
7736 while (src > opnd) {
7737 StructCopy(--src, --dst, regnode);
7738 if (RExC_offsets) { /* MJD 20010112 */
7739 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
7743 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
7744 ? "Overwriting end of array!\n" : "OK",
7745 (UV)(src - RExC_emit_start),
7746 (UV)(dst - RExC_emit_start),
7747 (UV)RExC_offsets[0]));
7748 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
7749 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
7754 place = opnd; /* Op node, where operand used to be. */
7755 if (RExC_offsets) { /* MJD */
7756 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
7760 (UV)(place - RExC_emit_start) > RExC_offsets[0]
7761 ? "Overwriting end of array!\n" : "OK",
7762 (UV)(place - RExC_emit_start),
7763 (UV)(RExC_parse - RExC_start),
7764 (UV)RExC_offsets[0]));
7765 Set_Node_Offset(place, RExC_parse);
7766 Set_Node_Length(place, 1);
7768 src = NEXTOPER(place);
7769 FILL_ADVANCE_NODE(place, op);
7770 Zero(src, offset, regnode);
7774 - regtail - set the next-pointer at the end of a node chain of p to val.
7775 - SEE ALSO: regtail_study
7777 /* TODO: All three parms should be const */
7779 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
7782 register regnode *scan;
7783 GET_RE_DEBUG_FLAGS_DECL;
7785 PERL_UNUSED_ARG(depth);
7791 /* Find last node. */
7794 regnode * const temp = regnext(scan);
7796 SV * const mysv=sv_newmortal();
7797 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
7798 regprop(RExC_rx, mysv, scan);
7799 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
7800 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
7801 (temp == NULL ? "->" : ""),
7802 (temp == NULL ? reg_name[OP(val)] : "")
7810 if (reg_off_by_arg[OP(scan)]) {
7811 ARG_SET(scan, val - scan);
7814 NEXT_OFF(scan) = val - scan;
7820 - regtail_study - set the next-pointer at the end of a node chain of p to val.
7821 - Look for optimizable sequences at the same time.
7822 - currently only looks for EXACT chains.
7824 This is expermental code. The idea is to use this routine to perform
7825 in place optimizations on branches and groups as they are constructed,
7826 with the long term intention of removing optimization from study_chunk so
7827 that it is purely analytical.
7829 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
7830 to control which is which.
7833 /* TODO: All four parms should be const */
7836 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
7839 register regnode *scan;
7841 #ifdef EXPERIMENTAL_INPLACESCAN
7845 GET_RE_DEBUG_FLAGS_DECL;
7851 /* Find last node. */
7855 regnode * const temp = regnext(scan);
7856 #ifdef EXPERIMENTAL_INPLACESCAN
7857 if (PL_regkind[OP(scan)] == EXACT)
7858 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
7866 if( exact == PSEUDO )
7868 else if ( exact != OP(scan) )
7877 SV * const mysv=sv_newmortal();
7878 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
7879 regprop(RExC_rx, mysv, scan);
7880 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
7881 SvPV_nolen_const(mysv),
7890 SV * const mysv_val=sv_newmortal();
7891 DEBUG_PARSE_MSG("");
7892 regprop(RExC_rx, mysv_val, val);
7893 PerlIO_printf(Perl_debug_log, "~ attach to %s (%d) offset to %d\n",
7894 SvPV_nolen_const(mysv_val),
7899 if (reg_off_by_arg[OP(scan)]) {
7900 ARG_SET(scan, val - scan);
7903 NEXT_OFF(scan) = val - scan;
7911 - regcurly - a little FSA that accepts {\d+,?\d*}
7914 S_regcurly(register const char *s)
7933 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
7936 Perl_regdump(pTHX_ const regexp *r)
7940 SV * const sv = sv_newmortal();
7941 SV *dsv= sv_newmortal();
7943 (void)dumpuntil(r, r->program, r->program + 1, NULL, NULL, sv, 0, 0);
7945 /* Header fields of interest. */
7946 if (r->anchored_substr) {
7947 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
7948 RE_SV_DUMPLEN(r->anchored_substr), 30);
7949 PerlIO_printf(Perl_debug_log,
7950 "anchored %s%s at %"IVdf" ",
7951 s, RE_SV_TAIL(r->anchored_substr),
7952 (IV)r->anchored_offset);
7953 } else if (r->anchored_utf8) {
7954 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
7955 RE_SV_DUMPLEN(r->anchored_utf8), 30);
7956 PerlIO_printf(Perl_debug_log,
7957 "anchored utf8 %s%s at %"IVdf" ",
7958 s, RE_SV_TAIL(r->anchored_utf8),
7959 (IV)r->anchored_offset);
7961 if (r->float_substr) {
7962 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
7963 RE_SV_DUMPLEN(r->float_substr), 30);
7964 PerlIO_printf(Perl_debug_log,
7965 "floating %s%s at %"IVdf"..%"UVuf" ",
7966 s, RE_SV_TAIL(r->float_substr),
7967 (IV)r->float_min_offset, (UV)r->float_max_offset);
7968 } else if (r->float_utf8) {
7969 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
7970 RE_SV_DUMPLEN(r->float_utf8), 30);
7971 PerlIO_printf(Perl_debug_log,
7972 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
7973 s, RE_SV_TAIL(r->float_utf8),
7974 (IV)r->float_min_offset, (UV)r->float_max_offset);
7976 if (r->check_substr || r->check_utf8)
7977 PerlIO_printf(Perl_debug_log,
7979 (r->check_substr == r->float_substr
7980 && r->check_utf8 == r->float_utf8
7981 ? "(checking floating" : "(checking anchored"));
7982 if (r->reganch & ROPT_NOSCAN)
7983 PerlIO_printf(Perl_debug_log, " noscan");
7984 if (r->reganch & ROPT_CHECK_ALL)
7985 PerlIO_printf(Perl_debug_log, " isall");
7986 if (r->check_substr || r->check_utf8)
7987 PerlIO_printf(Perl_debug_log, ") ");
7989 if (r->regstclass) {
7990 regprop(r, sv, r->regstclass);
7991 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
7993 if (r->reganch & ROPT_ANCH) {
7994 PerlIO_printf(Perl_debug_log, "anchored");
7995 if (r->reganch & ROPT_ANCH_BOL)
7996 PerlIO_printf(Perl_debug_log, "(BOL)");
7997 if (r->reganch & ROPT_ANCH_MBOL)
7998 PerlIO_printf(Perl_debug_log, "(MBOL)");
7999 if (r->reganch & ROPT_ANCH_SBOL)
8000 PerlIO_printf(Perl_debug_log, "(SBOL)");
8001 if (r->reganch & ROPT_ANCH_GPOS)
8002 PerlIO_printf(Perl_debug_log, "(GPOS)");
8003 PerlIO_putc(Perl_debug_log, ' ');
8005 if (r->reganch & ROPT_GPOS_SEEN)
8006 PerlIO_printf(Perl_debug_log, "GPOS ");
8007 if (r->reganch & ROPT_SKIP)
8008 PerlIO_printf(Perl_debug_log, "plus ");
8009 if (r->reganch & ROPT_IMPLICIT)
8010 PerlIO_printf(Perl_debug_log, "implicit ");
8011 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
8012 if (r->reganch & ROPT_EVAL_SEEN)
8013 PerlIO_printf(Perl_debug_log, "with eval ");
8014 PerlIO_printf(Perl_debug_log, "\n");
8016 PERL_UNUSED_CONTEXT;
8018 #endif /* DEBUGGING */
8022 - regprop - printable representation of opcode
8025 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
8030 GET_RE_DEBUG_FLAGS_DECL;
8032 sv_setpvn(sv, "", 0);
8033 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
8034 /* It would be nice to FAIL() here, but this may be called from
8035 regexec.c, and it would be hard to supply pRExC_state. */
8036 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
8037 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
8039 k = PL_regkind[OP(o)];
8042 SV * const dsv = sv_2mortal(newSVpvs(""));
8043 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
8044 * is a crude hack but it may be the best for now since
8045 * we have no flag "this EXACTish node was UTF-8"
8047 const char * const s =
8048 pv_pretty(dsv, STRING(o), STR_LEN(o), 60,
8049 PL_colors[0], PL_colors[1],
8050 PERL_PV_ESCAPE_UNI_DETECT |
8051 PERL_PV_PRETTY_ELIPSES |
8054 Perl_sv_catpvf(aTHX_ sv, " %s", s );
8055 } else if (k == TRIE) {
8056 /* print the details of the trie in dumpuntil instead, as
8057 * prog->data isn't available here */
8058 const char op = OP(o);
8059 const I32 n = ARG(o);
8060 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
8061 (reg_ac_data *)prog->data->data[n] :
8063 const reg_trie_data * const trie = !IS_TRIE_AC(op) ?
8064 (reg_trie_data*)prog->data->data[n] :
8067 Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]);
8068 DEBUG_TRIE_COMPILE_r(
8069 Perl_sv_catpvf(aTHX_ sv,
8070 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
8071 (UV)trie->startstate,
8072 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
8073 (UV)trie->wordcount,
8076 (UV)TRIE_CHARCOUNT(trie),
8077 (UV)trie->uniquecharcount
8080 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
8082 int rangestart = -1;
8083 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
8084 Perl_sv_catpvf(aTHX_ sv, "[");
8085 for (i = 0; i <= 256; i++) {
8086 if (i < 256 && BITMAP_TEST(bitmap,i)) {
8087 if (rangestart == -1)
8089 } else if (rangestart != -1) {
8090 if (i <= rangestart + 3)
8091 for (; rangestart < i; rangestart++)
8092 put_byte(sv, rangestart);
8094 put_byte(sv, rangestart);
8096 put_byte(sv, i - 1);
8101 Perl_sv_catpvf(aTHX_ sv, "]");
8104 } else if (k == CURLY) {
8105 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
8106 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
8107 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
8109 else if (k == WHILEM && o->flags) /* Ordinal/of */
8110 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
8111 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT)
8112 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
8113 else if (k == GOSUB)
8114 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
8115 else if (k == VERB) {
8117 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
8118 (SV*)prog->data->data[ ARG( o ) ]);
8119 } else if (k == LOGICAL)
8120 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
8121 else if (k == ANYOF) {
8122 int i, rangestart = -1;
8123 const U8 flags = ANYOF_FLAGS(o);
8125 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
8126 static const char * const anyofs[] = {
8159 if (flags & ANYOF_LOCALE)
8160 sv_catpvs(sv, "{loc}");
8161 if (flags & ANYOF_FOLD)
8162 sv_catpvs(sv, "{i}");
8163 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
8164 if (flags & ANYOF_INVERT)
8166 for (i = 0; i <= 256; i++) {
8167 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
8168 if (rangestart == -1)
8170 } else if (rangestart != -1) {
8171 if (i <= rangestart + 3)
8172 for (; rangestart < i; rangestart++)
8173 put_byte(sv, rangestart);
8175 put_byte(sv, rangestart);
8177 put_byte(sv, i - 1);
8183 if (o->flags & ANYOF_CLASS)
8184 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
8185 if (ANYOF_CLASS_TEST(o,i))
8186 sv_catpv(sv, anyofs[i]);
8188 if (flags & ANYOF_UNICODE)
8189 sv_catpvs(sv, "{unicode}");
8190 else if (flags & ANYOF_UNICODE_ALL)
8191 sv_catpvs(sv, "{unicode_all}");
8195 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
8199 U8 s[UTF8_MAXBYTES_CASE+1];
8201 for (i = 0; i <= 256; i++) { /* just the first 256 */
8202 uvchr_to_utf8(s, i);
8204 if (i < 256 && swash_fetch(sw, s, TRUE)) {
8205 if (rangestart == -1)
8207 } else if (rangestart != -1) {
8208 if (i <= rangestart + 3)
8209 for (; rangestart < i; rangestart++) {
8210 const U8 * const e = uvchr_to_utf8(s,rangestart);
8212 for(p = s; p < e; p++)
8216 const U8 *e = uvchr_to_utf8(s,rangestart);
8218 for (p = s; p < e; p++)
8221 e = uvchr_to_utf8(s, i-1);
8222 for (p = s; p < e; p++)
8229 sv_catpvs(sv, "..."); /* et cetera */
8233 char *s = savesvpv(lv);
8234 char * const origs = s;
8236 while (*s && *s != '\n')
8240 const char * const t = ++s;
8258 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
8260 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
8261 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
8263 PERL_UNUSED_CONTEXT;
8264 PERL_UNUSED_ARG(sv);
8266 PERL_UNUSED_ARG(prog);
8267 #endif /* DEBUGGING */
8271 Perl_re_intuit_string(pTHX_ regexp *prog)
8272 { /* Assume that RE_INTUIT is set */
8274 GET_RE_DEBUG_FLAGS_DECL;
8275 PERL_UNUSED_CONTEXT;
8279 const char * const s = SvPV_nolen_const(prog->check_substr
8280 ? prog->check_substr : prog->check_utf8);
8282 if (!PL_colorset) reginitcolors();
8283 PerlIO_printf(Perl_debug_log,
8284 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
8286 prog->check_substr ? "" : "utf8 ",
8287 PL_colors[5],PL_colors[0],
8290 (strlen(s) > 60 ? "..." : ""));
8293 return prog->check_substr ? prog->check_substr : prog->check_utf8;
8297 pregfree - free a regexp
8299 See regdupe below if you change anything here.
8303 Perl_pregfree(pTHX_ struct regexp *r)
8307 GET_RE_DEBUG_FLAGS_DECL;
8309 if (!r || (--r->refcnt > 0))
8315 SV *dsv= sv_newmortal();
8316 RE_PV_QUOTED_DECL(s, (r->reganch & ROPT_UTF8),
8317 dsv, r->precomp, r->prelen, 60);
8318 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
8319 PL_colors[4],PL_colors[5],s);
8323 /* gcov results gave these as non-null 100% of the time, so there's no
8324 optimisation in checking them before calling Safefree */
8325 Safefree(r->precomp);
8326 Safefree(r->offsets); /* 20010421 MJD */
8327 RX_MATCH_COPY_FREE(r);
8328 #ifdef PERL_OLD_COPY_ON_WRITE
8330 SvREFCNT_dec(r->saved_copy);
8333 if (r->anchored_substr)
8334 SvREFCNT_dec(r->anchored_substr);
8335 if (r->anchored_utf8)
8336 SvREFCNT_dec(r->anchored_utf8);
8337 if (r->float_substr)
8338 SvREFCNT_dec(r->float_substr);
8340 SvREFCNT_dec(r->float_utf8);
8341 Safefree(r->substrs);
8344 SvREFCNT_dec(r->paren_names);
8346 int n = r->data->count;
8347 PAD* new_comppad = NULL;
8352 /* If you add a ->what type here, update the comment in regcomp.h */
8353 switch (r->data->what[n]) {
8356 SvREFCNT_dec((SV*)r->data->data[n]);
8359 Safefree(r->data->data[n]);
8362 new_comppad = (AV*)r->data->data[n];
8365 if (new_comppad == NULL)
8366 Perl_croak(aTHX_ "panic: pregfree comppad");
8367 PAD_SAVE_LOCAL(old_comppad,
8368 /* Watch out for global destruction's random ordering. */
8369 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
8372 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
8375 op_free((OP_4tree*)r->data->data[n]);
8377 PAD_RESTORE_LOCAL(old_comppad);
8378 SvREFCNT_dec((SV*)new_comppad);
8384 { /* Aho Corasick add-on structure for a trie node.
8385 Used in stclass optimization only */
8387 reg_ac_data *aho=(reg_ac_data*)r->data->data[n];
8389 refcount = --aho->refcount;
8392 Safefree(aho->states);
8393 Safefree(aho->fail);
8394 aho->trie=NULL; /* not necessary to free this as it is
8395 handled by the 't' case */
8396 Safefree(r->data->data[n]); /* do this last!!!! */
8397 Safefree(r->regstclass);
8403 /* trie structure. */
8405 reg_trie_data *trie=(reg_trie_data*)r->data->data[n];
8407 refcount = --trie->refcount;
8410 Safefree(trie->charmap);
8411 if (trie->widecharmap)
8412 SvREFCNT_dec((SV*)trie->widecharmap);
8413 Safefree(trie->states);
8414 Safefree(trie->trans);
8416 Safefree(trie->bitmap);
8418 Safefree(trie->wordlen);
8420 Safefree(trie->jump);
8422 Safefree(trie->nextword);
8426 SvREFCNT_dec((SV*)trie->words);
8427 if (trie->revcharmap)
8428 SvREFCNT_dec((SV*)trie->revcharmap);
8431 Safefree(r->data->data[n]); /* do this last!!!! */
8436 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
8439 Safefree(r->data->what);
8442 Safefree(r->startp);
8447 #define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t))
8448 #define av_dup_inc(s,t) (AV*)SvREFCNT_inc(sv_dup((SV*)s,t))
8449 #define hv_dup_inc(s,t) (HV*)SvREFCNT_inc(sv_dup((SV*)s,t))
8450 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
8453 regdupe - duplicate a regexp.
8455 This routine is called by sv.c's re_dup and is expected to clone a
8456 given regexp structure. It is a no-op when not under USE_ITHREADS.
8457 (Originally this *was* re_dup() for change history see sv.c)
8459 See pregfree() above if you change anything here.
8461 #if defined(USE_ITHREADS)
8463 Perl_regdupe(pTHX_ const regexp *r, CLONE_PARAMS *param)
8468 struct reg_substr_datum *s;
8471 return (REGEXP *)NULL;
8473 if ((ret = (REGEXP *)ptr_table_fetch(PL_ptr_table, r)))
8476 len = r->offsets[0];
8477 npar = r->nparens+1;
8479 Newxc(ret, sizeof(regexp) + (len+1)*sizeof(regnode), char, regexp);
8480 Copy(r->program, ret->program, len+1, regnode);
8482 Newx(ret->startp, npar, I32);
8483 Copy(r->startp, ret->startp, npar, I32);
8484 Newx(ret->endp, npar, I32);
8485 Copy(r->startp, ret->startp, npar, I32);
8487 Newx(ret->substrs, 1, struct reg_substr_data);
8488 for (s = ret->substrs->data, i = 0; i < 3; i++, s++) {
8489 s->min_offset = r->substrs->data[i].min_offset;
8490 s->max_offset = r->substrs->data[i].max_offset;
8491 s->end_shift = r->substrs->data[i].end_shift;
8492 s->substr = sv_dup_inc(r->substrs->data[i].substr, param);
8493 s->utf8_substr = sv_dup_inc(r->substrs->data[i].utf8_substr, param);
8496 ret->regstclass = NULL;
8499 const int count = r->data->count;
8502 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
8503 char, struct reg_data);
8504 Newx(d->what, count, U8);
8507 for (i = 0; i < count; i++) {
8508 d->what[i] = r->data->what[i];
8509 switch (d->what[i]) {
8510 /* legal options are one of: sSfpont
8511 see also regcomp.h and pregfree() */
8514 d->data[i] = sv_dup_inc((SV *)r->data->data[i], param);
8517 d->data[i] = av_dup_inc((AV *)r->data->data[i], param);
8520 /* This is cheating. */
8521 Newx(d->data[i], 1, struct regnode_charclass_class);
8522 StructCopy(r->data->data[i], d->data[i],
8523 struct regnode_charclass_class);
8524 ret->regstclass = (regnode*)d->data[i];
8527 /* Compiled op trees are readonly, and can thus be
8528 shared without duplication. */
8530 d->data[i] = (void*)OpREFCNT_inc((OP*)r->data->data[i]);
8534 d->data[i] = r->data->data[i];
8537 d->data[i] = r->data->data[i];
8539 ((reg_trie_data*)d->data[i])->refcount++;
8543 d->data[i] = r->data->data[i];
8545 ((reg_ac_data*)d->data[i])->refcount++;
8547 /* Trie stclasses are readonly and can thus be shared
8548 * without duplication. We free the stclass in pregfree
8549 * when the corresponding reg_ac_data struct is freed.
8551 ret->regstclass= r->regstclass;
8554 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", r->data->what[i]);
8563 Newx(ret->offsets, 2*len+1, U32);
8564 Copy(r->offsets, ret->offsets, 2*len+1, U32);
8566 ret->precomp = SAVEPVN(r->precomp, r->prelen);
8567 ret->refcnt = r->refcnt;
8568 ret->minlen = r->minlen;
8569 ret->minlenret = r->minlenret;
8570 ret->prelen = r->prelen;
8571 ret->nparens = r->nparens;
8572 ret->lastparen = r->lastparen;
8573 ret->lastcloseparen = r->lastcloseparen;
8574 ret->reganch = r->reganch;
8576 ret->sublen = r->sublen;
8578 ret->engine = r->engine;
8580 ret->paren_names = hv_dup_inc(r->paren_names, param);
8582 if (RX_MATCH_COPIED(ret))
8583 ret->subbeg = SAVEPVN(r->subbeg, r->sublen);
8586 #ifdef PERL_OLD_COPY_ON_WRITE
8587 ret->saved_copy = NULL;
8590 ptr_table_store(PL_ptr_table, r, ret);
8598 converts a regexp embedded in a MAGIC struct to its stringified form,
8599 caching the converted form in the struct and returns the cached
8602 If lp is nonnull then it is used to return the length of the
8605 If flags is nonnull and the returned string contains UTF8 then
8606 (flags & 1) will be true.
8608 If haseval is nonnull then it is used to return whether the pattern
8611 Normally called via macro:
8613 CALLREG_STRINGIFY(mg,0,0);
8617 CALLREG_AS_STR(mg,lp,flags,haseval)
8619 See sv_2pv_flags() in sv.c for an example of internal usage.
8624 Perl_reg_stringify(pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval ) {
8626 const regexp * const re = (regexp *)mg->mg_obj;
8629 const char *fptr = "msix";
8634 bool need_newline = 0;
8635 U16 reganch = (U16)((re->reganch & PMf_COMPILETIME) >> 12);
8637 while((ch = *fptr++)) {
8639 reflags[left++] = ch;
8642 reflags[right--] = ch;
8647 reflags[left] = '-';
8651 mg->mg_len = re->prelen + 4 + left;
8653 * If /x was used, we have to worry about a regex ending with a
8654 * comment later being embedded within another regex. If so, we don't
8655 * want this regex's "commentization" to leak out to the right part of
8656 * the enclosing regex, we must cap it with a newline.
8658 * So, if /x was used, we scan backwards from the end of the regex. If
8659 * we find a '#' before we find a newline, we need to add a newline
8660 * ourself. If we find a '\n' first (or if we don't find '#' or '\n'),
8661 * we don't need to add anything. -jfriedl
8663 if (PMf_EXTENDED & re->reganch) {
8664 const char *endptr = re->precomp + re->prelen;
8665 while (endptr >= re->precomp) {
8666 const char c = *(endptr--);
8668 break; /* don't need another */
8670 /* we end while in a comment, so we need a newline */
8671 mg->mg_len++; /* save space for it */
8672 need_newline = 1; /* note to add it */
8678 Newx(mg->mg_ptr, mg->mg_len + 1 + left, char);
8679 mg->mg_ptr[0] = '(';
8680 mg->mg_ptr[1] = '?';
8681 Copy(reflags, mg->mg_ptr+2, left, char);
8682 *(mg->mg_ptr+left+2) = ':';
8683 Copy(re->precomp, mg->mg_ptr+3+left, re->prelen, char);
8685 mg->mg_ptr[mg->mg_len - 2] = '\n';
8686 mg->mg_ptr[mg->mg_len - 1] = ')';
8687 mg->mg_ptr[mg->mg_len] = 0;
8690 *haseval = re->program[0].next_off;
8692 *flags = ((re->reganch & ROPT_UTF8) ? 1 : 0);
8700 #ifndef PERL_IN_XSUB_RE
8702 - regnext - dig the "next" pointer out of a node
8705 Perl_regnext(pTHX_ register regnode *p)
8708 register I32 offset;
8710 if (p == &PL_regdummy)
8713 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
8722 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
8725 STRLEN l1 = strlen(pat1);
8726 STRLEN l2 = strlen(pat2);
8729 const char *message;
8735 Copy(pat1, buf, l1 , char);
8736 Copy(pat2, buf + l1, l2 , char);
8737 buf[l1 + l2] = '\n';
8738 buf[l1 + l2 + 1] = '\0';
8740 /* ANSI variant takes additional second argument */
8741 va_start(args, pat2);
8745 msv = vmess(buf, &args);
8747 message = SvPV_const(msv,l1);
8750 Copy(message, buf, l1 , char);
8751 buf[l1-1] = '\0'; /* Overwrite \n */
8752 Perl_croak(aTHX_ "%s", buf);
8755 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
8757 #ifndef PERL_IN_XSUB_RE
8759 Perl_save_re_context(pTHX)
8763 struct re_save_state *state;
8765 SAVEVPTR(PL_curcop);
8766 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
8768 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
8769 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
8770 SSPUSHINT(SAVEt_RE_STATE);
8772 Copy(&PL_reg_state, state, 1, struct re_save_state);
8774 PL_reg_start_tmp = 0;
8775 PL_reg_start_tmpl = 0;
8776 PL_reg_oldsaved = NULL;
8777 PL_reg_oldsavedlen = 0;
8779 PL_reg_leftiter = 0;
8780 PL_reg_poscache = NULL;
8781 PL_reg_poscache_size = 0;
8782 #ifdef PERL_OLD_COPY_ON_WRITE
8786 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
8788 const REGEXP * const rx = PM_GETRE(PL_curpm);
8791 for (i = 1; i <= rx->nparens; i++) {
8792 char digits[TYPE_CHARS(long)];
8793 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
8794 GV *const *const gvp
8795 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
8798 GV * const gv = *gvp;
8799 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
8809 clear_re(pTHX_ void *r)
8812 ReREFCNT_dec((regexp *)r);
8818 S_put_byte(pTHX_ SV *sv, int c)
8820 if (isCNTRL(c) || c == 255 || !isPRINT(c))
8821 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
8822 else if (c == '-' || c == ']' || c == '\\' || c == '^')
8823 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
8825 Perl_sv_catpvf(aTHX_ sv, "%c", c);
8829 #define CLEAR_OPTSTART \
8830 if (optstart) STMT_START { \
8831 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%d nodes)\n", node - optstart)); \
8835 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
8837 STATIC const regnode *
8838 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
8839 const regnode *last, const regnode *plast,
8840 SV* sv, I32 indent, U32 depth)
8843 register U8 op = PSEUDO; /* Arbitrary non-END op. */
8844 register const regnode *next;
8845 const regnode *optstart= NULL;
8846 GET_RE_DEBUG_FLAGS_DECL;
8848 #ifdef DEBUG_DUMPUNTIL
8849 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
8850 last ? last-start : 0,plast ? plast-start : 0);
8853 if (plast && plast < last)
8856 while (PL_regkind[op] != END && (!last || node < last)) {
8857 /* While that wasn't END last time... */
8863 next = regnext((regnode *)node);
8866 if (OP(node) == OPTIMIZED) {
8867 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
8874 regprop(r, sv, node);
8875 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
8876 (int)(2*indent + 1), "", SvPVX_const(sv));
8878 if (OP(node) != OPTIMIZED) {
8879 if (next == NULL) /* Next ptr. */
8880 PerlIO_printf(Perl_debug_log, "(0)");
8881 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
8882 PerlIO_printf(Perl_debug_log, "(FAIL)");
8884 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
8886 /*if (PL_regkind[(U8)op] != TRIE)*/
8887 (void)PerlIO_putc(Perl_debug_log, '\n');
8891 if (PL_regkind[(U8)op] == BRANCHJ) {
8894 register const regnode *nnode = (OP(next) == LONGJMP
8895 ? regnext((regnode *)next)
8897 if (last && nnode > last)
8899 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
8902 else if (PL_regkind[(U8)op] == BRANCH) {
8904 DUMPUNTIL(NEXTOPER(node), next);
8906 else if ( PL_regkind[(U8)op] == TRIE ) {
8907 const regnode *this_trie = node;
8908 const char op = OP(node);
8909 const I32 n = ARG(node);
8910 const reg_ac_data * const ac = op>=AHOCORASICK ?
8911 (reg_ac_data *)r->data->data[n] :
8913 const reg_trie_data * const trie = op<AHOCORASICK ?
8914 (reg_trie_data*)r->data->data[n] :
8916 const regnode *nextbranch= NULL;
8918 sv_setpvn(sv, "", 0);
8919 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
8920 SV ** const elem_ptr = av_fetch(trie->words,word_idx,0);
8922 PerlIO_printf(Perl_debug_log, "%*s%s ",
8923 (int)(2*(indent+3)), "",
8924 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
8925 PL_colors[0], PL_colors[1],
8926 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
8927 PERL_PV_PRETTY_ELIPSES |
8933 U16 dist= trie->jump[word_idx+1];
8934 PerlIO_printf(Perl_debug_log, "(%u)\n",
8935 (dist ? this_trie + dist : next) - start);
8938 nextbranch= this_trie + trie->jump[0];
8939 DUMPUNTIL(this_trie + dist, nextbranch);
8941 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
8942 nextbranch= regnext((regnode *)nextbranch);
8944 PerlIO_printf(Perl_debug_log, "\n");
8947 if (last && next > last)
8952 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
8953 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
8954 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
8956 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
8958 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
8960 else if ( op == PLUS || op == STAR) {
8961 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
8963 else if (op == ANYOF) {
8964 /* arglen 1 + class block */
8965 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
8966 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
8967 node = NEXTOPER(node);
8969 else if (PL_regkind[(U8)op] == EXACT) {
8970 /* Literal string, where present. */
8971 node += NODE_SZ_STR(node) - 1;
8972 node = NEXTOPER(node);
8975 node = NEXTOPER(node);
8976 node += regarglen[(U8)op];
8978 if (op == CURLYX || op == OPEN)
8980 else if (op == WHILEM)
8984 #ifdef DEBUG_DUMPUNTIL
8985 PerlIO_printf(Perl_debug_log, "--- %d\n",indent);
8990 #endif /* DEBUGGING */
8994 * c-indentation-style: bsd
8996 * indent-tabs-mode: t
8999 * ex: set ts=8 sts=4 sw=4 noet: