5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
8 /* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
37 * pregcomp and pregexec -- regsub and regerror are not used in perl
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
57 **** Alterations to Henry's code are...
59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
71 #define PERL_IN_REGCOMP_C
74 #ifndef PERL_IN_XSUB_RE
79 #ifdef PERL_IN_XSUB_RE
90 # if defined(BUGGY_MSC6)
91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
92 # pragma optimize("a",off)
93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
94 # pragma optimize("w",on )
95 # endif /* BUGGY_MSC6 */
102 typedef struct RExC_state_t {
103 U32 flags; /* are we folding, multilining? */
104 char *precomp; /* uncompiled string. */
106 char *start; /* Start of input for compile */
107 char *end; /* End of input for compile */
108 char *parse; /* Input-scan pointer. */
109 I32 whilem_seen; /* number of WHILEM in this expr */
110 regnode *emit_start; /* Start of emitted-code area */
111 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
112 I32 naughty; /* How bad is this pattern? */
113 I32 sawback; /* Did we see \1, ...? */
115 I32 size; /* Code size. */
116 I32 npar; /* Capture buffer count, (OPEN). */
117 I32 cpar; /* Capture buffer count, (CLOSE). */
118 I32 nestroot; /* root parens we are in - used by accept */
122 regnode **open_parens; /* pointers to open parens */
123 regnode **close_parens; /* pointers to close parens */
124 regnode *opend; /* END node in program */
126 HV *charnames; /* cache of named sequences */
127 HV *paren_names; /* Paren names */
128 regnode **recurse; /* Recurse regops */
129 I32 recurse_count; /* Number of recurse regops */
131 char *starttry; /* -Dr: where regtry was called. */
132 #define RExC_starttry (pRExC_state->starttry)
135 const char *lastparse;
137 #define RExC_lastparse (pRExC_state->lastparse)
138 #define RExC_lastnum (pRExC_state->lastnum)
142 #define RExC_flags (pRExC_state->flags)
143 #define RExC_precomp (pRExC_state->precomp)
144 #define RExC_rx (pRExC_state->rx)
145 #define RExC_start (pRExC_state->start)
146 #define RExC_end (pRExC_state->end)
147 #define RExC_parse (pRExC_state->parse)
148 #define RExC_whilem_seen (pRExC_state->whilem_seen)
149 #define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
150 #define RExC_emit (pRExC_state->emit)
151 #define RExC_emit_start (pRExC_state->emit_start)
152 #define RExC_naughty (pRExC_state->naughty)
153 #define RExC_sawback (pRExC_state->sawback)
154 #define RExC_seen (pRExC_state->seen)
155 #define RExC_size (pRExC_state->size)
156 #define RExC_npar (pRExC_state->npar)
157 #define RExC_cpar (pRExC_state->cpar)
158 #define RExC_nestroot (pRExC_state->nestroot)
159 #define RExC_extralen (pRExC_state->extralen)
160 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
161 #define RExC_seen_evals (pRExC_state->seen_evals)
162 #define RExC_utf8 (pRExC_state->utf8)
163 #define RExC_charnames (pRExC_state->charnames)
164 #define RExC_open_parens (pRExC_state->open_parens)
165 #define RExC_close_parens (pRExC_state->close_parens)
166 #define RExC_opend (pRExC_state->opend)
167 #define RExC_paren_names (pRExC_state->paren_names)
168 #define RExC_recurse (pRExC_state->recurse)
169 #define RExC_recurse_count (pRExC_state->recurse_count)
171 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
172 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
173 ((*s) == '{' && regcurly(s)))
176 #undef SPSTART /* dratted cpp namespace... */
179 * Flags to be passed up and down.
181 #define WORST 0 /* Worst case. */
182 #define HASWIDTH 0x1 /* Known to match non-null strings. */
183 #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
184 #define SPSTART 0x4 /* Starts with * or +. */
185 #define TRYAGAIN 0x8 /* Weeded out a declaration. */
187 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
189 /* whether trie related optimizations are enabled */
190 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
191 #define TRIE_STUDY_OPT
192 #define FULL_TRIE_STUDY
198 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
199 #define PBITVAL(paren) (1 << ((paren) & 7))
200 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
201 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
202 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
205 /* About scan_data_t.
207 During optimisation we recurse through the regexp program performing
208 various inplace (keyhole style) optimisations. In addition study_chunk
209 and scan_commit populate this data structure with information about
210 what strings MUST appear in the pattern. We look for the longest
211 string that must appear for at a fixed location, and we look for the
212 longest string that may appear at a floating location. So for instance
217 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
218 strings (because they follow a .* construct). study_chunk will identify
219 both FOO and BAR as being the longest fixed and floating strings respectively.
221 The strings can be composites, for instance
225 will result in a composite fixed substring 'foo'.
227 For each string some basic information is maintained:
229 - offset or min_offset
230 This is the position the string must appear at, or not before.
231 It also implicitly (when combined with minlenp) tells us how many
232 character must match before the string we are searching.
233 Likewise when combined with minlenp and the length of the string
234 tells us how many characters must appear after the string we have
238 Only used for floating strings. This is the rightmost point that
239 the string can appear at. Ifset to I32 max it indicates that the
240 string can occur infinitely far to the right.
243 A pointer to the minimum length of the pattern that the string
244 was found inside. This is important as in the case of positive
245 lookahead or positive lookbehind we can have multiple patterns
250 The minimum length of the pattern overall is 3, the minimum length
251 of the lookahead part is 3, but the minimum length of the part that
252 will actually match is 1. So 'FOO's minimum length is 3, but the
253 minimum length for the F is 1. This is important as the minimum length
254 is used to determine offsets in front of and behind the string being
255 looked for. Since strings can be composites this is the length of the
256 pattern at the time it was commited with a scan_commit. Note that
257 the length is calculated by study_chunk, so that the minimum lengths
258 are not known until the full pattern has been compiled, thus the
259 pointer to the value.
263 In the case of lookbehind the string being searched for can be
264 offset past the start point of the final matching string.
265 If this value was just blithely removed from the min_offset it would
266 invalidate some of the calculations for how many chars must match
267 before or after (as they are derived from min_offset and minlen and
268 the length of the string being searched for).
269 When the final pattern is compiled and the data is moved from the
270 scan_data_t structure into the regexp structure the information
271 about lookbehind is factored in, with the information that would
272 have been lost precalculated in the end_shift field for the
275 The fields pos_min and pos_delta are used to store the minimum offset
276 and the delta to the maximum offset at the current point in the pattern.
280 typedef struct scan_data_t {
281 /*I32 len_min; unused */
282 /*I32 len_delta; unused */
286 I32 last_end; /* min value, <0 unless valid. */
289 SV **longest; /* Either &l_fixed, or &l_float. */
290 SV *longest_fixed; /* longest fixed string found in pattern */
291 I32 offset_fixed; /* offset where it starts */
292 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
293 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
294 SV *longest_float; /* longest floating string found in pattern */
295 I32 offset_float_min; /* earliest point in string it can appear */
296 I32 offset_float_max; /* latest point in string it can appear */
297 I32 *minlen_float; /* pointer to the minlen relevent to the string */
298 I32 lookbehind_float; /* is the position of the string modified by LB */
302 struct regnode_charclass_class *start_class;
306 * Forward declarations for pregcomp()'s friends.
309 static const scan_data_t zero_scan_data =
310 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
312 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
313 #define SF_BEFORE_SEOL 0x0001
314 #define SF_BEFORE_MEOL 0x0002
315 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
316 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
319 # define SF_FIX_SHIFT_EOL (0+2)
320 # define SF_FL_SHIFT_EOL (0+4)
322 # define SF_FIX_SHIFT_EOL (+2)
323 # define SF_FL_SHIFT_EOL (+4)
326 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
327 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
329 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
330 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
331 #define SF_IS_INF 0x0040
332 #define SF_HAS_PAR 0x0080
333 #define SF_IN_PAR 0x0100
334 #define SF_HAS_EVAL 0x0200
335 #define SCF_DO_SUBSTR 0x0400
336 #define SCF_DO_STCLASS_AND 0x0800
337 #define SCF_DO_STCLASS_OR 0x1000
338 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
339 #define SCF_WHILEM_VISITED_POS 0x2000
341 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
342 #define SCF_SEEN_ACCEPT 0x8000
344 #define UTF (RExC_utf8 != 0)
345 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
346 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
348 #define OOB_UNICODE 12345678
349 #define OOB_NAMEDCLASS -1
351 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
352 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
355 /* length of regex to show in messages that don't mark a position within */
356 #define RegexLengthToShowInErrorMessages 127
359 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
360 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
361 * op/pragma/warn/regcomp.
363 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
364 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
366 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
369 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
370 * arg. Show regex, up to a maximum length. If it's too long, chop and add
373 #define _FAIL(code) STMT_START { \
374 const char *ellipses = ""; \
375 IV len = RExC_end - RExC_precomp; \
378 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
379 if (len > RegexLengthToShowInErrorMessages) { \
380 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
381 len = RegexLengthToShowInErrorMessages - 10; \
387 #define FAIL(msg) _FAIL( \
388 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
389 msg, (int)len, RExC_precomp, ellipses))
391 #define FAIL2(msg,arg) _FAIL( \
392 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
393 arg, (int)len, RExC_precomp, ellipses))
396 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
398 #define Simple_vFAIL(m) STMT_START { \
399 const IV offset = RExC_parse - RExC_precomp; \
400 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
401 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
405 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
407 #define vFAIL(m) STMT_START { \
409 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
414 * Like Simple_vFAIL(), but accepts two arguments.
416 #define Simple_vFAIL2(m,a1) STMT_START { \
417 const IV offset = RExC_parse - RExC_precomp; \
418 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
419 (int)offset, RExC_precomp, RExC_precomp + offset); \
423 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
425 #define vFAIL2(m,a1) STMT_START { \
427 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
428 Simple_vFAIL2(m, a1); \
433 * Like Simple_vFAIL(), but accepts three arguments.
435 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
436 const IV offset = RExC_parse - RExC_precomp; \
437 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
438 (int)offset, RExC_precomp, RExC_precomp + offset); \
442 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
444 #define vFAIL3(m,a1,a2) STMT_START { \
446 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
447 Simple_vFAIL3(m, a1, a2); \
451 * Like Simple_vFAIL(), but accepts four arguments.
453 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
454 const IV offset = RExC_parse - RExC_precomp; \
455 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
456 (int)offset, RExC_precomp, RExC_precomp + offset); \
459 #define vWARN(loc,m) STMT_START { \
460 const IV offset = loc - RExC_precomp; \
461 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
462 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
465 #define vWARNdep(loc,m) STMT_START { \
466 const IV offset = loc - RExC_precomp; \
467 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
468 "%s" REPORT_LOCATION, \
469 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
473 #define vWARN2(loc, m, a1) STMT_START { \
474 const IV offset = loc - RExC_precomp; \
475 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
476 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
479 #define vWARN3(loc, m, a1, a2) STMT_START { \
480 const IV offset = loc - RExC_precomp; \
481 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
482 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
485 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
486 const IV offset = loc - RExC_precomp; \
487 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
488 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
491 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
492 const IV offset = loc - RExC_precomp; \
493 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
494 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
498 /* Allow for side effects in s */
499 #define REGC(c,s) STMT_START { \
500 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
503 /* Macros for recording node offsets. 20001227 mjd@plover.com
504 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
505 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
506 * Element 0 holds the number n.
507 * Position is 1 indexed.
510 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
512 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
513 __LINE__, (int)(node), (int)(byte))); \
515 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
517 RExC_offsets[2*(node)-1] = (byte); \
522 #define Set_Node_Offset(node,byte) \
523 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
524 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
526 #define Set_Node_Length_To_R(node,len) STMT_START { \
528 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
529 __LINE__, (int)(node), (int)(len))); \
531 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
533 RExC_offsets[2*(node)] = (len); \
538 #define Set_Node_Length(node,len) \
539 Set_Node_Length_To_R((node)-RExC_emit_start, len)
540 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
541 #define Set_Node_Cur_Length(node) \
542 Set_Node_Length(node, RExC_parse - parse_start)
544 /* Get offsets and lengths */
545 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
546 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
548 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
549 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
550 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
554 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
555 #define EXPERIMENTAL_INPLACESCAN
558 #define DEBUG_STUDYDATA(data,depth) \
559 DEBUG_OPTIMISE_MORE_r(if(data){ \
560 PerlIO_printf(Perl_debug_log, \
561 "%*s"/* Len:%"IVdf"/%"IVdf" */"Pos:%"IVdf"/%"IVdf \
562 " Flags: %"IVdf" Whilem_c: %"IVdf" Lcp: %"IVdf" ", \
563 (int)(depth)*2, "", \
564 (IV)((data)->pos_min), \
565 (IV)((data)->pos_delta), \
566 (IV)((data)->flags), \
567 (IV)((data)->whilem_c), \
568 (IV)((data)->last_closep ? *((data)->last_closep) : -1) \
570 if ((data)->last_found) \
571 PerlIO_printf(Perl_debug_log, \
572 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
573 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
574 SvPVX_const((data)->last_found), \
575 (IV)((data)->last_end), \
576 (IV)((data)->last_start_min), \
577 (IV)((data)->last_start_max), \
578 ((data)->longest && \
579 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
580 SvPVX_const((data)->longest_fixed), \
581 (IV)((data)->offset_fixed), \
582 ((data)->longest && \
583 (data)->longest==&((data)->longest_float)) ? "*" : "", \
584 SvPVX_const((data)->longest_float), \
585 (IV)((data)->offset_float_min), \
586 (IV)((data)->offset_float_max) \
588 PerlIO_printf(Perl_debug_log,"\n"); \
591 static void clear_re(pTHX_ void *r);
593 /* Mark that we cannot extend a found fixed substring at this point.
594 Update the longest found anchored substring and the longest found
595 floating substrings if needed. */
598 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp)
600 const STRLEN l = CHR_SVLEN(data->last_found);
601 const STRLEN old_l = CHR_SVLEN(*data->longest);
602 GET_RE_DEBUG_FLAGS_DECL;
604 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
605 SvSetMagicSV(*data->longest, data->last_found);
606 if (*data->longest == data->longest_fixed) {
607 data->offset_fixed = l ? data->last_start_min : data->pos_min;
608 if (data->flags & SF_BEFORE_EOL)
610 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
612 data->flags &= ~SF_FIX_BEFORE_EOL;
613 data->minlen_fixed=minlenp;
614 data->lookbehind_fixed=0;
617 data->offset_float_min = l ? data->last_start_min : data->pos_min;
618 data->offset_float_max = (l
619 ? data->last_start_max
620 : data->pos_min + data->pos_delta);
621 if ((U32)data->offset_float_max > (U32)I32_MAX)
622 data->offset_float_max = I32_MAX;
623 if (data->flags & SF_BEFORE_EOL)
625 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
627 data->flags &= ~SF_FL_BEFORE_EOL;
628 data->minlen_float=minlenp;
629 data->lookbehind_float=0;
632 SvCUR_set(data->last_found, 0);
634 SV * const sv = data->last_found;
635 if (SvUTF8(sv) && SvMAGICAL(sv)) {
636 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
642 data->flags &= ~SF_BEFORE_EOL;
643 DEBUG_STUDYDATA(data,0);
646 /* Can match anything (initialization) */
648 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
650 ANYOF_CLASS_ZERO(cl);
651 ANYOF_BITMAP_SETALL(cl);
652 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
654 cl->flags |= ANYOF_LOCALE;
657 /* Can match anything (initialization) */
659 S_cl_is_anything(const struct regnode_charclass_class *cl)
663 for (value = 0; value <= ANYOF_MAX; value += 2)
664 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
666 if (!(cl->flags & ANYOF_UNICODE_ALL))
668 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
673 /* Can match anything (initialization) */
675 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
677 Zero(cl, 1, struct regnode_charclass_class);
679 cl_anything(pRExC_state, cl);
683 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
685 Zero(cl, 1, struct regnode_charclass_class);
687 cl_anything(pRExC_state, cl);
689 cl->flags |= ANYOF_LOCALE;
692 /* 'And' a given class with another one. Can create false positives */
693 /* We assume that cl is not inverted */
695 S_cl_and(struct regnode_charclass_class *cl,
696 const struct regnode_charclass_class *and_with)
699 assert(and_with->type == ANYOF);
700 if (!(and_with->flags & ANYOF_CLASS)
701 && !(cl->flags & ANYOF_CLASS)
702 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
703 && !(and_with->flags & ANYOF_FOLD)
704 && !(cl->flags & ANYOF_FOLD)) {
707 if (and_with->flags & ANYOF_INVERT)
708 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
709 cl->bitmap[i] &= ~and_with->bitmap[i];
711 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
712 cl->bitmap[i] &= and_with->bitmap[i];
713 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
714 if (!(and_with->flags & ANYOF_EOS))
715 cl->flags &= ~ANYOF_EOS;
717 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
718 !(and_with->flags & ANYOF_INVERT)) {
719 cl->flags &= ~ANYOF_UNICODE_ALL;
720 cl->flags |= ANYOF_UNICODE;
721 ARG_SET(cl, ARG(and_with));
723 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
724 !(and_with->flags & ANYOF_INVERT))
725 cl->flags &= ~ANYOF_UNICODE_ALL;
726 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
727 !(and_with->flags & ANYOF_INVERT))
728 cl->flags &= ~ANYOF_UNICODE;
731 /* 'OR' a given class with another one. Can create false positives */
732 /* We assume that cl is not inverted */
734 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
736 if (or_with->flags & ANYOF_INVERT) {
738 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
739 * <= (B1 | !B2) | (CL1 | !CL2)
740 * which is wasteful if CL2 is small, but we ignore CL2:
741 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
742 * XXXX Can we handle case-fold? Unclear:
743 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
744 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
746 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
747 && !(or_with->flags & ANYOF_FOLD)
748 && !(cl->flags & ANYOF_FOLD) ) {
751 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
752 cl->bitmap[i] |= ~or_with->bitmap[i];
753 } /* XXXX: logic is complicated otherwise */
755 cl_anything(pRExC_state, cl);
758 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
759 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
760 && (!(or_with->flags & ANYOF_FOLD)
761 || (cl->flags & ANYOF_FOLD)) ) {
764 /* OR char bitmap and class bitmap separately */
765 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
766 cl->bitmap[i] |= or_with->bitmap[i];
767 if (or_with->flags & ANYOF_CLASS) {
768 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
769 cl->classflags[i] |= or_with->classflags[i];
770 cl->flags |= ANYOF_CLASS;
773 else { /* XXXX: logic is complicated, leave it along for a moment. */
774 cl_anything(pRExC_state, cl);
777 if (or_with->flags & ANYOF_EOS)
778 cl->flags |= ANYOF_EOS;
780 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
781 ARG(cl) != ARG(or_with)) {
782 cl->flags |= ANYOF_UNICODE_ALL;
783 cl->flags &= ~ANYOF_UNICODE;
785 if (or_with->flags & ANYOF_UNICODE_ALL) {
786 cl->flags |= ANYOF_UNICODE_ALL;
787 cl->flags &= ~ANYOF_UNICODE;
791 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
792 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
793 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
794 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
800 dump_trie_interim_list(trie,next_alloc)
801 dump_trie_interim_table(trie,next_alloc)
803 These routines dump out a trie in a somewhat readable format.
804 The _interim_ variants are used for debugging the interim
805 tables that are used to generate the final compressed
806 representation which is what dump_trie expects.
808 Part of the reason for their existance is to provide a form
809 of documentation as to how the different representations function.
815 Dumps the final compressed table form of the trie to Perl_debug_log.
816 Used for debugging make_trie().
820 S_dump_trie(pTHX_ const struct _reg_trie_data *trie,U32 depth)
823 SV *sv=sv_newmortal();
824 int colwidth= trie->widecharmap ? 6 : 4;
825 GET_RE_DEBUG_FLAGS_DECL;
828 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
829 (int)depth * 2 + 2,"",
830 "Match","Base","Ofs" );
832 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
833 SV ** const tmp = av_fetch( trie->revcharmap, state, 0);
835 PerlIO_printf( Perl_debug_log, "%*s",
837 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
838 PL_colors[0], PL_colors[1],
839 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
840 PERL_PV_ESCAPE_FIRSTCHAR
845 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
846 (int)depth * 2 + 2,"");
848 for( state = 0 ; state < trie->uniquecharcount ; state++ )
849 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
850 PerlIO_printf( Perl_debug_log, "\n");
852 for( state = 1 ; state < trie->statecount ; state++ ) {
853 const U32 base = trie->states[ state ].trans.base;
855 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
857 if ( trie->states[ state ].wordnum ) {
858 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
860 PerlIO_printf( Perl_debug_log, "%6s", "" );
863 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
868 while( ( base + ofs < trie->uniquecharcount ) ||
869 ( base + ofs - trie->uniquecharcount < trie->lasttrans
870 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
873 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
875 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
876 if ( ( base + ofs >= trie->uniquecharcount ) &&
877 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
878 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
880 PerlIO_printf( Perl_debug_log, "%*"UVXf,
882 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
884 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
888 PerlIO_printf( Perl_debug_log, "]");
891 PerlIO_printf( Perl_debug_log, "\n" );
895 dump_trie_interim_list(trie,next_alloc)
896 Dumps a fully constructed but uncompressed trie in list form.
897 List tries normally only are used for construction when the number of
898 possible chars (trie->uniquecharcount) is very high.
899 Used for debugging make_trie().
902 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc,U32 depth)
905 SV *sv=sv_newmortal();
906 int colwidth= trie->widecharmap ? 6 : 4;
907 GET_RE_DEBUG_FLAGS_DECL;
908 /* print out the table precompression. */
909 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
910 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
911 "------:-----+-----------------\n" );
913 for( state=1 ; state < next_alloc ; state ++ ) {
916 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
917 (int)depth * 2 + 2,"", (UV)state );
918 if ( ! trie->states[ state ].wordnum ) {
919 PerlIO_printf( Perl_debug_log, "%5s| ","");
921 PerlIO_printf( Perl_debug_log, "W%4x| ",
922 trie->states[ state ].wordnum
925 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
926 SV ** const tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
928 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
930 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
931 PL_colors[0], PL_colors[1],
932 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
933 PERL_PV_ESCAPE_FIRSTCHAR
935 TRIE_LIST_ITEM(state,charid).forid,
936 (UV)TRIE_LIST_ITEM(state,charid).newstate
939 PerlIO_printf(Perl_debug_log, "\n%*s| ",
940 (int)((depth * 2) + 14), "");
943 PerlIO_printf( Perl_debug_log, "\n");
948 dump_trie_interim_table(trie,next_alloc)
949 Dumps a fully constructed but uncompressed trie in table form.
950 This is the normal DFA style state transition table, with a few
951 twists to facilitate compression later.
952 Used for debugging make_trie().
955 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc, U32 depth)
959 SV *sv=sv_newmortal();
960 int colwidth= trie->widecharmap ? 6 : 4;
961 GET_RE_DEBUG_FLAGS_DECL;
964 print out the table precompression so that we can do a visual check
965 that they are identical.
968 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
970 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
971 SV ** const tmp = av_fetch( trie->revcharmap, charid, 0);
973 PerlIO_printf( Perl_debug_log, "%*s",
975 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
976 PL_colors[0], PL_colors[1],
977 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
978 PERL_PV_ESCAPE_FIRSTCHAR
984 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
986 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
987 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
990 PerlIO_printf( Perl_debug_log, "\n" );
992 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
994 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
995 (int)depth * 2 + 2,"",
996 (UV)TRIE_NODENUM( state ) );
998 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
999 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1001 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1003 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1005 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1006 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1008 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1009 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1016 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1017 startbranch: the first branch in the whole branch sequence
1018 first : start branch of sequence of branch-exact nodes.
1019 May be the same as startbranch
1020 last : Thing following the last branch.
1021 May be the same as tail.
1022 tail : item following the branch sequence
1023 count : words in the sequence
1024 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1025 depth : indent depth
1027 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1029 A trie is an N'ary tree where the branches are determined by digital
1030 decomposition of the key. IE, at the root node you look up the 1st character and
1031 follow that branch repeat until you find the end of the branches. Nodes can be
1032 marked as "accepting" meaning they represent a complete word. Eg:
1036 would convert into the following structure. Numbers represent states, letters
1037 following numbers represent valid transitions on the letter from that state, if
1038 the number is in square brackets it represents an accepting state, otherwise it
1039 will be in parenthesis.
1041 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1045 (1) +-i->(6)-+-s->[7]
1047 +-s->(3)-+-h->(4)-+-e->[5]
1049 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1051 This shows that when matching against the string 'hers' we will begin at state 1
1052 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1053 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1054 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1055 single traverse. We store a mapping from accepting to state to which word was
1056 matched, and then when we have multiple possibilities we try to complete the
1057 rest of the regex in the order in which they occured in the alternation.
1059 The only prior NFA like behaviour that would be changed by the TRIE support is
1060 the silent ignoring of duplicate alternations which are of the form:
1062 / (DUPE|DUPE) X? (?{ ... }) Y /x
1064 Thus EVAL blocks follwing a trie may be called a different number of times with
1065 and without the optimisation. With the optimisations dupes will be silently
1066 ignored. This inconsistant behaviour of EVAL type nodes is well established as
1067 the following demonstrates:
1069 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1071 which prints out 'word' three times, but
1073 'words'=~/(word|word|word)(?{ print $1 })S/
1075 which doesnt print it out at all. This is due to other optimisations kicking in.
1077 Example of what happens on a structural level:
1079 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1081 1: CURLYM[1] {1,32767}(18)
1092 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1093 and should turn into:
1095 1: CURLYM[1] {1,32767}(18)
1097 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1105 Cases where tail != last would be like /(?foo|bar)baz/:
1115 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1116 and would end up looking like:
1119 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1126 d = uvuni_to_utf8_flags(d, uv, 0);
1128 is the recommended Unicode-aware way of saying
1133 #define TRIE_STORE_REVCHAR \
1135 SV *tmp = newSVpvs(""); \
1136 if (UTF) SvUTF8_on(tmp); \
1137 Perl_sv_catpvf( aTHX_ tmp, "%c", (int)uvc ); \
1138 av_push( TRIE_REVCHARMAP(trie), tmp ); \
1141 #define TRIE_READ_CHAR STMT_START { \
1145 if ( foldlen > 0 ) { \
1146 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1151 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1152 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1153 foldlen -= UNISKIP( uvc ); \
1154 scan = foldbuf + UNISKIP( uvc ); \
1157 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1167 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1168 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1169 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1170 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1172 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1173 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1174 TRIE_LIST_CUR( state )++; \
1177 #define TRIE_LIST_NEW(state) STMT_START { \
1178 Newxz( trie->states[ state ].trans.list, \
1179 4, reg_trie_trans_le ); \
1180 TRIE_LIST_CUR( state ) = 1; \
1181 TRIE_LIST_LEN( state ) = 4; \
1184 #define TRIE_HANDLE_WORD(state) STMT_START { \
1185 U16 dupe= trie->states[ state ].wordnum; \
1186 regnode * const noper_next = regnext( noper ); \
1188 if (trie->wordlen) \
1189 trie->wordlen[ curword ] = wordlen; \
1191 /* store the word for dumping */ \
1193 if (OP(noper) != NOTHING) \
1194 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
1196 tmp = newSVpvn( "", 0 ); \
1197 if ( UTF ) SvUTF8_on( tmp ); \
1198 av_push( trie->words, tmp ); \
1203 if ( noper_next < tail ) { \
1205 Newxz( trie->jump, word_count + 1, U16); \
1206 trie->jump[curword] = (U16)(noper_next - convert); \
1208 jumper = noper_next; \
1210 nextbranch= regnext(cur); \
1214 /* So it's a dupe. This means we need to maintain a */\
1215 /* linked-list from the first to the next. */\
1216 /* we only allocate the nextword buffer when there */\
1217 /* a dupe, so first time we have to do the allocation */\
1218 if (!trie->nextword) \
1219 Newxz( trie->nextword, word_count + 1, U16); \
1220 while ( trie->nextword[dupe] ) \
1221 dupe= trie->nextword[dupe]; \
1222 trie->nextword[dupe]= curword; \
1224 /* we haven't inserted this word yet. */ \
1225 trie->states[ state ].wordnum = curword; \
1230 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1231 ( ( base + charid >= ucharcount \
1232 && base + charid < ubound \
1233 && state == trie->trans[ base - ucharcount + charid ].check \
1234 && trie->trans[ base - ucharcount + charid ].next ) \
1235 ? trie->trans[ base - ucharcount + charid ].next \
1236 : ( state==1 ? special : 0 ) \
1240 #define MADE_JUMP_TRIE 2
1241 #define MADE_EXACT_TRIE 4
1244 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1247 /* first pass, loop through and scan words */
1248 reg_trie_data *trie;
1250 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1255 regnode *jumper = NULL;
1256 regnode *nextbranch = NULL;
1257 regnode *convert = NULL;
1258 /* we just use folder as a flag in utf8 */
1259 const U8 * const folder = ( flags == EXACTF
1261 : ( flags == EXACTFL
1267 const U32 data_slot = add_data( pRExC_state, 1, "t" );
1268 SV *re_trie_maxbuff;
1270 /* these are only used during construction but are useful during
1271 * debugging so we store them in the struct when debugging.
1273 STRLEN trie_charcount=0;
1274 AV *trie_revcharmap;
1276 GET_RE_DEBUG_FLAGS_DECL;
1278 PERL_UNUSED_ARG(depth);
1281 Newxz( trie, 1, reg_trie_data );
1283 trie->startstate = 1;
1284 trie->wordcount = word_count;
1285 RExC_rx->data->data[ data_slot ] = (void*)trie;
1286 Newxz( trie->charmap, 256, U16 );
1287 if (!(UTF && folder))
1288 Newxz( trie->bitmap, ANYOF_BITMAP_SIZE, char );
1290 trie->words = newAV();
1292 TRIE_REVCHARMAP(trie) = newAV();
1294 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1295 if (!SvIOK(re_trie_maxbuff)) {
1296 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1299 PerlIO_printf( Perl_debug_log,
1300 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1301 (int)depth * 2 + 2, "",
1302 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1303 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1307 /* Find the node we are going to overwrite */
1308 if ( first == startbranch && OP( last ) != BRANCH ) {
1309 /* whole branch chain */
1312 /* branch sub-chain */
1313 convert = NEXTOPER( first );
1316 /* -- First loop and Setup --
1318 We first traverse the branches and scan each word to determine if it
1319 contains widechars, and how many unique chars there are, this is
1320 important as we have to build a table with at least as many columns as we
1323 We use an array of integers to represent the character codes 0..255
1324 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1325 native representation of the character value as the key and IV's for the
1328 *TODO* If we keep track of how many times each character is used we can
1329 remap the columns so that the table compression later on is more
1330 efficient in terms of memory by ensuring most common value is in the
1331 middle and the least common are on the outside. IMO this would be better
1332 than a most to least common mapping as theres a decent chance the most
1333 common letter will share a node with the least common, meaning the node
1334 will not be compressable. With a middle is most common approach the worst
1335 case is when we have the least common nodes twice.
1339 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1340 regnode * const noper = NEXTOPER( cur );
1341 const U8 *uc = (U8*)STRING( noper );
1342 const U8 * const e = uc + STR_LEN( noper );
1344 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1345 const U8 *scan = (U8*)NULL;
1346 U32 wordlen = 0; /* required init */
1349 if (OP(noper) == NOTHING) {
1354 TRIE_BITMAP_SET(trie,*uc);
1355 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1357 for ( ; uc < e ; uc += len ) {
1358 TRIE_CHARCOUNT(trie)++;
1362 if ( !trie->charmap[ uvc ] ) {
1363 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1365 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1370 if ( !trie->widecharmap )
1371 trie->widecharmap = newHV();
1373 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1376 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1378 if ( !SvTRUE( *svpp ) ) {
1379 sv_setiv( *svpp, ++trie->uniquecharcount );
1384 if( cur == first ) {
1387 } else if (chars < trie->minlen) {
1389 } else if (chars > trie->maxlen) {
1393 } /* end first pass */
1394 DEBUG_TRIE_COMPILE_r(
1395 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1396 (int)depth * 2 + 2,"",
1397 ( trie->widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1398 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1399 (int)trie->minlen, (int)trie->maxlen )
1401 Newxz( trie->wordlen, word_count, U32 );
1404 We now know what we are dealing with in terms of unique chars and
1405 string sizes so we can calculate how much memory a naive
1406 representation using a flat table will take. If it's over a reasonable
1407 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1408 conservative but potentially much slower representation using an array
1411 At the end we convert both representations into the same compressed
1412 form that will be used in regexec.c for matching with. The latter
1413 is a form that cannot be used to construct with but has memory
1414 properties similar to the list form and access properties similar
1415 to the table form making it both suitable for fast searches and
1416 small enough that its feasable to store for the duration of a program.
1418 See the comment in the code where the compressed table is produced
1419 inplace from the flat tabe representation for an explanation of how
1420 the compression works.
1425 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1427 Second Pass -- Array Of Lists Representation
1429 Each state will be represented by a list of charid:state records
1430 (reg_trie_trans_le) the first such element holds the CUR and LEN
1431 points of the allocated array. (See defines above).
1433 We build the initial structure using the lists, and then convert
1434 it into the compressed table form which allows faster lookups
1435 (but cant be modified once converted).
1438 STRLEN transcount = 1;
1440 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1441 "%*sCompiling trie using list compiler\n",
1442 (int)depth * 2 + 2, ""));
1444 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
1448 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1450 regnode * const noper = NEXTOPER( cur );
1451 U8 *uc = (U8*)STRING( noper );
1452 const U8 * const e = uc + STR_LEN( noper );
1453 U32 state = 1; /* required init */
1454 U16 charid = 0; /* sanity init */
1455 U8 *scan = (U8*)NULL; /* sanity init */
1456 STRLEN foldlen = 0; /* required init */
1457 U32 wordlen = 0; /* required init */
1458 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1460 if (OP(noper) != NOTHING) {
1461 for ( ; uc < e ; uc += len ) {
1466 charid = trie->charmap[ uvc ];
1468 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1472 charid=(U16)SvIV( *svpp );
1475 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1482 if ( !trie->states[ state ].trans.list ) {
1483 TRIE_LIST_NEW( state );
1485 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1486 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1487 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1492 newstate = next_alloc++;
1493 TRIE_LIST_PUSH( state, charid, newstate );
1498 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1502 TRIE_HANDLE_WORD(state);
1504 } /* end second pass */
1506 /* next alloc is the NEXT state to be allocated */
1507 trie->statecount = next_alloc;
1508 Renew( trie->states, next_alloc, reg_trie_state );
1510 /* and now dump it out before we compress it */
1511 DEBUG_TRIE_COMPILE_MORE_r(
1512 dump_trie_interim_list(trie,next_alloc,depth+1)
1515 Newxz( trie->trans, transcount ,reg_trie_trans );
1522 for( state=1 ; state < next_alloc ; state ++ ) {
1526 DEBUG_TRIE_COMPILE_MORE_r(
1527 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1531 if (trie->states[state].trans.list) {
1532 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1536 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1537 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1538 if ( forid < minid ) {
1540 } else if ( forid > maxid ) {
1544 if ( transcount < tp + maxid - minid + 1) {
1546 Renew( trie->trans, transcount, reg_trie_trans );
1547 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1549 base = trie->uniquecharcount + tp - minid;
1550 if ( maxid == minid ) {
1552 for ( ; zp < tp ; zp++ ) {
1553 if ( ! trie->trans[ zp ].next ) {
1554 base = trie->uniquecharcount + zp - minid;
1555 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1556 trie->trans[ zp ].check = state;
1562 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1563 trie->trans[ tp ].check = state;
1568 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1569 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1570 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1571 trie->trans[ tid ].check = state;
1573 tp += ( maxid - minid + 1 );
1575 Safefree(trie->states[ state ].trans.list);
1578 DEBUG_TRIE_COMPILE_MORE_r(
1579 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1582 trie->states[ state ].trans.base=base;
1584 trie->lasttrans = tp + 1;
1588 Second Pass -- Flat Table Representation.
1590 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1591 We know that we will need Charcount+1 trans at most to store the data
1592 (one row per char at worst case) So we preallocate both structures
1593 assuming worst case.
1595 We then construct the trie using only the .next slots of the entry
1598 We use the .check field of the first entry of the node temporarily to
1599 make compression both faster and easier by keeping track of how many non
1600 zero fields are in the node.
1602 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1605 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1606 number representing the first entry of the node, and state as a
1607 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1608 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1609 are 2 entrys per node. eg:
1617 The table is internally in the right hand, idx form. However as we also
1618 have to deal with the states array which is indexed by nodenum we have to
1619 use TRIE_NODENUM() to convert.
1622 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1623 "%*sCompiling trie using table compiler\n",
1624 (int)depth * 2 + 2, ""));
1626 Newxz( trie->trans, ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1,
1628 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
1629 next_alloc = trie->uniquecharcount + 1;
1632 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1634 regnode * const noper = NEXTOPER( cur );
1635 const U8 *uc = (U8*)STRING( noper );
1636 const U8 * const e = uc + STR_LEN( noper );
1638 U32 state = 1; /* required init */
1640 U16 charid = 0; /* sanity init */
1641 U32 accept_state = 0; /* sanity init */
1642 U8 *scan = (U8*)NULL; /* sanity init */
1644 STRLEN foldlen = 0; /* required init */
1645 U32 wordlen = 0; /* required init */
1646 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1648 if ( OP(noper) != NOTHING ) {
1649 for ( ; uc < e ; uc += len ) {
1654 charid = trie->charmap[ uvc ];
1656 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1657 charid = svpp ? (U16)SvIV(*svpp) : 0;
1661 if ( !trie->trans[ state + charid ].next ) {
1662 trie->trans[ state + charid ].next = next_alloc;
1663 trie->trans[ state ].check++;
1664 next_alloc += trie->uniquecharcount;
1666 state = trie->trans[ state + charid ].next;
1668 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1670 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1673 accept_state = TRIE_NODENUM( state );
1674 TRIE_HANDLE_WORD(accept_state);
1676 } /* end second pass */
1678 /* and now dump it out before we compress it */
1679 DEBUG_TRIE_COMPILE_MORE_r(
1680 dump_trie_interim_table(trie,next_alloc,depth+1)
1685 * Inplace compress the table.*
1687 For sparse data sets the table constructed by the trie algorithm will
1688 be mostly 0/FAIL transitions or to put it another way mostly empty.
1689 (Note that leaf nodes will not contain any transitions.)
1691 This algorithm compresses the tables by eliminating most such
1692 transitions, at the cost of a modest bit of extra work during lookup:
1694 - Each states[] entry contains a .base field which indicates the
1695 index in the state[] array wheres its transition data is stored.
1697 - If .base is 0 there are no valid transitions from that node.
1699 - If .base is nonzero then charid is added to it to find an entry in
1702 -If trans[states[state].base+charid].check!=state then the
1703 transition is taken to be a 0/Fail transition. Thus if there are fail
1704 transitions at the front of the node then the .base offset will point
1705 somewhere inside the previous nodes data (or maybe even into a node
1706 even earlier), but the .check field determines if the transition is
1710 The following process inplace converts the table to the compressed
1711 table: We first do not compress the root node 1,and mark its all its
1712 .check pointers as 1 and set its .base pointer as 1 as well. This
1713 allows to do a DFA construction from the compressed table later, and
1714 ensures that any .base pointers we calculate later are greater than
1717 - We set 'pos' to indicate the first entry of the second node.
1719 - We then iterate over the columns of the node, finding the first and
1720 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1721 and set the .check pointers accordingly, and advance pos
1722 appropriately and repreat for the next node. Note that when we copy
1723 the next pointers we have to convert them from the original
1724 NODEIDX form to NODENUM form as the former is not valid post
1727 - If a node has no transitions used we mark its base as 0 and do not
1728 advance the pos pointer.
1730 - If a node only has one transition we use a second pointer into the
1731 structure to fill in allocated fail transitions from other states.
1732 This pointer is independent of the main pointer and scans forward
1733 looking for null transitions that are allocated to a state. When it
1734 finds one it writes the single transition into the "hole". If the
1735 pointer doesnt find one the single transition is appended as normal.
1737 - Once compressed we can Renew/realloc the structures to release the
1740 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1741 specifically Fig 3.47 and the associated pseudocode.
1745 const U32 laststate = TRIE_NODENUM( next_alloc );
1748 trie->statecount = laststate;
1750 for ( state = 1 ; state < laststate ; state++ ) {
1752 const U32 stateidx = TRIE_NODEIDX( state );
1753 const U32 o_used = trie->trans[ stateidx ].check;
1754 U32 used = trie->trans[ stateidx ].check;
1755 trie->trans[ stateidx ].check = 0;
1757 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1758 if ( flag || trie->trans[ stateidx + charid ].next ) {
1759 if ( trie->trans[ stateidx + charid ].next ) {
1761 for ( ; zp < pos ; zp++ ) {
1762 if ( ! trie->trans[ zp ].next ) {
1766 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1767 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1768 trie->trans[ zp ].check = state;
1769 if ( ++zp > pos ) pos = zp;
1776 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1778 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1779 trie->trans[ pos ].check = state;
1784 trie->lasttrans = pos + 1;
1785 Renew( trie->states, laststate, reg_trie_state);
1786 DEBUG_TRIE_COMPILE_MORE_r(
1787 PerlIO_printf( Perl_debug_log,
1788 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1789 (int)depth * 2 + 2,"",
1790 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1793 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1796 } /* end table compress */
1798 DEBUG_TRIE_COMPILE_MORE_r(
1799 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1800 (int)depth * 2 + 2, "",
1801 (UV)trie->statecount,
1802 (UV)trie->lasttrans)
1804 /* resize the trans array to remove unused space */
1805 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
1807 /* and now dump out the compressed format */
1808 DEBUG_TRIE_COMPILE_r(
1809 dump_trie(trie,depth+1)
1812 { /* Modify the program and insert the new TRIE node*/
1813 U8 nodetype =(U8)(flags & 0xFF);
1817 regnode *optimize = NULL;
1819 U32 mjd_nodelen = 0;
1822 This means we convert either the first branch or the first Exact,
1823 depending on whether the thing following (in 'last') is a branch
1824 or not and whther first is the startbranch (ie is it a sub part of
1825 the alternation or is it the whole thing.)
1826 Assuming its a sub part we conver the EXACT otherwise we convert
1827 the whole branch sequence, including the first.
1829 /* Find the node we are going to overwrite */
1830 if ( first != startbranch || OP( last ) == BRANCH ) {
1831 /* branch sub-chain */
1832 NEXT_OFF( first ) = (U16)(last - first);
1834 mjd_offset= Node_Offset((convert));
1835 mjd_nodelen= Node_Length((convert));
1837 /* whole branch chain */
1840 const regnode *nop = NEXTOPER( convert );
1841 mjd_offset= Node_Offset((nop));
1842 mjd_nodelen= Node_Length((nop));
1847 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1848 (int)depth * 2 + 2, "",
1849 (UV)mjd_offset, (UV)mjd_nodelen)
1852 /* But first we check to see if there is a common prefix we can
1853 split out as an EXACT and put in front of the TRIE node. */
1854 trie->startstate= 1;
1855 if ( trie->bitmap && !trie->widecharmap && !trie->jump ) {
1857 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
1861 const U32 base = trie->states[ state ].trans.base;
1863 if ( trie->states[state].wordnum )
1866 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1867 if ( ( base + ofs >= trie->uniquecharcount ) &&
1868 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1869 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1871 if ( ++count > 1 ) {
1872 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), ofs, 0);
1873 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
1874 if ( state == 1 ) break;
1876 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1878 PerlIO_printf(Perl_debug_log,
1879 "%*sNew Start State=%"UVuf" Class: [",
1880 (int)depth * 2 + 2, "",
1883 SV ** const tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1884 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
1886 TRIE_BITMAP_SET(trie,*ch);
1888 TRIE_BITMAP_SET(trie, folder[ *ch ]);
1890 PerlIO_printf(Perl_debug_log, (char*)ch)
1894 TRIE_BITMAP_SET(trie,*ch);
1896 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1897 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1903 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1904 char *ch = SvPV_nolen( *tmp );
1906 SV *sv=sv_newmortal();
1907 PerlIO_printf( Perl_debug_log,
1908 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1909 (int)depth * 2 + 2, "",
1911 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
1912 PL_colors[0], PL_colors[1],
1913 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1914 PERL_PV_ESCAPE_FIRSTCHAR
1919 OP( convert ) = nodetype;
1920 str=STRING(convert);
1931 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
1937 regnode *n = convert+NODE_SZ_STR(convert);
1938 NEXT_OFF(convert) = NODE_SZ_STR(convert);
1939 trie->startstate = state;
1940 trie->minlen -= (state - 1);
1941 trie->maxlen -= (state - 1);
1943 regnode *fix = convert;
1944 U32 word = trie->wordcount;
1946 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1947 while( ++fix < n ) {
1948 Set_Node_Offset_Length(fix, 0, 0);
1951 SV ** const tmp = av_fetch( trie->words, word, 0 );
1953 if ( STR_LEN(convert) <= SvCUR(*tmp) )
1954 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
1956 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
1963 NEXT_OFF(convert) = (U16)(tail - convert);
1964 DEBUG_r(optimize= n);
1970 if ( trie->maxlen ) {
1971 NEXT_OFF( convert ) = (U16)(tail - convert);
1972 ARG_SET( convert, data_slot );
1973 /* Store the offset to the first unabsorbed branch in
1974 jump[0], which is otherwise unused by the jump logic.
1975 We use this when dumping a trie and during optimisation. */
1977 trie->jump[0] = (U16)(nextbranch - convert);
1980 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1981 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
1983 OP( convert ) = TRIEC;
1984 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
1985 Safefree(trie->bitmap);
1988 OP( convert ) = TRIE;
1990 /* store the type in the flags */
1991 convert->flags = nodetype;
1995 + regarglen[ OP( convert ) ];
1997 /* XXX We really should free up the resource in trie now,
1998 as we won't use them - (which resources?) dmq */
2000 /* needed for dumping*/
2001 DEBUG_r(if (optimize) {
2002 regnode *opt = convert;
2003 while ( ++opt < optimize) {
2004 Set_Node_Offset_Length(opt,0,0);
2007 Try to clean up some of the debris left after the
2010 while( optimize < jumper ) {
2011 mjd_nodelen += Node_Length((optimize));
2012 OP( optimize ) = OPTIMIZED;
2013 Set_Node_Offset_Length(optimize,0,0);
2016 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2018 } /* end node insert */
2020 SvREFCNT_dec(TRIE_REVCHARMAP(trie));
2024 : trie->startstate>1
2030 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2032 /* The Trie is constructed and compressed now so we can build a fail array now if its needed
2034 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2035 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2038 We find the fail state for each state in the trie, this state is the longest proper
2039 suffix of the current states 'word' that is also a proper prefix of another word in our
2040 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2041 the DFA not to have to restart after its tried and failed a word at a given point, it
2042 simply continues as though it had been matching the other word in the first place.
2044 'abcdgu'=~/abcdefg|cdgu/
2045 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2046 fail, which would bring use to the state representing 'd' in the second word where we would
2047 try 'g' and succeed, prodceding to match 'cdgu'.
2049 /* add a fail transition */
2050 reg_trie_data *trie=(reg_trie_data *)RExC_rx->data->data[ARG(source)];
2052 const U32 ucharcount = trie->uniquecharcount;
2053 const U32 numstates = trie->statecount;
2054 const U32 ubound = trie->lasttrans + ucharcount;
2058 U32 base = trie->states[ 1 ].trans.base;
2061 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2062 GET_RE_DEBUG_FLAGS_DECL;
2064 PERL_UNUSED_ARG(depth);
2068 ARG_SET( stclass, data_slot );
2069 Newxz( aho, 1, reg_ac_data );
2070 RExC_rx->data->data[ data_slot ] = (void*)aho;
2072 aho->states=(reg_trie_state *)savepvn((const char*)trie->states,
2073 numstates * sizeof(reg_trie_state));
2074 Newxz( q, numstates, U32);
2075 Newxz( aho->fail, numstates, U32 );
2078 /* initialize fail[0..1] to be 1 so that we always have
2079 a valid final fail state */
2080 fail[ 0 ] = fail[ 1 ] = 1;
2082 for ( charid = 0; charid < ucharcount ; charid++ ) {
2083 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2085 q[ q_write ] = newstate;
2086 /* set to point at the root */
2087 fail[ q[ q_write++ ] ]=1;
2090 while ( q_read < q_write) {
2091 const U32 cur = q[ q_read++ % numstates ];
2092 base = trie->states[ cur ].trans.base;
2094 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2095 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2097 U32 fail_state = cur;
2100 fail_state = fail[ fail_state ];
2101 fail_base = aho->states[ fail_state ].trans.base;
2102 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2104 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2105 fail[ ch_state ] = fail_state;
2106 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2108 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2110 q[ q_write++ % numstates] = ch_state;
2114 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2115 when we fail in state 1, this allows us to use the
2116 charclass scan to find a valid start char. This is based on the principle
2117 that theres a good chance the string being searched contains lots of stuff
2118 that cant be a start char.
2120 fail[ 0 ] = fail[ 1 ] = 0;
2121 DEBUG_TRIE_COMPILE_r({
2122 PerlIO_printf(Perl_debug_log,
2123 "%*sStclass Failtable (%"UVuf" states): 0",
2124 (int)(depth * 2), "", (UV)numstates
2126 for( q_read=1; q_read<numstates; q_read++ ) {
2127 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2129 PerlIO_printf(Perl_debug_log, "\n");
2132 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2137 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2138 * These need to be revisited when a newer toolchain becomes available.
2140 #if defined(__sparc64__) && defined(__GNUC__)
2141 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2142 # undef SPARC64_GCC_WORKAROUND
2143 # define SPARC64_GCC_WORKAROUND 1
2147 #define DEBUG_PEEP(str,scan,depth) \
2148 DEBUG_OPTIMISE_r({if (scan){ \
2149 SV * const mysv=sv_newmortal(); \
2150 regnode *Next = regnext(scan); \
2151 regprop(RExC_rx, mysv, scan); \
2152 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2153 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2154 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2161 #define JOIN_EXACT(scan,min,flags) \
2162 if (PL_regkind[OP(scan)] == EXACT) \
2163 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2166 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2167 /* Merge several consecutive EXACTish nodes into one. */
2168 regnode *n = regnext(scan);
2170 regnode *next = scan + NODE_SZ_STR(scan);
2174 regnode *stop = scan;
2175 GET_RE_DEBUG_FLAGS_DECL;
2177 PERL_UNUSED_ARG(depth);
2179 #ifndef EXPERIMENTAL_INPLACESCAN
2180 PERL_UNUSED_ARG(flags);
2181 PERL_UNUSED_ARG(val);
2183 DEBUG_PEEP("join",scan,depth);
2185 /* Skip NOTHING, merge EXACT*. */
2187 ( PL_regkind[OP(n)] == NOTHING ||
2188 (stringok && (OP(n) == OP(scan))))
2190 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2192 if (OP(n) == TAIL || n > next)
2194 if (PL_regkind[OP(n)] == NOTHING) {
2195 DEBUG_PEEP("skip:",n,depth);
2196 NEXT_OFF(scan) += NEXT_OFF(n);
2197 next = n + NODE_STEP_REGNODE;
2204 else if (stringok) {
2205 const unsigned int oldl = STR_LEN(scan);
2206 regnode * const nnext = regnext(n);
2208 DEBUG_PEEP("merg",n,depth);
2211 if (oldl + STR_LEN(n) > U8_MAX)
2213 NEXT_OFF(scan) += NEXT_OFF(n);
2214 STR_LEN(scan) += STR_LEN(n);
2215 next = n + NODE_SZ_STR(n);
2216 /* Now we can overwrite *n : */
2217 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2225 #ifdef EXPERIMENTAL_INPLACESCAN
2226 if (flags && !NEXT_OFF(n)) {
2227 DEBUG_PEEP("atch", val, depth);
2228 if (reg_off_by_arg[OP(n)]) {
2229 ARG_SET(n, val - n);
2232 NEXT_OFF(n) = val - n;
2239 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2241 Two problematic code points in Unicode casefolding of EXACT nodes:
2243 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2244 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2250 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2251 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2253 This means that in case-insensitive matching (or "loose matching",
2254 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2255 length of the above casefolded versions) can match a target string
2256 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2257 This would rather mess up the minimum length computation.
2259 What we'll do is to look for the tail four bytes, and then peek
2260 at the preceding two bytes to see whether we need to decrease
2261 the minimum length by four (six minus two).
2263 Thanks to the design of UTF-8, there cannot be false matches:
2264 A sequence of valid UTF-8 bytes cannot be a subsequence of
2265 another valid sequence of UTF-8 bytes.
2268 char * const s0 = STRING(scan), *s, *t;
2269 char * const s1 = s0 + STR_LEN(scan) - 1;
2270 char * const s2 = s1 - 4;
2271 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2272 const char t0[] = "\xaf\x49\xaf\x42";
2274 const char t0[] = "\xcc\x88\xcc\x81";
2276 const char * const t1 = t0 + 3;
2279 s < s2 && (t = ninstr(s, s1, t0, t1));
2282 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2283 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2285 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2286 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2294 n = scan + NODE_SZ_STR(scan);
2296 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2303 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2307 /* REx optimizer. Converts nodes into quickier variants "in place".
2308 Finds fixed substrings. */
2310 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2311 to the position after last scanned or to NULL. */
2313 #define INIT_AND_WITHP \
2314 assert(!and_withp); \
2315 Newx(and_withp,1,struct regnode_charclass_class); \
2316 SAVEFREEPV(and_withp)
2318 /* this is a chain of data about sub patterns we are processing that
2319 need to be handled seperately/specially in study_chunk. Its so
2320 we can simulate recursion without losing state. */
2322 typedef struct scan_frame {
2323 regnode *last; /* last node to process in this frame */
2324 regnode *next; /* next node to process when last is reached */
2325 struct scan_frame *prev; /*previous frame*/
2326 I32 stop; /* what stopparen do we use */
2330 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2331 I32 *minlenp, I32 *deltap,
2336 struct regnode_charclass_class *and_withp,
2337 U32 flags, U32 depth)
2338 /* scanp: Start here (read-write). */
2339 /* deltap: Write maxlen-minlen here. */
2340 /* last: Stop before this one. */
2341 /* data: string data about the pattern */
2342 /* stopparen: treat close N as END */
2343 /* recursed: which subroutines have we recursed into */
2344 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2347 I32 min = 0, pars = 0, code;
2348 regnode *scan = *scanp, *next;
2350 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2351 int is_inf_internal = 0; /* The studied chunk is infinite */
2352 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2353 scan_data_t data_fake;
2354 SV *re_trie_maxbuff = NULL;
2355 regnode *first_non_open = scan;
2356 I32 stopmin = I32_MAX;
2357 scan_frame *frame = NULL;
2359 GET_RE_DEBUG_FLAGS_DECL;
2362 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2366 while (first_non_open && OP(first_non_open) == OPEN)
2367 first_non_open=regnext(first_non_open);
2372 while ( scan && OP(scan) != END && scan < last ){
2373 /* Peephole optimizer: */
2374 DEBUG_STUDYDATA(data,depth);
2375 DEBUG_PEEP("Peep",scan,depth);
2376 JOIN_EXACT(scan,&min,0);
2378 /* Follow the next-chain of the current node and optimize
2379 away all the NOTHINGs from it. */
2380 if (OP(scan) != CURLYX) {
2381 const int max = (reg_off_by_arg[OP(scan)]
2383 /* I32 may be smaller than U16 on CRAYs! */
2384 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2385 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2389 /* Skip NOTHING and LONGJMP. */
2390 while ((n = regnext(n))
2391 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2392 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2393 && off + noff < max)
2395 if (reg_off_by_arg[OP(scan)])
2398 NEXT_OFF(scan) = off;
2403 /* The principal pseudo-switch. Cannot be a switch, since we
2404 look into several different things. */
2405 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2406 || OP(scan) == IFTHEN) {
2407 next = regnext(scan);
2409 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2411 if (OP(next) == code || code == IFTHEN) {
2412 /* NOTE - There is similar code to this block below for handling
2413 TRIE nodes on a re-study. If you change stuff here check there
2415 I32 max1 = 0, min1 = I32_MAX, num = 0;
2416 struct regnode_charclass_class accum;
2417 regnode * const startbranch=scan;
2419 if (flags & SCF_DO_SUBSTR)
2420 scan_commit(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2421 if (flags & SCF_DO_STCLASS)
2422 cl_init_zero(pRExC_state, &accum);
2424 while (OP(scan) == code) {
2425 I32 deltanext, minnext, f = 0, fake;
2426 struct regnode_charclass_class this_class;
2429 data_fake.flags = 0;
2431 data_fake.whilem_c = data->whilem_c;
2432 data_fake.last_closep = data->last_closep;
2435 data_fake.last_closep = &fake;
2437 data_fake.pos_delta = delta;
2438 next = regnext(scan);
2439 scan = NEXTOPER(scan);
2441 scan = NEXTOPER(scan);
2442 if (flags & SCF_DO_STCLASS) {
2443 cl_init(pRExC_state, &this_class);
2444 data_fake.start_class = &this_class;
2445 f = SCF_DO_STCLASS_AND;
2447 if (flags & SCF_WHILEM_VISITED_POS)
2448 f |= SCF_WHILEM_VISITED_POS;
2450 /* we suppose the run is continuous, last=next...*/
2451 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2453 stopparen, recursed, NULL, f,depth+1);
2456 if (max1 < minnext + deltanext)
2457 max1 = minnext + deltanext;
2458 if (deltanext == I32_MAX)
2459 is_inf = is_inf_internal = 1;
2461 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2463 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2464 if ( stopmin > minnext)
2465 stopmin = min + min1;
2466 flags &= ~SCF_DO_SUBSTR;
2468 data->flags |= SCF_SEEN_ACCEPT;
2471 if (data_fake.flags & SF_HAS_EVAL)
2472 data->flags |= SF_HAS_EVAL;
2473 data->whilem_c = data_fake.whilem_c;
2475 if (flags & SCF_DO_STCLASS)
2476 cl_or(pRExC_state, &accum, &this_class);
2478 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2480 if (flags & SCF_DO_SUBSTR) {
2481 data->pos_min += min1;
2482 data->pos_delta += max1 - min1;
2483 if (max1 != min1 || is_inf)
2484 data->longest = &(data->longest_float);
2487 delta += max1 - min1;
2488 if (flags & SCF_DO_STCLASS_OR) {
2489 cl_or(pRExC_state, data->start_class, &accum);
2491 cl_and(data->start_class, and_withp);
2492 flags &= ~SCF_DO_STCLASS;
2495 else if (flags & SCF_DO_STCLASS_AND) {
2497 cl_and(data->start_class, &accum);
2498 flags &= ~SCF_DO_STCLASS;
2501 /* Switch to OR mode: cache the old value of
2502 * data->start_class */
2504 StructCopy(data->start_class, and_withp,
2505 struct regnode_charclass_class);
2506 flags &= ~SCF_DO_STCLASS_AND;
2507 StructCopy(&accum, data->start_class,
2508 struct regnode_charclass_class);
2509 flags |= SCF_DO_STCLASS_OR;
2510 data->start_class->flags |= ANYOF_EOS;
2514 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2517 Assuming this was/is a branch we are dealing with: 'scan' now
2518 points at the item that follows the branch sequence, whatever
2519 it is. We now start at the beginning of the sequence and look
2526 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2528 If we can find such a subseqence we need to turn the first
2529 element into a trie and then add the subsequent branch exact
2530 strings to the trie.
2534 1. patterns where the whole set of branch can be converted.
2536 2. patterns where only a subset can be converted.
2538 In case 1 we can replace the whole set with a single regop
2539 for the trie. In case 2 we need to keep the start and end
2542 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2543 becomes BRANCH TRIE; BRANCH X;
2545 There is an additional case, that being where there is a
2546 common prefix, which gets split out into an EXACT like node
2547 preceding the TRIE node.
2549 If x(1..n)==tail then we can do a simple trie, if not we make
2550 a "jump" trie, such that when we match the appropriate word
2551 we "jump" to the appopriate tail node. Essentailly we turn
2552 a nested if into a case structure of sorts.
2557 if (!re_trie_maxbuff) {
2558 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2559 if (!SvIOK(re_trie_maxbuff))
2560 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2562 if ( SvIV(re_trie_maxbuff)>=0 ) {
2564 regnode *first = (regnode *)NULL;
2565 regnode *last = (regnode *)NULL;
2566 regnode *tail = scan;
2571 SV * const mysv = sv_newmortal(); /* for dumping */
2573 /* var tail is used because there may be a TAIL
2574 regop in the way. Ie, the exacts will point to the
2575 thing following the TAIL, but the last branch will
2576 point at the TAIL. So we advance tail. If we
2577 have nested (?:) we may have to move through several
2581 while ( OP( tail ) == TAIL ) {
2582 /* this is the TAIL generated by (?:) */
2583 tail = regnext( tail );
2588 regprop(RExC_rx, mysv, tail );
2589 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2590 (int)depth * 2 + 2, "",
2591 "Looking for TRIE'able sequences. Tail node is: ",
2592 SvPV_nolen_const( mysv )
2598 step through the branches, cur represents each
2599 branch, noper is the first thing to be matched
2600 as part of that branch and noper_next is the
2601 regnext() of that node. if noper is an EXACT
2602 and noper_next is the same as scan (our current
2603 position in the regex) then the EXACT branch is
2604 a possible optimization target. Once we have
2605 two or more consequetive such branches we can
2606 create a trie of the EXACT's contents and stich
2607 it in place. If the sequence represents all of
2608 the branches we eliminate the whole thing and
2609 replace it with a single TRIE. If it is a
2610 subsequence then we need to stitch it in. This
2611 means the first branch has to remain, and needs
2612 to be repointed at the item on the branch chain
2613 following the last branch optimized. This could
2614 be either a BRANCH, in which case the
2615 subsequence is internal, or it could be the
2616 item following the branch sequence in which
2617 case the subsequence is at the end.
2621 /* dont use tail as the end marker for this traverse */
2622 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2623 regnode * const noper = NEXTOPER( cur );
2624 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
2625 regnode * const noper_next = regnext( noper );
2629 regprop(RExC_rx, mysv, cur);
2630 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2631 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2633 regprop(RExC_rx, mysv, noper);
2634 PerlIO_printf( Perl_debug_log, " -> %s",
2635 SvPV_nolen_const(mysv));
2638 regprop(RExC_rx, mysv, noper_next );
2639 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2640 SvPV_nolen_const(mysv));
2642 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2643 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2645 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2646 : PL_regkind[ OP( noper ) ] == EXACT )
2647 || OP(noper) == NOTHING )
2649 && noper_next == tail
2654 if ( !first || optype == NOTHING ) {
2655 if (!first) first = cur;
2656 optype = OP( noper );
2662 make_trie( pRExC_state,
2663 startbranch, first, cur, tail, count,
2666 if ( PL_regkind[ OP( noper ) ] == EXACT
2668 && noper_next == tail
2673 optype = OP( noper );
2683 regprop(RExC_rx, mysv, cur);
2684 PerlIO_printf( Perl_debug_log,
2685 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2686 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2690 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
2691 #ifdef TRIE_STUDY_OPT
2692 if ( ((made == MADE_EXACT_TRIE &&
2693 startbranch == first)
2694 || ( first_non_open == first )) &&
2696 flags |= SCF_TRIE_RESTUDY;
2697 if ( startbranch == first
2700 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2710 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2711 scan = NEXTOPER(NEXTOPER(scan));
2712 } else /* single branch is optimized. */
2713 scan = NEXTOPER(scan);
2715 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2716 scan_frame *newframe = NULL;
2721 if (OP(scan) != SUSPEND) {
2722 /* set the pointer */
2723 if (OP(scan) == GOSUB) {
2725 RExC_recurse[ARG2L(scan)] = scan;
2726 start = RExC_open_parens[paren-1];
2727 end = RExC_close_parens[paren-1];
2730 start = RExC_rx->program + 1;
2734 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2735 SAVEFREEPV(recursed);
2737 if (!PAREN_TEST(recursed,paren+1)) {
2738 PAREN_SET(recursed,paren+1);
2739 Newx(newframe,1,scan_frame);
2741 if (flags & SCF_DO_SUBSTR) {
2742 scan_commit(pRExC_state,data,minlenp);
2743 data->longest = &(data->longest_float);
2745 is_inf = is_inf_internal = 1;
2746 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2747 cl_anything(pRExC_state, data->start_class);
2748 flags &= ~SCF_DO_STCLASS;
2751 Newx(newframe,1,scan_frame);
2754 end = regnext(scan);
2759 SAVEFREEPV(newframe);
2760 newframe->next = regnext(scan);
2761 newframe->last = last;
2762 newframe->stop = stopparen;
2763 newframe->prev = frame;
2773 else if (OP(scan) == EXACT) {
2774 I32 l = STR_LEN(scan);
2777 const U8 * const s = (U8*)STRING(scan);
2778 l = utf8_length(s, s + l);
2779 uc = utf8_to_uvchr(s, NULL);
2781 uc = *((U8*)STRING(scan));
2784 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2785 /* The code below prefers earlier match for fixed
2786 offset, later match for variable offset. */
2787 if (data->last_end == -1) { /* Update the start info. */
2788 data->last_start_min = data->pos_min;
2789 data->last_start_max = is_inf
2790 ? I32_MAX : data->pos_min + data->pos_delta;
2792 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2794 SvUTF8_on(data->last_found);
2796 SV * const sv = data->last_found;
2797 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2798 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2799 if (mg && mg->mg_len >= 0)
2800 mg->mg_len += utf8_length((U8*)STRING(scan),
2801 (U8*)STRING(scan)+STR_LEN(scan));
2803 data->last_end = data->pos_min + l;
2804 data->pos_min += l; /* As in the first entry. */
2805 data->flags &= ~SF_BEFORE_EOL;
2807 if (flags & SCF_DO_STCLASS_AND) {
2808 /* Check whether it is compatible with what we know already! */
2812 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2813 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2814 && (!(data->start_class->flags & ANYOF_FOLD)
2815 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2818 ANYOF_CLASS_ZERO(data->start_class);
2819 ANYOF_BITMAP_ZERO(data->start_class);
2821 ANYOF_BITMAP_SET(data->start_class, uc);
2822 data->start_class->flags &= ~ANYOF_EOS;
2824 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2826 else if (flags & SCF_DO_STCLASS_OR) {
2827 /* false positive possible if the class is case-folded */
2829 ANYOF_BITMAP_SET(data->start_class, uc);
2831 data->start_class->flags |= ANYOF_UNICODE_ALL;
2832 data->start_class->flags &= ~ANYOF_EOS;
2833 cl_and(data->start_class, and_withp);
2835 flags &= ~SCF_DO_STCLASS;
2837 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
2838 I32 l = STR_LEN(scan);
2839 UV uc = *((U8*)STRING(scan));
2841 /* Search for fixed substrings supports EXACT only. */
2842 if (flags & SCF_DO_SUBSTR) {
2844 scan_commit(pRExC_state, data, minlenp);
2847 const U8 * const s = (U8 *)STRING(scan);
2848 l = utf8_length(s, s + l);
2849 uc = utf8_to_uvchr(s, NULL);
2852 if (flags & SCF_DO_SUBSTR)
2854 if (flags & SCF_DO_STCLASS_AND) {
2855 /* Check whether it is compatible with what we know already! */
2859 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2860 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2861 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2863 ANYOF_CLASS_ZERO(data->start_class);
2864 ANYOF_BITMAP_ZERO(data->start_class);
2866 ANYOF_BITMAP_SET(data->start_class, uc);
2867 data->start_class->flags &= ~ANYOF_EOS;
2868 data->start_class->flags |= ANYOF_FOLD;
2869 if (OP(scan) == EXACTFL)
2870 data->start_class->flags |= ANYOF_LOCALE;
2873 else if (flags & SCF_DO_STCLASS_OR) {
2874 if (data->start_class->flags & ANYOF_FOLD) {
2875 /* false positive possible if the class is case-folded.
2876 Assume that the locale settings are the same... */
2878 ANYOF_BITMAP_SET(data->start_class, uc);
2879 data->start_class->flags &= ~ANYOF_EOS;
2881 cl_and(data->start_class, and_withp);
2883 flags &= ~SCF_DO_STCLASS;
2885 else if (strchr((const char*)PL_varies,OP(scan))) {
2886 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2887 I32 f = flags, pos_before = 0;
2888 regnode * const oscan = scan;
2889 struct regnode_charclass_class this_class;
2890 struct regnode_charclass_class *oclass = NULL;
2891 I32 next_is_eval = 0;
2893 switch (PL_regkind[OP(scan)]) {
2894 case WHILEM: /* End of (?:...)* . */
2895 scan = NEXTOPER(scan);
2898 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2899 next = NEXTOPER(scan);
2900 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2902 maxcount = REG_INFTY;
2903 next = regnext(scan);
2904 scan = NEXTOPER(scan);
2908 if (flags & SCF_DO_SUBSTR)
2913 if (flags & SCF_DO_STCLASS) {
2915 maxcount = REG_INFTY;
2916 next = regnext(scan);
2917 scan = NEXTOPER(scan);
2920 is_inf = is_inf_internal = 1;
2921 scan = regnext(scan);
2922 if (flags & SCF_DO_SUBSTR) {
2923 scan_commit(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
2924 data->longest = &(data->longest_float);
2926 goto optimize_curly_tail;
2928 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
2929 && (scan->flags == stopparen))
2934 mincount = ARG1(scan);
2935 maxcount = ARG2(scan);
2937 next = regnext(scan);
2938 if (OP(scan) == CURLYX) {
2939 I32 lp = (data ? *(data->last_closep) : 0);
2940 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
2942 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2943 next_is_eval = (OP(scan) == EVAL);
2945 if (flags & SCF_DO_SUBSTR) {
2946 if (mincount == 0) scan_commit(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
2947 pos_before = data->pos_min;
2951 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2953 data->flags |= SF_IS_INF;
2955 if (flags & SCF_DO_STCLASS) {
2956 cl_init(pRExC_state, &this_class);
2957 oclass = data->start_class;
2958 data->start_class = &this_class;
2959 f |= SCF_DO_STCLASS_AND;
2960 f &= ~SCF_DO_STCLASS_OR;
2962 /* These are the cases when once a subexpression
2963 fails at a particular position, it cannot succeed
2964 even after backtracking at the enclosing scope.
2966 XXXX what if minimal match and we are at the
2967 initial run of {n,m}? */
2968 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2969 f &= ~SCF_WHILEM_VISITED_POS;
2971 /* This will finish on WHILEM, setting scan, or on NULL: */
2972 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2973 last, data, stopparen, recursed, NULL,
2975 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
2977 if (flags & SCF_DO_STCLASS)
2978 data->start_class = oclass;
2979 if (mincount == 0 || minnext == 0) {
2980 if (flags & SCF_DO_STCLASS_OR) {
2981 cl_or(pRExC_state, data->start_class, &this_class);
2983 else if (flags & SCF_DO_STCLASS_AND) {
2984 /* Switch to OR mode: cache the old value of
2985 * data->start_class */
2987 StructCopy(data->start_class, and_withp,
2988 struct regnode_charclass_class);
2989 flags &= ~SCF_DO_STCLASS_AND;
2990 StructCopy(&this_class, data->start_class,
2991 struct regnode_charclass_class);
2992 flags |= SCF_DO_STCLASS_OR;
2993 data->start_class->flags |= ANYOF_EOS;
2995 } else { /* Non-zero len */
2996 if (flags & SCF_DO_STCLASS_OR) {
2997 cl_or(pRExC_state, data->start_class, &this_class);
2998 cl_and(data->start_class, and_withp);
3000 else if (flags & SCF_DO_STCLASS_AND)
3001 cl_and(data->start_class, &this_class);
3002 flags &= ~SCF_DO_STCLASS;
3004 if (!scan) /* It was not CURLYX, but CURLY. */
3006 if ( /* ? quantifier ok, except for (?{ ... }) */
3007 (next_is_eval || !(mincount == 0 && maxcount == 1))
3008 && (minnext == 0) && (deltanext == 0)
3009 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3010 && maxcount <= REG_INFTY/3 /* Complement check for big count */
3011 && ckWARN(WARN_REGEXP))
3014 "Quantifier unexpected on zero-length expression");
3017 min += minnext * mincount;
3018 is_inf_internal |= ((maxcount == REG_INFTY
3019 && (minnext + deltanext) > 0)
3020 || deltanext == I32_MAX);
3021 is_inf |= is_inf_internal;
3022 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3024 /* Try powerful optimization CURLYX => CURLYN. */
3025 if ( OP(oscan) == CURLYX && data
3026 && data->flags & SF_IN_PAR
3027 && !(data->flags & SF_HAS_EVAL)
3028 && !deltanext && minnext == 1 ) {
3029 /* Try to optimize to CURLYN. */
3030 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3031 regnode * const nxt1 = nxt;
3038 if (!strchr((const char*)PL_simple,OP(nxt))
3039 && !(PL_regkind[OP(nxt)] == EXACT
3040 && STR_LEN(nxt) == 1))
3046 if (OP(nxt) != CLOSE)
3048 if (RExC_open_parens) {
3049 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3050 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3052 /* Now we know that nxt2 is the only contents: */
3053 oscan->flags = (U8)ARG(nxt);
3055 OP(nxt1) = NOTHING; /* was OPEN. */
3058 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3059 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
3060 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
3061 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3062 OP(nxt + 1) = OPTIMIZED; /* was count. */
3063 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
3068 /* Try optimization CURLYX => CURLYM. */
3069 if ( OP(oscan) == CURLYX && data
3070 && !(data->flags & SF_HAS_PAR)
3071 && !(data->flags & SF_HAS_EVAL)
3072 && !deltanext /* atom is fixed width */
3073 && minnext != 0 /* CURLYM can't handle zero width */
3075 /* XXXX How to optimize if data == 0? */
3076 /* Optimize to a simpler form. */
3077 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3081 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3082 && (OP(nxt2) != WHILEM))
3084 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3085 /* Need to optimize away parenths. */
3086 if (data->flags & SF_IN_PAR) {
3087 /* Set the parenth number. */
3088 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3090 if (OP(nxt) != CLOSE)
3091 FAIL("Panic opt close");
3092 oscan->flags = (U8)ARG(nxt);
3093 if (RExC_open_parens) {
3094 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3095 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3097 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3098 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3101 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3102 OP(nxt + 1) = OPTIMIZED; /* was count. */
3103 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3104 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
3107 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3108 regnode *nnxt = regnext(nxt1);
3111 if (reg_off_by_arg[OP(nxt1)])
3112 ARG_SET(nxt1, nxt2 - nxt1);
3113 else if (nxt2 - nxt1 < U16_MAX)
3114 NEXT_OFF(nxt1) = nxt2 - nxt1;
3116 OP(nxt) = NOTHING; /* Cannot beautify */
3121 /* Optimize again: */
3122 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3123 NULL, stopparen, recursed, NULL, 0,depth+1);
3128 else if ((OP(oscan) == CURLYX)
3129 && (flags & SCF_WHILEM_VISITED_POS)
3130 /* See the comment on a similar expression above.
3131 However, this time it not a subexpression
3132 we care about, but the expression itself. */
3133 && (maxcount == REG_INFTY)
3134 && data && ++data->whilem_c < 16) {
3135 /* This stays as CURLYX, we can put the count/of pair. */
3136 /* Find WHILEM (as in regexec.c) */
3137 regnode *nxt = oscan + NEXT_OFF(oscan);
3139 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3141 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3142 | (RExC_whilem_seen << 4)); /* On WHILEM */
3144 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3146 if (flags & SCF_DO_SUBSTR) {
3147 SV *last_str = NULL;
3148 int counted = mincount != 0;
3150 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3151 #if defined(SPARC64_GCC_WORKAROUND)
3154 const char *s = NULL;
3157 if (pos_before >= data->last_start_min)
3160 b = data->last_start_min;
3163 s = SvPV_const(data->last_found, l);
3164 old = b - data->last_start_min;
3167 I32 b = pos_before >= data->last_start_min
3168 ? pos_before : data->last_start_min;
3170 const char * const s = SvPV_const(data->last_found, l);
3171 I32 old = b - data->last_start_min;
3175 old = utf8_hop((U8*)s, old) - (U8*)s;
3178 /* Get the added string: */
3179 last_str = newSVpvn(s + old, l);
3181 SvUTF8_on(last_str);
3182 if (deltanext == 0 && pos_before == b) {
3183 /* What was added is a constant string */
3185 SvGROW(last_str, (mincount * l) + 1);
3186 repeatcpy(SvPVX(last_str) + l,
3187 SvPVX_const(last_str), l, mincount - 1);
3188 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3189 /* Add additional parts. */
3190 SvCUR_set(data->last_found,
3191 SvCUR(data->last_found) - l);
3192 sv_catsv(data->last_found, last_str);
3194 SV * sv = data->last_found;
3196 SvUTF8(sv) && SvMAGICAL(sv) ?
3197 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3198 if (mg && mg->mg_len >= 0)
3199 mg->mg_len += CHR_SVLEN(last_str);
3201 data->last_end += l * (mincount - 1);
3204 /* start offset must point into the last copy */
3205 data->last_start_min += minnext * (mincount - 1);
3206 data->last_start_max += is_inf ? I32_MAX
3207 : (maxcount - 1) * (minnext + data->pos_delta);
3210 /* It is counted once already... */
3211 data->pos_min += minnext * (mincount - counted);
3212 data->pos_delta += - counted * deltanext +
3213 (minnext + deltanext) * maxcount - minnext * mincount;
3214 if (mincount != maxcount) {
3215 /* Cannot extend fixed substrings found inside
3217 scan_commit(pRExC_state,data,minlenp);
3218 if (mincount && last_str) {
3219 SV * const sv = data->last_found;
3220 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3221 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3225 sv_setsv(sv, last_str);
3226 data->last_end = data->pos_min;
3227 data->last_start_min =
3228 data->pos_min - CHR_SVLEN(last_str);
3229 data->last_start_max = is_inf
3231 : data->pos_min + data->pos_delta
3232 - CHR_SVLEN(last_str);
3234 data->longest = &(data->longest_float);
3236 SvREFCNT_dec(last_str);
3238 if (data && (fl & SF_HAS_EVAL))
3239 data->flags |= SF_HAS_EVAL;
3240 optimize_curly_tail:
3241 if (OP(oscan) != CURLYX) {
3242 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3244 NEXT_OFF(oscan) += NEXT_OFF(next);
3247 default: /* REF and CLUMP only? */
3248 if (flags & SCF_DO_SUBSTR) {
3249 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
3250 data->longest = &(data->longest_float);
3252 is_inf = is_inf_internal = 1;
3253 if (flags & SCF_DO_STCLASS_OR)
3254 cl_anything(pRExC_state, data->start_class);
3255 flags &= ~SCF_DO_STCLASS;
3259 else if (strchr((const char*)PL_simple,OP(scan))) {
3262 if (flags & SCF_DO_SUBSTR) {
3263 scan_commit(pRExC_state,data,minlenp);
3267 if (flags & SCF_DO_STCLASS) {
3268 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3270 /* Some of the logic below assumes that switching
3271 locale on will only add false positives. */
3272 switch (PL_regkind[OP(scan)]) {
3276 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3277 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3278 cl_anything(pRExC_state, data->start_class);
3281 if (OP(scan) == SANY)
3283 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3284 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3285 || (data->start_class->flags & ANYOF_CLASS));
3286 cl_anything(pRExC_state, data->start_class);
3288 if (flags & SCF_DO_STCLASS_AND || !value)
3289 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3292 if (flags & SCF_DO_STCLASS_AND)
3293 cl_and(data->start_class,
3294 (struct regnode_charclass_class*)scan);
3296 cl_or(pRExC_state, data->start_class,
3297 (struct regnode_charclass_class*)scan);
3300 if (flags & SCF_DO_STCLASS_AND) {
3301 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3302 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3303 for (value = 0; value < 256; value++)
3304 if (!isALNUM(value))
3305 ANYOF_BITMAP_CLEAR(data->start_class, value);
3309 if (data->start_class->flags & ANYOF_LOCALE)
3310 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3312 for (value = 0; value < 256; value++)
3314 ANYOF_BITMAP_SET(data->start_class, value);
3319 if (flags & SCF_DO_STCLASS_AND) {
3320 if (data->start_class->flags & ANYOF_LOCALE)
3321 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3324 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3325 data->start_class->flags |= ANYOF_LOCALE;
3329 if (flags & SCF_DO_STCLASS_AND) {
3330 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3331 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3332 for (value = 0; value < 256; value++)
3334 ANYOF_BITMAP_CLEAR(data->start_class, value);
3338 if (data->start_class->flags & ANYOF_LOCALE)
3339 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3341 for (value = 0; value < 256; value++)
3342 if (!isALNUM(value))
3343 ANYOF_BITMAP_SET(data->start_class, value);
3348 if (flags & SCF_DO_STCLASS_AND) {
3349 if (data->start_class->flags & ANYOF_LOCALE)
3350 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3353 data->start_class->flags |= ANYOF_LOCALE;
3354 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3358 if (flags & SCF_DO_STCLASS_AND) {
3359 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3360 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3361 for (value = 0; value < 256; value++)
3362 if (!isSPACE(value))
3363 ANYOF_BITMAP_CLEAR(data->start_class, value);
3367 if (data->start_class->flags & ANYOF_LOCALE)
3368 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3370 for (value = 0; value < 256; value++)
3372 ANYOF_BITMAP_SET(data->start_class, value);
3377 if (flags & SCF_DO_STCLASS_AND) {
3378 if (data->start_class->flags & ANYOF_LOCALE)
3379 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3382 data->start_class->flags |= ANYOF_LOCALE;
3383 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3387 if (flags & SCF_DO_STCLASS_AND) {
3388 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3389 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3390 for (value = 0; value < 256; value++)
3392 ANYOF_BITMAP_CLEAR(data->start_class, value);
3396 if (data->start_class->flags & ANYOF_LOCALE)
3397 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3399 for (value = 0; value < 256; value++)
3400 if (!isSPACE(value))
3401 ANYOF_BITMAP_SET(data->start_class, value);
3406 if (flags & SCF_DO_STCLASS_AND) {
3407 if (data->start_class->flags & ANYOF_LOCALE) {
3408 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3409 for (value = 0; value < 256; value++)
3410 if (!isSPACE(value))
3411 ANYOF_BITMAP_CLEAR(data->start_class, value);
3415 data->start_class->flags |= ANYOF_LOCALE;
3416 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3420 if (flags & SCF_DO_STCLASS_AND) {
3421 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3422 for (value = 0; value < 256; value++)
3423 if (!isDIGIT(value))
3424 ANYOF_BITMAP_CLEAR(data->start_class, value);
3427 if (data->start_class->flags & ANYOF_LOCALE)
3428 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3430 for (value = 0; value < 256; value++)
3432 ANYOF_BITMAP_SET(data->start_class, value);
3437 if (flags & SCF_DO_STCLASS_AND) {
3438 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3439 for (value = 0; value < 256; value++)
3441 ANYOF_BITMAP_CLEAR(data->start_class, value);
3444 if (data->start_class->flags & ANYOF_LOCALE)
3445 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3447 for (value = 0; value < 256; value++)
3448 if (!isDIGIT(value))
3449 ANYOF_BITMAP_SET(data->start_class, value);
3454 if (flags & SCF_DO_STCLASS_OR)
3455 cl_and(data->start_class, and_withp);
3456 flags &= ~SCF_DO_STCLASS;
3459 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3460 data->flags |= (OP(scan) == MEOL
3464 else if ( PL_regkind[OP(scan)] == BRANCHJ
3465 /* Lookbehind, or need to calculate parens/evals/stclass: */
3466 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3467 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3468 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3469 || OP(scan) == UNLESSM )
3471 /* Negative Lookahead/lookbehind
3472 In this case we can't do fixed string optimisation.
3475 I32 deltanext, minnext, fake = 0;
3477 struct regnode_charclass_class intrnl;
3480 data_fake.flags = 0;
3482 data_fake.whilem_c = data->whilem_c;
3483 data_fake.last_closep = data->last_closep;
3486 data_fake.last_closep = &fake;
3487 data_fake.pos_delta = delta;
3488 if ( flags & SCF_DO_STCLASS && !scan->flags
3489 && OP(scan) == IFMATCH ) { /* Lookahead */
3490 cl_init(pRExC_state, &intrnl);
3491 data_fake.start_class = &intrnl;
3492 f |= SCF_DO_STCLASS_AND;
3494 if (flags & SCF_WHILEM_VISITED_POS)
3495 f |= SCF_WHILEM_VISITED_POS;
3496 next = regnext(scan);
3497 nscan = NEXTOPER(NEXTOPER(scan));
3498 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3499 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3502 FAIL("Variable length lookbehind not implemented");
3504 else if (minnext > (I32)U8_MAX) {
3505 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3507 scan->flags = (U8)minnext;
3510 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3512 if (data_fake.flags & SF_HAS_EVAL)
3513 data->flags |= SF_HAS_EVAL;
3514 data->whilem_c = data_fake.whilem_c;
3516 if (f & SCF_DO_STCLASS_AND) {
3517 const int was = (data->start_class->flags & ANYOF_EOS);
3519 cl_and(data->start_class, &intrnl);
3521 data->start_class->flags |= ANYOF_EOS;
3524 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3526 /* Positive Lookahead/lookbehind
3527 In this case we can do fixed string optimisation,
3528 but we must be careful about it. Note in the case of
3529 lookbehind the positions will be offset by the minimum
3530 length of the pattern, something we won't know about
3531 until after the recurse.
3533 I32 deltanext, fake = 0;
3535 struct regnode_charclass_class intrnl;
3537 /* We use SAVEFREEPV so that when the full compile
3538 is finished perl will clean up the allocated
3539 minlens when its all done. This was we don't
3540 have to worry about freeing them when we know
3541 they wont be used, which would be a pain.
3544 Newx( minnextp, 1, I32 );
3545 SAVEFREEPV(minnextp);
3548 StructCopy(data, &data_fake, scan_data_t);
3549 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3552 scan_commit(pRExC_state, &data_fake,minlenp);
3553 data_fake.last_found=newSVsv(data->last_found);
3557 data_fake.last_closep = &fake;
3558 data_fake.flags = 0;
3559 data_fake.pos_delta = delta;
3561 data_fake.flags |= SF_IS_INF;
3562 if ( flags & SCF_DO_STCLASS && !scan->flags
3563 && OP(scan) == IFMATCH ) { /* Lookahead */
3564 cl_init(pRExC_state, &intrnl);
3565 data_fake.start_class = &intrnl;
3566 f |= SCF_DO_STCLASS_AND;
3568 if (flags & SCF_WHILEM_VISITED_POS)
3569 f |= SCF_WHILEM_VISITED_POS;
3570 next = regnext(scan);
3571 nscan = NEXTOPER(NEXTOPER(scan));
3573 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3574 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3577 FAIL("Variable length lookbehind not implemented");
3579 else if (*minnextp > (I32)U8_MAX) {
3580 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3582 scan->flags = (U8)*minnextp;
3587 if (f & SCF_DO_STCLASS_AND) {
3588 const int was = (data->start_class->flags & ANYOF_EOS);
3590 cl_and(data->start_class, &intrnl);
3592 data->start_class->flags |= ANYOF_EOS;
3595 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3597 if (data_fake.flags & SF_HAS_EVAL)
3598 data->flags |= SF_HAS_EVAL;
3599 data->whilem_c = data_fake.whilem_c;
3600 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3601 if (RExC_rx->minlen<*minnextp)
3602 RExC_rx->minlen=*minnextp;
3603 scan_commit(pRExC_state, &data_fake, minnextp);
3604 SvREFCNT_dec(data_fake.last_found);
3606 if ( data_fake.minlen_fixed != minlenp )
3608 data->offset_fixed= data_fake.offset_fixed;
3609 data->minlen_fixed= data_fake.minlen_fixed;
3610 data->lookbehind_fixed+= scan->flags;
3612 if ( data_fake.minlen_float != minlenp )
3614 data->minlen_float= data_fake.minlen_float;
3615 data->offset_float_min=data_fake.offset_float_min;
3616 data->offset_float_max=data_fake.offset_float_max;
3617 data->lookbehind_float+= scan->flags;
3626 else if (OP(scan) == OPEN) {
3627 if (stopparen != (I32)ARG(scan))
3630 else if (OP(scan) == CLOSE) {
3631 if (stopparen == (I32)ARG(scan)) {
3634 if ((I32)ARG(scan) == is_par) {
3635 next = regnext(scan);
3637 if ( next && (OP(next) != WHILEM) && next < last)
3638 is_par = 0; /* Disable optimization */
3641 *(data->last_closep) = ARG(scan);
3643 else if (OP(scan) == EVAL) {
3645 data->flags |= SF_HAS_EVAL;
3647 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3648 if (flags & SCF_DO_SUBSTR) {
3649 scan_commit(pRExC_state,data,minlenp);
3650 flags &= ~SCF_DO_SUBSTR;
3652 if (data && OP(scan)==ACCEPT) {
3653 data->flags |= SCF_SEEN_ACCEPT;
3658 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3660 if (flags & SCF_DO_SUBSTR) {
3661 scan_commit(pRExC_state,data,minlenp);
3662 data->longest = &(data->longest_float);
3664 is_inf = is_inf_internal = 1;
3665 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3666 cl_anything(pRExC_state, data->start_class);
3667 flags &= ~SCF_DO_STCLASS;
3669 else if (OP(scan) == GPOS) {
3670 if (!(RExC_rx->reganch & ROPT_GPOS_FLOAT) &&
3671 !(delta || is_inf || (data && data->pos_delta)))
3673 if (!(RExC_rx->reganch & ROPT_ANCH) && (flags & SCF_DO_SUBSTR))
3674 RExC_rx->reganch |= ROPT_ANCH_GPOS;
3675 if (RExC_rx->gofs < (U32)min)
3676 RExC_rx->gofs = min;
3678 RExC_rx->reganch |= ROPT_GPOS_FLOAT;
3682 #ifdef TRIE_STUDY_OPT
3683 #ifdef FULL_TRIE_STUDY
3684 else if (PL_regkind[OP(scan)] == TRIE) {
3685 /* NOTE - There is similar code to this block above for handling
3686 BRANCH nodes on the initial study. If you change stuff here
3688 regnode *trie_node= scan;
3689 regnode *tail= regnext(scan);
3690 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3691 I32 max1 = 0, min1 = I32_MAX;
3692 struct regnode_charclass_class accum;
3694 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
3695 scan_commit(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
3696 if (flags & SCF_DO_STCLASS)
3697 cl_init_zero(pRExC_state, &accum);
3703 const regnode *nextbranch= NULL;
3706 for ( word=1 ; word <= trie->wordcount ; word++)
3708 I32 deltanext=0, minnext=0, f = 0, fake;
3709 struct regnode_charclass_class this_class;
3711 data_fake.flags = 0;
3713 data_fake.whilem_c = data->whilem_c;
3714 data_fake.last_closep = data->last_closep;
3717 data_fake.last_closep = &fake;
3718 data_fake.pos_delta = delta;
3719 if (flags & SCF_DO_STCLASS) {
3720 cl_init(pRExC_state, &this_class);
3721 data_fake.start_class = &this_class;
3722 f = SCF_DO_STCLASS_AND;
3724 if (flags & SCF_WHILEM_VISITED_POS)
3725 f |= SCF_WHILEM_VISITED_POS;
3727 if (trie->jump[word]) {
3729 nextbranch = trie_node + trie->jump[0];
3730 scan= trie_node + trie->jump[word];
3731 /* We go from the jump point to the branch that follows
3732 it. Note this means we need the vestigal unused branches
3733 even though they arent otherwise used.
3735 minnext = study_chunk(pRExC_state, &scan, minlenp,
3736 &deltanext, (regnode *)nextbranch, &data_fake,
3737 stopparen, recursed, NULL, f,depth+1);
3739 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3740 nextbranch= regnext((regnode*)nextbranch);
3742 if (min1 > (I32)(minnext + trie->minlen))
3743 min1 = minnext + trie->minlen;
3744 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3745 max1 = minnext + deltanext + trie->maxlen;
3746 if (deltanext == I32_MAX)
3747 is_inf = is_inf_internal = 1;
3749 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3751 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3752 if ( stopmin > min + min1)
3753 stopmin = min + min1;
3754 flags &= ~SCF_DO_SUBSTR;
3756 data->flags |= SCF_SEEN_ACCEPT;
3759 if (data_fake.flags & SF_HAS_EVAL)
3760 data->flags |= SF_HAS_EVAL;
3761 data->whilem_c = data_fake.whilem_c;
3763 if (flags & SCF_DO_STCLASS)
3764 cl_or(pRExC_state, &accum, &this_class);
3767 if (flags & SCF_DO_SUBSTR) {
3768 data->pos_min += min1;
3769 data->pos_delta += max1 - min1;
3770 if (max1 != min1 || is_inf)
3771 data->longest = &(data->longest_float);
3774 delta += max1 - min1;
3775 if (flags & SCF_DO_STCLASS_OR) {
3776 cl_or(pRExC_state, data->start_class, &accum);
3778 cl_and(data->start_class, and_withp);
3779 flags &= ~SCF_DO_STCLASS;
3782 else if (flags & SCF_DO_STCLASS_AND) {
3784 cl_and(data->start_class, &accum);
3785 flags &= ~SCF_DO_STCLASS;
3788 /* Switch to OR mode: cache the old value of
3789 * data->start_class */
3791 StructCopy(data->start_class, and_withp,
3792 struct regnode_charclass_class);
3793 flags &= ~SCF_DO_STCLASS_AND;
3794 StructCopy(&accum, data->start_class,
3795 struct regnode_charclass_class);
3796 flags |= SCF_DO_STCLASS_OR;
3797 data->start_class->flags |= ANYOF_EOS;
3804 else if (PL_regkind[OP(scan)] == TRIE) {
3805 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3808 min += trie->minlen;
3809 delta += (trie->maxlen - trie->minlen);
3810 flags &= ~SCF_DO_STCLASS; /* xxx */
3811 if (flags & SCF_DO_SUBSTR) {
3812 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
3813 data->pos_min += trie->minlen;
3814 data->pos_delta += (trie->maxlen - trie->minlen);
3815 if (trie->maxlen != trie->minlen)
3816 data->longest = &(data->longest_float);
3818 if (trie->jump) /* no more substrings -- for now /grr*/
3819 flags &= ~SCF_DO_SUBSTR;
3821 #endif /* old or new */
3822 #endif /* TRIE_STUDY_OPT */
3823 /* Else: zero-length, ignore. */
3824 scan = regnext(scan);
3829 stopparen = frame->stop;
3830 frame = frame->prev;
3831 goto fake_study_recurse;
3838 *deltap = is_inf_internal ? I32_MAX : delta;
3839 if (flags & SCF_DO_SUBSTR && is_inf)
3840 data->pos_delta = I32_MAX - data->pos_min;
3841 if (is_par > (I32)U8_MAX)
3843 if (is_par && pars==1 && data) {
3844 data->flags |= SF_IN_PAR;
3845 data->flags &= ~SF_HAS_PAR;
3847 else if (pars && data) {
3848 data->flags |= SF_HAS_PAR;
3849 data->flags &= ~SF_IN_PAR;
3851 if (flags & SCF_DO_STCLASS_OR)
3852 cl_and(data->start_class, and_withp);
3853 if (flags & SCF_TRIE_RESTUDY)
3854 data->flags |= SCF_TRIE_RESTUDY;
3856 DEBUG_STUDYDATA(data,depth);
3858 return min < stopmin ? min : stopmin;
3862 S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
3864 if (RExC_rx->data) {
3865 const U32 count = RExC_rx->data->count;
3866 Renewc(RExC_rx->data,
3867 sizeof(*RExC_rx->data) + sizeof(void*) * (count + n - 1),
3868 char, struct reg_data);
3869 Renew(RExC_rx->data->what, count + n, U8);
3870 RExC_rx->data->count += n;
3873 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
3874 char, struct reg_data);
3875 Newx(RExC_rx->data->what, n, U8);
3876 RExC_rx->data->count = n;
3878 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
3879 return RExC_rx->data->count - n;
3882 #ifndef PERL_IN_XSUB_RE
3884 Perl_reginitcolors(pTHX)
3887 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
3889 char *t = savepv(s);
3893 t = strchr(t, '\t');
3899 PL_colors[i] = t = (char *)"";
3904 PL_colors[i++] = (char *)"";
3911 #ifdef TRIE_STUDY_OPT
3912 #define CHECK_RESTUDY_GOTO \
3914 (data.flags & SCF_TRIE_RESTUDY) \
3918 #define CHECK_RESTUDY_GOTO
3922 - pregcomp - compile a regular expression into internal code
3924 * We can't allocate space until we know how big the compiled form will be,
3925 * but we can't compile it (and thus know how big it is) until we've got a
3926 * place to put the code. So we cheat: we compile it twice, once with code
3927 * generation turned off and size counting turned on, and once "for real".
3928 * This also means that we don't allocate space until we are sure that the
3929 * thing really will compile successfully, and we never have to move the
3930 * code and thus invalidate pointers into it. (Note that it has to be in
3931 * one piece because free() must be able to free it all.) [NB: not true in perl]
3933 * Beware that the optimization-preparation code in here knows about some
3934 * of the structure of the compiled regexp. [I'll say.]
3939 #ifndef PERL_IN_XSUB_RE
3940 #define RE_ENGINE_PTR &PL_core_reg_engine
3942 extern const struct regexp_engine my_reg_engine;
3943 #define RE_ENGINE_PTR &my_reg_engine
3945 /* these make a few things look better, to avoid indentation */
3946 #define BEGIN_BLOCK {
3950 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
3953 GET_RE_DEBUG_FLAGS_DECL;
3954 DEBUG_r(if (!PL_colorset) reginitcolors());
3955 #ifndef PERL_IN_XSUB_RE
3957 /* Dispatch a request to compile a regexp to correct
3959 HV * const table = GvHV(PL_hintgv);
3961 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
3962 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
3963 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
3965 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
3968 return CALLREGCOMP_ENG(eng, exp, xend, pm);
3982 RExC_state_t RExC_state;
3983 RExC_state_t * const pRExC_state = &RExC_state;
3984 #ifdef TRIE_STUDY_OPT
3986 RExC_state_t copyRExC_state;
3989 FAIL("NULL regexp argument");
3991 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
3995 SV *dsv= sv_newmortal();
3996 RE_PV_QUOTED_DECL(s, RExC_utf8,
3997 dsv, RExC_precomp, (xend - exp), 60);
3998 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
3999 PL_colors[4],PL_colors[5],s);
4001 RExC_flags = pm->op_pmflags;
4005 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4006 RExC_seen_evals = 0;
4009 /* First pass: determine size, legality. */
4018 RExC_emit = &PL_regdummy;
4019 RExC_whilem_seen = 0;
4020 RExC_charnames = NULL;
4021 RExC_open_parens = NULL;
4022 RExC_close_parens = NULL;
4024 RExC_paren_names = NULL;
4025 RExC_recurse = NULL;
4026 RExC_recurse_count = 0;
4028 #if 0 /* REGC() is (currently) a NOP at the first pass.
4029 * Clever compilers notice this and complain. --jhi */
4030 REGC((U8)REG_MAGIC, (char*)RExC_emit);
4032 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4033 if (reg(pRExC_state, 0, &flags,1) == NULL) {
4034 RExC_precomp = NULL;
4038 PerlIO_printf(Perl_debug_log,
4039 "Required size %"IVdf" nodes\n"
4040 "Starting second pass (creation)\n",
4043 RExC_lastparse=NULL;
4045 /* Small enough for pointer-storage convention?
4046 If extralen==0, this means that we will not need long jumps. */
4047 if (RExC_size >= 0x10000L && RExC_extralen)
4048 RExC_size += RExC_extralen;
4051 if (RExC_whilem_seen > 15)
4052 RExC_whilem_seen = 15;
4055 /* Make room for a sentinel value at the end of the program */
4059 /* Allocate space and zero-initialize. Note, the two step process
4060 of zeroing when in debug mode, thus anything assigned has to
4061 happen after that */
4062 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
4065 FAIL("Regexp out of space");
4067 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
4068 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
4070 /* bulk initialize fields with 0. */
4071 Zero(r, sizeof(regexp), char);
4074 /* non-zero initialization begins here */
4075 r->engine= RE_ENGINE_PTR;
4077 r->prelen = xend - exp;
4078 r->precomp = savepvn(RExC_precomp, r->prelen);
4079 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
4080 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4082 if (RExC_seen & REG_SEEN_RECURSE) {
4083 Newxz(RExC_open_parens, RExC_npar,regnode *);
4084 SAVEFREEPV(RExC_open_parens);
4085 Newxz(RExC_close_parens,RExC_npar,regnode *);
4086 SAVEFREEPV(RExC_close_parens);
4089 /* Useful during FAIL. */
4090 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4092 r->offsets[0] = RExC_size;
4094 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
4095 "%s %"UVuf" bytes for offset annotations.\n",
4096 r->offsets ? "Got" : "Couldn't get",
4097 (UV)((2*RExC_size+1) * sizeof(U32))));
4101 /* Second pass: emit code. */
4102 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
4108 RExC_emit_start = r->program;
4109 RExC_emit = r->program;
4111 /* put a sentinal on the end of the program so we can check for
4113 r->program[RExC_size].type = 255;
4115 /* Store the count of eval-groups for security checks: */
4116 RExC_emit->next_off = (RExC_seen_evals > (I32)U16_MAX) ? U16_MAX : (U16)RExC_seen_evals;
4117 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
4119 if (reg(pRExC_state, 0, &flags,1) == NULL)
4122 /* XXXX To minimize changes to RE engine we always allocate
4123 3-units-long substrs field. */
4124 Newx(r->substrs, 1, struct reg_substr_data);
4125 if (RExC_recurse_count) {
4126 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4127 SAVEFREEPV(RExC_recurse);
4131 r->minlen = minlen = sawplus = sawopen = 0;
4132 Zero(r->substrs, 1, struct reg_substr_data);
4134 #ifdef TRIE_STUDY_OPT
4137 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
4139 RExC_state = copyRExC_state;
4140 if (seen & REG_TOP_LEVEL_BRANCHES)
4141 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4143 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
4144 if (data.last_found) {
4145 SvREFCNT_dec(data.longest_fixed);
4146 SvREFCNT_dec(data.longest_float);
4147 SvREFCNT_dec(data.last_found);
4149 StructCopy(&zero_scan_data, &data, scan_data_t);
4151 StructCopy(&zero_scan_data, &data, scan_data_t);
4152 copyRExC_state = RExC_state;
4155 StructCopy(&zero_scan_data, &data, scan_data_t);
4158 /* Dig out information for optimizations. */
4159 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
4160 pm->op_pmflags = RExC_flags;
4162 r->reganch |= ROPT_UTF8; /* Unicode in it? */
4163 r->regstclass = NULL;
4164 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
4165 r->reganch |= ROPT_NAUGHTY;
4166 scan = r->program + 1; /* First BRANCH. */
4168 /* testing for BRANCH here tells us whether there is "must appear"
4169 data in the pattern. If there is then we can use it for optimisations */
4170 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
4172 STRLEN longest_float_length, longest_fixed_length;
4173 struct regnode_charclass_class ch_class; /* pointed to by data */
4175 I32 last_close = 0; /* pointed to by data */
4178 /* Skip introductions and multiplicators >= 1. */
4179 while ((OP(first) == OPEN && (sawopen = 1)) ||
4180 /* An OR of *one* alternative - should not happen now. */
4181 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
4182 /* for now we can't handle lookbehind IFMATCH*/
4183 (OP(first) == IFMATCH && !first->flags) ||
4184 (OP(first) == PLUS) ||
4185 (OP(first) == MINMOD) ||
4186 /* An {n,m} with n>0 */
4187 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) )
4190 if (OP(first) == PLUS)
4193 first += regarglen[OP(first)];
4194 if (OP(first) == IFMATCH) {
4195 first = NEXTOPER(first);
4196 first += EXTRA_STEP_2ARGS;
4197 } else /* XXX possible optimisation for /(?=)/ */
4198 first = NEXTOPER(first);
4201 /* Starting-point info. */
4203 DEBUG_PEEP("first:",first,0);
4204 /* Ignore EXACT as we deal with it later. */
4205 if (PL_regkind[OP(first)] == EXACT) {
4206 if (OP(first) == EXACT)
4207 NOOP; /* Empty, get anchored substr later. */
4208 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
4209 r->regstclass = first;
4212 else if (PL_regkind[OP(first)] == TRIE &&
4213 ((reg_trie_data *)r->data->data[ ARG(first) ])->minlen>0)
4216 /* this can happen only on restudy */
4217 if ( OP(first) == TRIE ) {
4218 struct regnode_1 *trieop;
4219 Newxz(trieop,1,struct regnode_1);
4220 StructCopy(first,trieop,struct regnode_1);
4221 trie_op=(regnode *)trieop;
4223 struct regnode_charclass *trieop;
4224 Newxz(trieop,1,struct regnode_charclass);
4225 StructCopy(first,trieop,struct regnode_charclass);
4226 trie_op=(regnode *)trieop;
4229 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4230 r->regstclass = trie_op;
4233 else if (strchr((const char*)PL_simple,OP(first)))
4234 r->regstclass = first;
4235 else if (PL_regkind[OP(first)] == BOUND ||
4236 PL_regkind[OP(first)] == NBOUND)
4237 r->regstclass = first;
4238 else if (PL_regkind[OP(first)] == BOL) {
4239 r->reganch |= (OP(first) == MBOL
4241 : (OP(first) == SBOL
4244 first = NEXTOPER(first);
4247 else if (OP(first) == GPOS) {
4248 r->reganch |= ROPT_ANCH_GPOS;
4249 first = NEXTOPER(first);
4252 else if ((!sawopen || !RExC_sawback) &&
4253 (OP(first) == STAR &&
4254 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
4255 !(r->reganch & ROPT_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
4257 /* turn .* into ^.* with an implied $*=1 */
4259 (OP(NEXTOPER(first)) == REG_ANY)
4262 r->reganch |= type | ROPT_IMPLICIT;
4263 first = NEXTOPER(first);
4266 if (sawplus && (!sawopen || !RExC_sawback)
4267 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
4268 /* x+ must match at the 1st pos of run of x's */
4269 r->reganch |= ROPT_SKIP;
4271 /* Scan is after the zeroth branch, first is atomic matcher. */
4272 #ifdef TRIE_STUDY_OPT
4275 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4276 (IV)(first - scan + 1))
4280 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4281 (IV)(first - scan + 1))
4287 * If there's something expensive in the r.e., find the
4288 * longest literal string that must appear and make it the
4289 * regmust. Resolve ties in favor of later strings, since
4290 * the regstart check works with the beginning of the r.e.
4291 * and avoiding duplication strengthens checking. Not a
4292 * strong reason, but sufficient in the absence of others.
4293 * [Now we resolve ties in favor of the earlier string if
4294 * it happens that c_offset_min has been invalidated, since the
4295 * earlier string may buy us something the later one won't.]
4298 data.longest_fixed = newSVpvs("");
4299 data.longest_float = newSVpvs("");
4300 data.last_found = newSVpvs("");
4301 data.longest = &(data.longest_fixed);
4303 if (!r->regstclass) {
4304 cl_init(pRExC_state, &ch_class);
4305 data.start_class = &ch_class;
4306 stclass_flag = SCF_DO_STCLASS_AND;
4307 } else /* XXXX Check for BOUND? */
4309 data.last_closep = &last_close;
4311 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
4312 &data, -1, NULL, NULL,
4313 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
4319 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
4320 && data.last_start_min == 0 && data.last_end > 0
4321 && !RExC_seen_zerolen
4322 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
4323 r->reganch |= ROPT_CHECK_ALL;
4324 scan_commit(pRExC_state, &data,&minlen);
4325 SvREFCNT_dec(data.last_found);
4327 /* Note that code very similar to this but for anchored string
4328 follows immediately below, changes may need to be made to both.
4331 longest_float_length = CHR_SVLEN(data.longest_float);
4332 if (longest_float_length
4333 || (data.flags & SF_FL_BEFORE_EOL
4334 && (!(data.flags & SF_FL_BEFORE_MEOL)
4335 || (RExC_flags & PMf_MULTILINE))))
4339 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
4340 && data.offset_fixed == data.offset_float_min
4341 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4342 goto remove_float; /* As in (a)+. */
4344 /* copy the information about the longest float from the reg_scan_data
4345 over to the program. */
4346 if (SvUTF8(data.longest_float)) {
4347 r->float_utf8 = data.longest_float;
4348 r->float_substr = NULL;
4350 r->float_substr = data.longest_float;
4351 r->float_utf8 = NULL;
4353 /* float_end_shift is how many chars that must be matched that
4354 follow this item. We calculate it ahead of time as once the
4355 lookbehind offset is added in we lose the ability to correctly
4357 ml = data.minlen_float ? *(data.minlen_float)
4358 : (I32)longest_float_length;
4359 r->float_end_shift = ml - data.offset_float_min
4360 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4361 + data.lookbehind_float;
4362 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
4363 r->float_max_offset = data.offset_float_max;
4364 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
4365 r->float_max_offset -= data.lookbehind_float;
4367 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4368 && (!(data.flags & SF_FL_BEFORE_MEOL)
4369 || (RExC_flags & PMf_MULTILINE)));
4370 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
4374 r->float_substr = r->float_utf8 = NULL;
4375 SvREFCNT_dec(data.longest_float);
4376 longest_float_length = 0;
4379 /* Note that code very similar to this but for floating string
4380 is immediately above, changes may need to be made to both.
4383 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
4384 if (longest_fixed_length
4385 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4386 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4387 || (RExC_flags & PMf_MULTILINE))))
4391 /* copy the information about the longest fixed
4392 from the reg_scan_data over to the program. */
4393 if (SvUTF8(data.longest_fixed)) {
4394 r->anchored_utf8 = data.longest_fixed;
4395 r->anchored_substr = NULL;
4397 r->anchored_substr = data.longest_fixed;
4398 r->anchored_utf8 = NULL;
4400 /* fixed_end_shift is how many chars that must be matched that
4401 follow this item. We calculate it ahead of time as once the
4402 lookbehind offset is added in we lose the ability to correctly
4404 ml = data.minlen_fixed ? *(data.minlen_fixed)
4405 : (I32)longest_fixed_length;
4406 r->anchored_end_shift = ml - data.offset_fixed
4407 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4408 + data.lookbehind_fixed;
4409 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4411 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4412 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4413 || (RExC_flags & PMf_MULTILINE)));
4414 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
4417 r->anchored_substr = r->anchored_utf8 = NULL;
4418 SvREFCNT_dec(data.longest_fixed);
4419 longest_fixed_length = 0;
4422 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
4423 r->regstclass = NULL;
4424 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4426 && !(data.start_class->flags & ANYOF_EOS)
4427 && !cl_is_anything(data.start_class))
4429 const I32 n = add_data(pRExC_state, 1, "f");
4431 Newx(RExC_rx->data->data[n], 1,
4432 struct regnode_charclass_class);
4433 StructCopy(data.start_class,
4434 (struct regnode_charclass_class*)RExC_rx->data->data[n],
4435 struct regnode_charclass_class);
4436 r->regstclass = (regnode*)RExC_rx->data->data[n];
4437 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
4438 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
4439 regprop(r, sv, (regnode*)data.start_class);
4440 PerlIO_printf(Perl_debug_log,
4441 "synthetic stclass \"%s\".\n",
4442 SvPVX_const(sv));});
4445 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
4446 if (longest_fixed_length > longest_float_length) {
4447 r->check_end_shift = r->anchored_end_shift;
4448 r->check_substr = r->anchored_substr;
4449 r->check_utf8 = r->anchored_utf8;
4450 r->check_offset_min = r->check_offset_max = r->anchored_offset;
4451 if (r->reganch & ROPT_ANCH_SINGLE)
4452 r->reganch |= ROPT_NOSCAN;
4455 r->check_end_shift = r->float_end_shift;
4456 r->check_substr = r->float_substr;
4457 r->check_utf8 = r->float_utf8;
4458 r->check_offset_min = r->float_min_offset;
4459 r->check_offset_max = r->float_max_offset;
4461 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4462 This should be changed ASAP! */
4463 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
4464 r->reganch |= RE_USE_INTUIT;
4465 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
4466 r->reganch |= RE_INTUIT_TAIL;
4468 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4469 if ( (STRLEN)minlen < longest_float_length )
4470 minlen= longest_float_length;
4471 if ( (STRLEN)minlen < longest_fixed_length )
4472 minlen= longest_fixed_length;
4476 /* Several toplevels. Best we can is to set minlen. */
4478 struct regnode_charclass_class ch_class;
4481 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
4483 scan = r->program + 1;
4484 cl_init(pRExC_state, &ch_class);
4485 data.start_class = &ch_class;
4486 data.last_closep = &last_close;
4489 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
4490 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
4494 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
4495 = r->float_substr = r->float_utf8 = NULL;
4496 if (!(data.start_class->flags & ANYOF_EOS)
4497 && !cl_is_anything(data.start_class))
4499 const I32 n = add_data(pRExC_state, 1, "f");
4501 Newx(RExC_rx->data->data[n], 1,
4502 struct regnode_charclass_class);
4503 StructCopy(data.start_class,
4504 (struct regnode_charclass_class*)RExC_rx->data->data[n],
4505 struct regnode_charclass_class);
4506 r->regstclass = (regnode*)RExC_rx->data->data[n];
4507 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
4508 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
4509 regprop(r, sv, (regnode*)data.start_class);
4510 PerlIO_printf(Perl_debug_log,
4511 "synthetic stclass \"%s\".\n",
4512 SvPVX_const(sv));});
4516 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4517 the "real" pattern. */
4519 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
4522 r->minlenret = minlen;
4523 if (r->minlen < minlen)
4526 if (RExC_seen & REG_SEEN_GPOS)
4527 r->reganch |= ROPT_GPOS_SEEN;
4528 if (RExC_seen & REG_SEEN_LOOKBEHIND)
4529 r->reganch |= ROPT_LOOKBEHIND_SEEN;
4530 if (RExC_seen & REG_SEEN_EVAL)
4531 r->reganch |= ROPT_EVAL_SEEN;
4532 if (RExC_seen & REG_SEEN_CANY)
4533 r->reganch |= ROPT_CANY_SEEN;
4534 if (RExC_seen & REG_SEEN_VERBARG)
4535 r->reganch |= ROPT_VERBARG_SEEN;
4536 if (RExC_seen & REG_SEEN_CUTGROUP)
4537 r->reganch |= ROPT_CUTGROUP_SEEN;
4538 if (RExC_paren_names)
4539 r->paren_names = (HV*)SvREFCNT_inc(RExC_paren_names);
4541 r->paren_names = NULL;
4543 if (RExC_recurse_count) {
4544 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
4545 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
4546 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
4549 Newxz(r->startp, RExC_npar, I32);
4550 Newxz(r->endp, RExC_npar, I32);
4551 /* assume we don't need to swap parens around before we match */
4554 PerlIO_printf(Perl_debug_log,"Final program:\n");
4557 DEBUG_OFFSETS_r(if (r->offsets) {
4558 const U32 len = r->offsets[0];
4560 GET_RE_DEBUG_FLAGS_DECL;
4561 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
4562 for (i = 1; i <= len; i++) {
4563 if (r->offsets[i*2-1] || r->offsets[i*2])
4564 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
4565 (UV)i, (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
4567 PerlIO_printf(Perl_debug_log, "\n");
4573 #undef CORE_ONLY_BLOCK
4575 #undef RE_ENGINE_PTR
4577 #ifndef PERL_IN_XSUB_RE
4579 Perl_reg_named_buff_sv(pTHX_ SV* namesv)
4581 I32 parno = 0; /* no match */
4583 const REGEXP * const rx = PM_GETRE(PL_curpm);
4584 if (rx && rx->paren_names) {
4585 HE *he_str = hv_fetch_ent( rx->paren_names, namesv, 0, 0 );
4588 SV* sv_dat=HeVAL(he_str);
4589 I32 *nums=(I32*)SvPVX(sv_dat);
4590 for ( i=0; i<SvIVX(sv_dat); i++ ) {
4591 if ((I32)(rx->lastparen) >= nums[i] &&
4592 rx->endp[nums[i]] != -1)
4605 SV *sv= sv_newmortal();
4606 Perl_sv_setpvf(aTHX_ sv, "%"IVdf,(IV)parno);
4607 gv_paren= Perl_gv_fetchsv(aTHX_ sv, GV_ADD, SVt_PVGV);
4608 return GvSVn(gv_paren);
4613 /* Scans the name of a named buffer from the pattern.
4614 * If flags is REG_RSN_RETURN_NULL returns null.
4615 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
4616 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
4617 * to the parsed name as looked up in the RExC_paren_names hash.
4618 * If there is an error throws a vFAIL().. type exception.
4621 #define REG_RSN_RETURN_NULL 0
4622 #define REG_RSN_RETURN_NAME 1
4623 #define REG_RSN_RETURN_DATA 2
4626 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) {
4627 char *name_start = RExC_parse;
4630 while( isIDFIRST_uni(utf8n_to_uvchr((U8*)RExC_parse,
4631 RExC_end - RExC_parse, &numlen, UTF8_ALLOW_DEFAULT)))
4633 RExC_parse += numlen;
4636 while( isIDFIRST(*RExC_parse) )
4640 SV* sv_name = sv_2mortal(Perl_newSVpvn(aTHX_ name_start,
4641 (int)(RExC_parse - name_start)));
4644 if ( flags == REG_RSN_RETURN_NAME)
4646 else if (flags==REG_RSN_RETURN_DATA) {
4649 if ( ! sv_name ) /* should not happen*/
4650 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
4651 if (RExC_paren_names)
4652 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
4654 sv_dat = HeVAL(he_str);
4656 vFAIL("Reference to nonexistent named group");
4660 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
4667 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
4668 int rem=(int)(RExC_end - RExC_parse); \
4677 if (RExC_lastparse!=RExC_parse) \
4678 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
4681 iscut ? "..." : "<" \
4684 PerlIO_printf(Perl_debug_log,"%16s",""); \
4689 num=REG_NODE_NUM(RExC_emit); \
4690 if (RExC_lastnum!=num) \
4691 PerlIO_printf(Perl_debug_log,"|%4d",num); \
4693 PerlIO_printf(Perl_debug_log,"|%4s",""); \
4694 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
4695 (int)((depth*2)), "", \
4699 RExC_lastparse=RExC_parse; \
4704 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
4705 DEBUG_PARSE_MSG((funcname)); \
4706 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
4708 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
4709 DEBUG_PARSE_MSG((funcname)); \
4710 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
4713 - reg - regular expression, i.e. main body or parenthesized thing
4715 * Caller must absorb opening parenthesis.
4717 * Combining parenthesis handling with the base level of regular expression
4718 * is a trifle forced, but the need to tie the tails of the branches to what
4719 * follows makes it hard to avoid.
4721 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
4723 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
4725 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
4728 /* this idea is borrowed from STR_WITH_LEN in handy.h */
4729 #define CHECK_WORD(s,v,l) \
4730 (((sizeof(s)-1)==(l)) && (strnEQ(start_verb, (s ""), (sizeof(s)-1))))
4733 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
4734 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
4737 register regnode *ret; /* Will be the head of the group. */
4738 register regnode *br;
4739 register regnode *lastbr;
4740 register regnode *ender = NULL;
4741 register I32 parno = 0;
4743 const I32 oregflags = RExC_flags;
4744 bool have_branch = 0;
4747 /* for (?g), (?gc), and (?o) warnings; warning
4748 about (?c) will warn about (?g) -- japhy */
4750 #define WASTED_O 0x01
4751 #define WASTED_G 0x02
4752 #define WASTED_C 0x04
4753 #define WASTED_GC (0x02|0x04)
4754 I32 wastedflags = 0x00;
4756 char * parse_start = RExC_parse; /* MJD */
4757 char * const oregcomp_parse = RExC_parse;
4759 GET_RE_DEBUG_FLAGS_DECL;
4760 DEBUG_PARSE("reg ");
4763 *flagp = 0; /* Tentatively. */
4766 /* Make an OPEN node, if parenthesized. */
4768 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
4769 char *start_verb = RExC_parse;
4770 STRLEN verb_len = 0;
4771 char *start_arg = NULL;
4772 unsigned char op = 0;
4774 int internal_argval = 0; /* internal_argval is only useful if !argok */
4775 while ( *RExC_parse && *RExC_parse != ')' ) {
4776 if ( *RExC_parse == ':' ) {
4777 start_arg = RExC_parse + 1;
4783 verb_len = RExC_parse - start_verb;
4786 while ( *RExC_parse && *RExC_parse != ')' )
4788 if ( *RExC_parse != ')' )
4789 vFAIL("Unterminated verb pattern argument");
4790 if ( RExC_parse == start_arg )
4793 if ( *RExC_parse != ')' )
4794 vFAIL("Unterminated verb pattern");
4797 switch ( *start_verb ) {
4798 case 'A': /* (*ACCEPT) */
4799 if ( CHECK_WORD("ACCEPT",start_verb,verb_len) ) {
4801 internal_argval = RExC_nestroot;
4804 case 'C': /* (*COMMIT) */
4805 if ( CHECK_WORD("COMMIT",start_verb,verb_len) )
4808 case 'F': /* (*FAIL) */
4809 if ( verb_len==1 || CHECK_WORD("FAIL",start_verb,verb_len) ) {
4814 case ':': /* (*:NAME) */
4815 case 'M': /* (*MARK:NAME) */
4816 if ( verb_len==0 || CHECK_WORD("MARK",start_verb,verb_len) ) {
4821 case 'P': /* (*PRUNE) */
4822 if ( CHECK_WORD("PRUNE",start_verb,verb_len) )
4825 case 'S': /* (*SKIP) */
4826 if ( CHECK_WORD("SKIP",start_verb,verb_len) )
4829 case 'T': /* (*THEN) */
4830 /* [19:06] <TimToady> :: is then */
4831 if ( CHECK_WORD("THEN",start_verb,verb_len) ) {
4833 RExC_seen |= REG_SEEN_CUTGROUP;
4839 vFAIL3("Unknown verb pattern '%.*s'",
4840 verb_len, start_verb);
4843 if ( start_arg && internal_argval ) {
4844 vFAIL3("Verb pattern '%.*s' may not have an argument",
4845 verb_len, start_verb);
4846 } else if ( argok < 0 && !start_arg ) {
4847 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
4848 verb_len, start_verb);
4850 ret = reganode(pRExC_state, op, internal_argval);
4851 if ( ! internal_argval && ! SIZE_ONLY ) {
4853 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
4854 ARG(ret) = add_data( pRExC_state, 1, "S" );
4855 RExC_rx->data->data[ARG(ret)]=(void*)sv;
4862 if (!internal_argval)
4863 RExC_seen |= REG_SEEN_VERBARG;
4864 } else if ( start_arg ) {
4865 vFAIL3("Verb pattern '%.*s' may not have an argument",
4866 verb_len, start_verb);
4868 ret = reg_node(pRExC_state, op);
4870 nextchar(pRExC_state);
4873 if (*RExC_parse == '?') { /* (?...) */
4874 U32 posflags = 0, negflags = 0;
4875 U32 *flagsp = &posflags;
4876 bool is_logical = 0;
4877 const char * const seqstart = RExC_parse;
4880 paren = *RExC_parse++;
4881 ret = NULL; /* For look-ahead/behind. */
4884 case '<': /* (?<...) */
4885 if (*RExC_parse == '!')
4887 else if (*RExC_parse != '=')
4892 case '\'': /* (?'...') */
4893 name_start= RExC_parse;
4894 svname = reg_scan_name(pRExC_state,
4895 SIZE_ONLY ? /* reverse test from the others */
4896 REG_RSN_RETURN_NAME :
4897 REG_RSN_RETURN_NULL);
4898 if (RExC_parse == name_start)
4900 if (*RExC_parse != paren)
4901 vFAIL2("Sequence (?%c... not terminated",
4902 paren=='>' ? '<' : paren);
4906 if (!svname) /* shouldnt happen */
4908 "panic: reg_scan_name returned NULL");
4909 if (!RExC_paren_names) {
4910 RExC_paren_names= newHV();
4911 sv_2mortal((SV*)RExC_paren_names);
4913 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
4915 sv_dat = HeVAL(he_str);
4917 /* croak baby croak */
4919 "panic: paren_name hash element allocation failed");
4920 } else if ( SvPOK(sv_dat) ) {
4921 IV count=SvIV(sv_dat);
4922 I32 *pv=(I32*)SvGROW(sv_dat,SvCUR(sv_dat)+sizeof(I32)+1);
4923 SvCUR_set(sv_dat,SvCUR(sv_dat)+sizeof(I32));
4924 pv[count]=RExC_npar;
4927 (void)SvUPGRADE(sv_dat,SVt_PVNV);
4928 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
4933 /*sv_dump(sv_dat);*/
4935 nextchar(pRExC_state);
4937 goto capturing_parens;
4939 RExC_seen |= REG_SEEN_LOOKBEHIND;
4941 case '=': /* (?=...) */
4942 case '!': /* (?!...) */
4943 RExC_seen_zerolen++;
4944 if (*RExC_parse == ')') {
4945 ret=reg_node(pRExC_state, OPFAIL);
4946 nextchar(pRExC_state);
4949 case ':': /* (?:...) */
4950 case '>': /* (?>...) */
4952 case '$': /* (?$...) */
4953 case '@': /* (?@...) */
4954 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
4956 case '#': /* (?#...) */
4957 while (*RExC_parse && *RExC_parse != ')')
4959 if (*RExC_parse != ')')
4960 FAIL("Sequence (?#... not terminated");
4961 nextchar(pRExC_state);
4964 case '0' : /* (?0) */
4965 case 'R' : /* (?R) */
4966 if (*RExC_parse != ')')
4967 FAIL("Sequence (?R) not terminated");
4968 ret = reg_node(pRExC_state, GOSTART);
4969 nextchar(pRExC_state);
4972 { /* named and numeric backreferences */
4975 case '&': /* (?&NAME) */
4976 parse_start = RExC_parse - 1;
4978 SV *sv_dat = reg_scan_name(pRExC_state,
4979 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
4980 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
4982 goto gen_recurse_regop;
4985 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
4987 vFAIL("Illegal pattern");
4989 goto parse_recursion;
4991 case '-': /* (?-1) */
4992 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
4993 RExC_parse--; /* rewind to let it be handled later */
4997 case '1': case '2': case '3': case '4': /* (?1) */
4998 case '5': case '6': case '7': case '8': case '9':
5001 num = atoi(RExC_parse);
5002 parse_start = RExC_parse - 1; /* MJD */
5003 if (*RExC_parse == '-')
5005 while (isDIGIT(*RExC_parse))
5007 if (*RExC_parse!=')')
5008 vFAIL("Expecting close bracket");
5011 if ( paren == '-' ) {
5013 Diagram of capture buffer numbering.
5014 Top line is the normal capture buffer numbers
5015 Botton line is the negative indexing as from
5019 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
5023 num = RExC_npar + num;
5026 vFAIL("Reference to nonexistent group");
5028 } else if ( paren == '+' ) {
5029 num = RExC_npar + num - 1;
5032 ret = reganode(pRExC_state, GOSUB, num);
5034 if (num > (I32)RExC_rx->nparens) {
5036 vFAIL("Reference to nonexistent group");
5038 ARG2L_SET( ret, RExC_recurse_count++);
5040 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5041 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
5045 RExC_seen |= REG_SEEN_RECURSE;
5046 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
5047 Set_Node_Offset(ret, parse_start); /* MJD */
5049 nextchar(pRExC_state);
5051 } /* named and numeric backreferences */
5054 case 'p': /* (?p...) */
5055 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
5056 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
5058 case '?': /* (??...) */
5060 if (*RExC_parse != '{')
5062 paren = *RExC_parse++;
5064 case '{': /* (?{...}) */
5066 I32 count = 1, n = 0;
5068 char *s = RExC_parse;
5070 RExC_seen_zerolen++;
5071 RExC_seen |= REG_SEEN_EVAL;
5072 while (count && (c = *RExC_parse)) {
5083 if (*RExC_parse != ')') {
5085 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
5089 OP_4tree *sop, *rop;
5090 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
5093 Perl_save_re_context(aTHX);
5094 rop = sv_compile_2op(sv, &sop, "re", &pad);
5095 sop->op_private |= OPpREFCOUNTED;
5096 /* re_dup will OpREFCNT_inc */
5097 OpREFCNT_set(sop, 1);
5100 n = add_data(pRExC_state, 3, "nop");
5101 RExC_rx->data->data[n] = (void*)rop;
5102 RExC_rx->data->data[n+1] = (void*)sop;
5103 RExC_rx->data->data[n+2] = (void*)pad;
5106 else { /* First pass */
5107 if (PL_reginterp_cnt < ++RExC_seen_evals
5109 /* No compiled RE interpolated, has runtime
5110 components ===> unsafe. */
5111 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5112 if (PL_tainting && PL_tainted)
5113 FAIL("Eval-group in insecure regular expression");
5114 #if PERL_VERSION > 8
5115 if (IN_PERL_COMPILETIME)
5120 nextchar(pRExC_state);
5122 ret = reg_node(pRExC_state, LOGICAL);
5125 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
5126 /* deal with the length of this later - MJD */
5129 ret = reganode(pRExC_state, EVAL, n);
5130 Set_Node_Length(ret, RExC_parse - parse_start + 1);
5131 Set_Node_Offset(ret, parse_start);
5134 case '(': /* (?(?{...})...) and (?(?=...)...) */
5137 if (RExC_parse[0] == '?') { /* (?(?...)) */
5138 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
5139 || RExC_parse[1] == '<'
5140 || RExC_parse[1] == '{') { /* Lookahead or eval. */
5143 ret = reg_node(pRExC_state, LOGICAL);
5146 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
5150 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
5151 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
5153 char ch = RExC_parse[0] == '<' ? '>' : '\'';
5154 char *name_start= RExC_parse++;
5156 SV *sv_dat=reg_scan_name(pRExC_state,
5157 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5158 if (RExC_parse == name_start || *RExC_parse != ch)
5159 vFAIL2("Sequence (?(%c... not terminated",
5160 (ch == '>' ? '<' : ch));
5163 num = add_data( pRExC_state, 1, "S" );
5164 RExC_rx->data->data[num]=(void*)sv_dat;
5165 SvREFCNT_inc(sv_dat);
5167 ret = reganode(pRExC_state,NGROUPP,num);
5168 goto insert_if_check_paren;
5170 else if (RExC_parse[0] == 'D' &&
5171 RExC_parse[1] == 'E' &&
5172 RExC_parse[2] == 'F' &&
5173 RExC_parse[3] == 'I' &&
5174 RExC_parse[4] == 'N' &&
5175 RExC_parse[5] == 'E')
5177 ret = reganode(pRExC_state,DEFINEP,0);
5180 goto insert_if_check_paren;
5182 else if (RExC_parse[0] == 'R') {
5185 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5186 parno = atoi(RExC_parse++);
5187 while (isDIGIT(*RExC_parse))
5189 } else if (RExC_parse[0] == '&') {
5192 sv_dat = reg_scan_name(pRExC_state,
5193 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5194 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5196 ret = reganode(pRExC_state,INSUBP,parno);
5197 goto insert_if_check_paren;
5199 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5202 parno = atoi(RExC_parse++);
5204 while (isDIGIT(*RExC_parse))
5206 ret = reganode(pRExC_state, GROUPP, parno);
5208 insert_if_check_paren:
5209 if ((c = *nextchar(pRExC_state)) != ')')
5210 vFAIL("Switch condition not recognized");
5212 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
5213 br = regbranch(pRExC_state, &flags, 1,depth+1);
5215 br = reganode(pRExC_state, LONGJMP, 0);
5217 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
5218 c = *nextchar(pRExC_state);
5223 vFAIL("(?(DEFINE)....) does not allow branches");
5224 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
5225 regbranch(pRExC_state, &flags, 1,depth+1);
5226 REGTAIL(pRExC_state, ret, lastbr);
5229 c = *nextchar(pRExC_state);
5234 vFAIL("Switch (?(condition)... contains too many branches");
5235 ender = reg_node(pRExC_state, TAIL);
5236 REGTAIL(pRExC_state, br, ender);
5238 REGTAIL(pRExC_state, lastbr, ender);
5239 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
5242 REGTAIL(pRExC_state, ret, ender);
5246 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
5250 RExC_parse--; /* for vFAIL to print correctly */
5251 vFAIL("Sequence (? incomplete");
5255 parse_flags: /* (?i) */
5256 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
5257 /* (?g), (?gc) and (?o) are useless here
5258 and must be globally applied -- japhy */
5260 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
5261 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5262 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
5263 if (! (wastedflags & wflagbit) ) {
5264 wastedflags |= wflagbit;
5267 "Useless (%s%c) - %suse /%c modifier",
5268 flagsp == &negflags ? "?-" : "?",
5270 flagsp == &negflags ? "don't " : "",
5276 else if (*RExC_parse == 'c') {
5277 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5278 if (! (wastedflags & WASTED_C) ) {
5279 wastedflags |= WASTED_GC;
5282 "Useless (%sc) - %suse /gc modifier",
5283 flagsp == &negflags ? "?-" : "?",
5284 flagsp == &negflags ? "don't " : ""
5289 else { pmflag(flagsp, *RExC_parse); }
5293 if (*RExC_parse == '-') {
5295 wastedflags = 0; /* reset so (?g-c) warns twice */
5299 RExC_flags |= posflags;
5300 RExC_flags &= ~negflags;
5301 if (*RExC_parse == ':') {
5307 if (*RExC_parse != ')') {
5309 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5311 nextchar(pRExC_state);
5321 ret = reganode(pRExC_state, OPEN, parno);
5324 RExC_nestroot = parno;
5325 if (RExC_seen & REG_SEEN_RECURSE) {
5326 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5327 "Setting open paren #%"IVdf" to %d\n",
5328 (IV)parno, REG_NODE_NUM(ret)));
5329 RExC_open_parens[parno-1]= ret;
5332 Set_Node_Length(ret, 1); /* MJD */
5333 Set_Node_Offset(ret, RExC_parse); /* MJD */
5340 /* Pick up the branches, linking them together. */
5341 parse_start = RExC_parse; /* MJD */
5342 br = regbranch(pRExC_state, &flags, 1,depth+1);
5343 /* branch_len = (paren != 0); */
5347 if (*RExC_parse == '|') {
5348 if (!SIZE_ONLY && RExC_extralen) {
5349 reginsert(pRExC_state, BRANCHJ, br, depth+1);
5352 reginsert(pRExC_state, BRANCH, br, depth+1);
5353 Set_Node_Length(br, paren != 0);
5354 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
5358 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
5360 else if (paren == ':') {
5361 *flagp |= flags&SIMPLE;
5363 if (is_open) { /* Starts with OPEN. */
5364 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
5366 else if (paren != '?') /* Not Conditional */
5368 *flagp |= flags & (SPSTART | HASWIDTH);
5370 while (*RExC_parse == '|') {
5371 if (!SIZE_ONLY && RExC_extralen) {
5372 ender = reganode(pRExC_state, LONGJMP,0);
5373 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
5376 RExC_extralen += 2; /* Account for LONGJMP. */
5377 nextchar(pRExC_state);
5378 br = regbranch(pRExC_state, &flags, 0, depth+1);
5382 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
5386 *flagp |= flags&SPSTART;
5389 if (have_branch || paren != ':') {
5390 /* Make a closing node, and hook it on the end. */
5393 ender = reg_node(pRExC_state, TAIL);
5397 ender = reganode(pRExC_state, CLOSE, parno);
5398 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
5399 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5400 "Setting close paren #%"IVdf" to %d\n",
5401 (IV)parno, REG_NODE_NUM(ender)));
5402 RExC_close_parens[parno-1]= ender;
5403 if (RExC_nestroot == parno)
5406 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
5407 Set_Node_Length(ender,1); /* MJD */
5413 *flagp &= ~HASWIDTH;
5416 ender = reg_node(pRExC_state, SUCCEED);
5419 ender = reg_node(pRExC_state, END);
5421 assert(!RExC_opend); /* there can only be one! */
5426 REGTAIL(pRExC_state, lastbr, ender);
5428 if (have_branch && !SIZE_ONLY) {
5430 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
5432 /* Hook the tails of the branches to the closing node. */
5433 for (br = ret; br; br = regnext(br)) {
5434 const U8 op = PL_regkind[OP(br)];
5436 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
5438 else if (op == BRANCHJ) {
5439 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
5447 static const char parens[] = "=!<,>";
5449 if (paren && (p = strchr(parens, paren))) {
5450 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
5451 int flag = (p - parens) > 1;
5454 node = SUSPEND, flag = 0;
5455 reginsert(pRExC_state, node,ret, depth+1);
5456 Set_Node_Cur_Length(ret);
5457 Set_Node_Offset(ret, parse_start + 1);
5459 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
5463 /* Check for proper termination. */
5465 RExC_flags = oregflags;
5466 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
5467 RExC_parse = oregcomp_parse;
5468 vFAIL("Unmatched (");
5471 else if (!paren && RExC_parse < RExC_end) {
5472 if (*RExC_parse == ')') {
5474 vFAIL("Unmatched )");
5477 FAIL("Junk on end of regexp"); /* "Can't happen". */
5485 - regbranch - one alternative of an | operator
5487 * Implements the concatenation operator.
5490 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
5493 register regnode *ret;
5494 register regnode *chain = NULL;
5495 register regnode *latest;
5496 I32 flags = 0, c = 0;
5497 GET_RE_DEBUG_FLAGS_DECL;
5498 DEBUG_PARSE("brnc");
5502 if (!SIZE_ONLY && RExC_extralen)
5503 ret = reganode(pRExC_state, BRANCHJ,0);
5505 ret = reg_node(pRExC_state, BRANCH);
5506 Set_Node_Length(ret, 1);
5510 if (!first && SIZE_ONLY)
5511 RExC_extralen += 1; /* BRANCHJ */
5513 *flagp = WORST; /* Tentatively. */
5516 nextchar(pRExC_state);
5517 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
5519 latest = regpiece(pRExC_state, &flags,depth+1);
5520 if (latest == NULL) {
5521 if (flags & TRYAGAIN)
5525 else if (ret == NULL)
5527 *flagp |= flags&HASWIDTH;
5528 if (chain == NULL) /* First piece. */
5529 *flagp |= flags&SPSTART;
5532 REGTAIL(pRExC_state, chain, latest);
5537 if (chain == NULL) { /* Loop ran zero times. */
5538 chain = reg_node(pRExC_state, NOTHING);
5543 *flagp |= flags&SIMPLE;
5550 - regpiece - something followed by possible [*+?]
5552 * Note that the branching code sequences used for ? and the general cases
5553 * of * and + are somewhat optimized: they use the same NOTHING node as
5554 * both the endmarker for their branch list and the body of the last branch.
5555 * It might seem that this node could be dispensed with entirely, but the
5556 * endmarker role is not redundant.
5559 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
5562 register regnode *ret;
5564 register char *next;
5566 const char * const origparse = RExC_parse;
5568 I32 max = REG_INFTY;
5570 const char *maxpos = NULL;
5571 GET_RE_DEBUG_FLAGS_DECL;
5572 DEBUG_PARSE("piec");
5574 ret = regatom(pRExC_state, &flags,depth+1);
5576 if (flags & TRYAGAIN)
5583 if (op == '{' && regcurly(RExC_parse)) {
5585 parse_start = RExC_parse; /* MJD */
5586 next = RExC_parse + 1;
5587 while (isDIGIT(*next) || *next == ',') {
5596 if (*next == '}') { /* got one */
5600 min = atoi(RExC_parse);
5604 maxpos = RExC_parse;
5606 if (!max && *maxpos != '0')
5607 max = REG_INFTY; /* meaning "infinity" */
5608 else if (max >= REG_INFTY)
5609 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
5611 nextchar(pRExC_state);
5614 if ((flags&SIMPLE)) {
5615 RExC_naughty += 2 + RExC_naughty / 2;
5616 reginsert(pRExC_state, CURLY, ret, depth+1);
5617 Set_Node_Offset(ret, parse_start+1); /* MJD */
5618 Set_Node_Cur_Length(ret);
5621 regnode * const w = reg_node(pRExC_state, WHILEM);
5624 REGTAIL(pRExC_state, ret, w);
5625 if (!SIZE_ONLY && RExC_extralen) {
5626 reginsert(pRExC_state, LONGJMP,ret, depth+1);
5627 reginsert(pRExC_state, NOTHING,ret, depth+1);
5628 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
5630 reginsert(pRExC_state, CURLYX,ret, depth+1);
5632 Set_Node_Offset(ret, parse_start+1);
5633 Set_Node_Length(ret,
5634 op == '{' ? (RExC_parse - parse_start) : 1);
5636 if (!SIZE_ONLY && RExC_extralen)
5637 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
5638 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
5640 RExC_whilem_seen++, RExC_extralen += 3;
5641 RExC_naughty += 4 + RExC_naughty; /* compound interest */
5649 if (max && max < min)
5650 vFAIL("Can't do {n,m} with n > m");
5652 ARG1_SET(ret, (U16)min);
5653 ARG2_SET(ret, (U16)max);
5665 #if 0 /* Now runtime fix should be reliable. */
5667 /* if this is reinstated, don't forget to put this back into perldiag:
5669 =item Regexp *+ operand could be empty at {#} in regex m/%s/
5671 (F) The part of the regexp subject to either the * or + quantifier
5672 could match an empty string. The {#} shows in the regular
5673 expression about where the problem was discovered.
5677 if (!(flags&HASWIDTH) && op != '?')
5678 vFAIL("Regexp *+ operand could be empty");
5681 parse_start = RExC_parse;
5682 nextchar(pRExC_state);
5684 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
5686 if (op == '*' && (flags&SIMPLE)) {
5687 reginsert(pRExC_state, STAR, ret, depth+1);
5691 else if (op == '*') {
5695 else if (op == '+' && (flags&SIMPLE)) {
5696 reginsert(pRExC_state, PLUS, ret, depth+1);
5700 else if (op == '+') {
5704 else if (op == '?') {
5709 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
5711 "%.*s matches null string many times",
5712 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
5716 if (RExC_parse < RExC_end && *RExC_parse == '?') {
5717 nextchar(pRExC_state);
5718 reginsert(pRExC_state, MINMOD, ret, depth+1);
5719 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
5721 #ifndef REG_ALLOW_MINMOD_SUSPEND
5724 if (RExC_parse < RExC_end && *RExC_parse == '+') {
5726 nextchar(pRExC_state);
5727 ender = reg_node(pRExC_state, SUCCEED);
5728 REGTAIL(pRExC_state, ret, ender);
5729 reginsert(pRExC_state, SUSPEND, ret, depth+1);
5731 ender = reg_node(pRExC_state, TAIL);
5732 REGTAIL(pRExC_state, ret, ender);
5736 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
5738 vFAIL("Nested quantifiers");
5745 /* reg_namedseq(pRExC_state,UVp)
5747 This is expected to be called by a parser routine that has
5748 recognized'\N' and needs to handle the rest. RExC_parse is
5749 expected to point at the first char following the N at the time
5752 If valuep is non-null then it is assumed that we are parsing inside
5753 of a charclass definition and the first codepoint in the resolved
5754 string is returned via *valuep and the routine will return NULL.
5755 In this mode if a multichar string is returned from the charnames
5756 handler a warning will be issued, and only the first char in the
5757 sequence will be examined. If the string returned is zero length
5758 then the value of *valuep is undefined and NON-NULL will
5759 be returned to indicate failure. (This will NOT be a valid pointer
5762 If value is null then it is assumed that we are parsing normal text
5763 and inserts a new EXACT node into the program containing the resolved
5764 string and returns a pointer to the new node. If the string is
5765 zerolength a NOTHING node is emitted.
5767 On success RExC_parse is set to the char following the endbrace.
5768 Parsing failures will generate a fatal errorvia vFAIL(...)
5770 NOTE: We cache all results from the charnames handler locally in
5771 the RExC_charnames hash (created on first use) to prevent a charnames
5772 handler from playing silly-buggers and returning a short string and
5773 then a long string for a given pattern. Since the regexp program
5774 size is calculated during an initial parse this would result
5775 in a buffer overrun so we cache to prevent the charname result from
5776 changing during the course of the parse.
5780 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep)
5782 char * name; /* start of the content of the name */
5783 char * endbrace; /* endbrace following the name */
5786 STRLEN len; /* this has various purposes throughout the code */
5787 bool cached = 0; /* if this is true then we shouldn't refcount dev sv_str */
5788 regnode *ret = NULL;
5790 if (*RExC_parse != '{') {
5791 vFAIL("Missing braces on \\N{}");
5793 name = RExC_parse+1;
5794 endbrace = strchr(RExC_parse, '}');
5797 vFAIL("Missing right brace on \\N{}");
5799 RExC_parse = endbrace + 1;
5802 /* RExC_parse points at the beginning brace,
5803 endbrace points at the last */
5804 if ( name[0]=='U' && name[1]=='+' ) {
5805 /* its a "unicode hex" notation {U+89AB} */
5806 I32 fl = PERL_SCAN_ALLOW_UNDERSCORES
5807 | PERL_SCAN_DISALLOW_PREFIX
5808 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
5810 len = (STRLEN)(endbrace - name - 2);
5811 cp = grok_hex(name + 2, &len, &fl, NULL);
5812 if ( len != (STRLEN)(endbrace - name - 2) ) {
5821 sv_str= Perl_newSVpvf_nocontext("%c",(int)cp);
5823 /* fetch the charnames handler for this scope */
5824 HV * const table = GvHV(PL_hintgv);
5826 hv_fetchs(table, "charnames", FALSE) :
5828 SV *cv= cvp ? *cvp : NULL;
5831 /* create an SV with the name as argument */
5832 sv_name = newSVpvn(name, endbrace - name);
5834 if (!table || !(PL_hints & HINT_LOCALIZE_HH)) {
5835 vFAIL2("Constant(\\N{%s}) unknown: "
5836 "(possibly a missing \"use charnames ...\")",
5839 if (!cvp || !SvOK(*cvp)) { /* when $^H{charnames} = undef; */
5840 vFAIL2("Constant(\\N{%s}): "
5841 "$^H{charnames} is not defined",SvPVX(sv_name));
5846 if (!RExC_charnames) {
5847 /* make sure our cache is allocated */
5848 RExC_charnames = newHV();
5849 sv_2mortal((SV*)RExC_charnames);
5851 /* see if we have looked this one up before */
5852 he_str = hv_fetch_ent( RExC_charnames, sv_name, 0, 0 );
5854 sv_str = HeVAL(he_str);
5867 count= call_sv(cv, G_SCALAR);
5869 if (count == 1) { /* XXXX is this right? dmq */
5871 SvREFCNT_inc_simple_void(sv_str);
5879 if ( !sv_str || !SvOK(sv_str) ) {
5880 vFAIL2("Constant(\\N{%s}): Call to &{$^H{charnames}} "
5881 "did not return a defined value",SvPVX(sv_name));
5883 if (hv_store_ent( RExC_charnames, sv_name, sv_str, 0))
5888 char *p = SvPV(sv_str, len);
5891 if ( SvUTF8(sv_str) ) {
5892 *valuep = utf8_to_uvchr((U8*)p, &numlen);
5896 We have to turn on utf8 for high bit chars otherwise
5897 we get failures with
5899 "ss" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
5900 "SS" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
5902 This is different from what \x{} would do with the same
5903 codepoint, where the condition is > 0xFF.
5910 /* warn if we havent used the whole string? */
5912 if (numlen<len && SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5914 "Ignoring excess chars from \\N{%s} in character class",
5918 } else if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5920 "Ignoring zero length \\N{%s} in character class",
5925 SvREFCNT_dec(sv_name);
5927 SvREFCNT_dec(sv_str);
5928 return len ? NULL : (regnode *)&len;
5929 } else if(SvCUR(sv_str)) {
5934 char * parse_start = name-3; /* needed for the offsets */
5935 GET_RE_DEBUG_FLAGS_DECL; /* needed for the offsets */
5937 ret = reg_node(pRExC_state,
5938 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
5941 if ( RExC_utf8 && !SvUTF8(sv_str) ) {
5942 sv_utf8_upgrade(sv_str);
5943 } else if ( !RExC_utf8 && SvUTF8(sv_str) ) {
5947 p = SvPV(sv_str, len);
5949 /* len is the length written, charlen is the size the char read */
5950 for ( len = 0; p < pend; p += charlen ) {
5952 UV uvc = utf8_to_uvchr((U8*)p, &charlen);
5954 STRLEN foldlen,numlen;
5955 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
5956 uvc = toFOLD_uni(uvc, tmpbuf, &foldlen);
5957 /* Emit all the Unicode characters. */
5959 for (foldbuf = tmpbuf;
5963 uvc = utf8_to_uvchr(foldbuf, &numlen);
5965 const STRLEN unilen = reguni(pRExC_state, uvc, s);
5968 /* In EBCDIC the numlen
5969 * and unilen can differ. */
5971 if (numlen >= foldlen)
5975 break; /* "Can't happen." */
5978 const STRLEN unilen = reguni(pRExC_state, uvc, s);
5990 RExC_size += STR_SZ(len);
5993 RExC_emit += STR_SZ(len);
5995 Set_Node_Cur_Length(ret); /* MJD */
5997 nextchar(pRExC_state);
5999 ret = reg_node(pRExC_state,NOTHING);
6002 SvREFCNT_dec(sv_str);
6005 SvREFCNT_dec(sv_name);
6015 * It returns the code point in utf8 for the value in *encp.
6016 * value: a code value in the source encoding
6017 * encp: a pointer to an Encode object
6019 * If the result from Encode is not a single character,
6020 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
6023 S_reg_recode(pTHX_ const char value, SV **encp)
6026 SV * const sv = sv_2mortal(newSVpvn(&value, numlen));
6027 const char * const s = encp && *encp ? sv_recode_to_utf8(sv, *encp)
6029 const STRLEN newlen = SvCUR(sv);
6030 UV uv = UNICODE_REPLACEMENT;
6034 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
6037 if (!newlen || numlen != newlen) {
6038 uv = UNICODE_REPLACEMENT;
6047 - regatom - the lowest level
6049 * Optimization: gobbles an entire sequence of ordinary characters so that
6050 * it can turn them into a single node, which is smaller to store and
6051 * faster to run. Backslashed characters are exceptions, each becoming a
6052 * separate node; the code is simpler that way and it's not worth fixing.
6054 * [Yes, it is worth fixing, some scripts can run twice the speed.]
6055 * [It looks like its ok, as in S_study_chunk we merge adjacent EXACT nodes]
6058 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6061 register regnode *ret = NULL;
6063 char *parse_start = RExC_parse;
6064 GET_RE_DEBUG_FLAGS_DECL;
6065 DEBUG_PARSE("atom");
6066 *flagp = WORST; /* Tentatively. */
6069 switch (*RExC_parse) {
6071 RExC_seen_zerolen++;
6072 nextchar(pRExC_state);
6073 if (RExC_flags & PMf_MULTILINE)
6074 ret = reg_node(pRExC_state, MBOL);
6075 else if (RExC_flags & PMf_SINGLELINE)
6076 ret = reg_node(pRExC_state, SBOL);
6078 ret = reg_node(pRExC_state, BOL);
6079 Set_Node_Length(ret, 1); /* MJD */
6082 nextchar(pRExC_state);
6084 RExC_seen_zerolen++;
6085 if (RExC_flags & PMf_MULTILINE)
6086 ret = reg_node(pRExC_state, MEOL);
6087 else if (RExC_flags & PMf_SINGLELINE)
6088 ret = reg_node(pRExC_state, SEOL);
6090 ret = reg_node(pRExC_state, EOL);
6091 Set_Node_Length(ret, 1); /* MJD */
6094 nextchar(pRExC_state);
6095 if (RExC_flags & PMf_SINGLELINE)
6096 ret = reg_node(pRExC_state, SANY);
6098 ret = reg_node(pRExC_state, REG_ANY);
6099 *flagp |= HASWIDTH|SIMPLE;
6101 Set_Node_Length(ret, 1); /* MJD */
6105 char * const oregcomp_parse = ++RExC_parse;
6106 ret = regclass(pRExC_state,depth+1);
6107 if (*RExC_parse != ']') {
6108 RExC_parse = oregcomp_parse;
6109 vFAIL("Unmatched [");
6111 nextchar(pRExC_state);
6112 *flagp |= HASWIDTH|SIMPLE;
6113 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
6117 nextchar(pRExC_state);
6118 ret = reg(pRExC_state, 1, &flags,depth+1);
6120 if (flags & TRYAGAIN) {
6121 if (RExC_parse == RExC_end) {
6122 /* Make parent create an empty node if needed. */
6130 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
6134 if (flags & TRYAGAIN) {
6138 vFAIL("Internal urp");
6139 /* Supposed to be caught earlier. */
6142 if (!regcurly(RExC_parse)) {
6151 vFAIL("Quantifier follows nothing");
6154 switch (*++RExC_parse) {
6156 RExC_seen_zerolen++;
6157 ret = reg_node(pRExC_state, SBOL);
6159 nextchar(pRExC_state);
6160 Set_Node_Length(ret, 2); /* MJD */
6163 ret = reg_node(pRExC_state, GPOS);
6164 RExC_seen |= REG_SEEN_GPOS;
6166 nextchar(pRExC_state);
6167 Set_Node_Length(ret, 2); /* MJD */
6170 ret = reg_node(pRExC_state, SEOL);
6172 RExC_seen_zerolen++; /* Do not optimize RE away */
6173 nextchar(pRExC_state);
6176 ret = reg_node(pRExC_state, EOS);
6178 RExC_seen_zerolen++; /* Do not optimize RE away */
6179 nextchar(pRExC_state);
6180 Set_Node_Length(ret, 2); /* MJD */
6183 ret = reg_node(pRExC_state, CANY);
6184 RExC_seen |= REG_SEEN_CANY;
6185 *flagp |= HASWIDTH|SIMPLE;
6186 nextchar(pRExC_state);
6187 Set_Node_Length(ret, 2); /* MJD */
6190 ret = reg_node(pRExC_state, CLUMP);
6192 nextchar(pRExC_state);
6193 Set_Node_Length(ret, 2); /* MJD */
6196 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
6197 *flagp |= HASWIDTH|SIMPLE;
6198 nextchar(pRExC_state);
6199 Set_Node_Length(ret, 2); /* MJD */
6202 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
6203 *flagp |= HASWIDTH|SIMPLE;
6204 nextchar(pRExC_state);
6205 Set_Node_Length(ret, 2); /* MJD */
6208 RExC_seen_zerolen++;
6209 RExC_seen |= REG_SEEN_LOOKBEHIND;
6210 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
6212 nextchar(pRExC_state);
6213 Set_Node_Length(ret, 2); /* MJD */
6216 RExC_seen_zerolen++;
6217 RExC_seen |= REG_SEEN_LOOKBEHIND;
6218 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
6220 nextchar(pRExC_state);
6221 Set_Node_Length(ret, 2); /* MJD */
6224 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
6225 *flagp |= HASWIDTH|SIMPLE;
6226 nextchar(pRExC_state);
6227 Set_Node_Length(ret, 2); /* MJD */
6230 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
6231 *flagp |= HASWIDTH|SIMPLE;
6232 nextchar(pRExC_state);
6233 Set_Node_Length(ret, 2); /* MJD */
6236 ret = reg_node(pRExC_state, DIGIT);
6237 *flagp |= HASWIDTH|SIMPLE;
6238 nextchar(pRExC_state);
6239 Set_Node_Length(ret, 2); /* MJD */
6242 ret = reg_node(pRExC_state, NDIGIT);
6243 *flagp |= HASWIDTH|SIMPLE;
6244 nextchar(pRExC_state);
6245 Set_Node_Length(ret, 2); /* MJD */
6250 char* const oldregxend = RExC_end;
6251 char* parse_start = RExC_parse - 2;
6253 if (RExC_parse[1] == '{') {
6254 /* a lovely hack--pretend we saw [\pX] instead */
6255 RExC_end = strchr(RExC_parse, '}');
6257 const U8 c = (U8)*RExC_parse;
6259 RExC_end = oldregxend;
6260 vFAIL2("Missing right brace on \\%c{}", c);
6265 RExC_end = RExC_parse + 2;
6266 if (RExC_end > oldregxend)
6267 RExC_end = oldregxend;
6271 ret = regclass(pRExC_state,depth+1);
6273 RExC_end = oldregxend;
6276 Set_Node_Offset(ret, parse_start + 2);
6277 Set_Node_Cur_Length(ret);
6278 nextchar(pRExC_state);
6279 *flagp |= HASWIDTH|SIMPLE;
6283 /* Handle \N{NAME} here and not below because it can be
6284 multicharacter. join_exact() will join them up later on.
6285 Also this makes sure that things like /\N{BLAH}+/ and
6286 \N{BLAH} being multi char Just Happen. dmq*/
6288 ret= reg_namedseq(pRExC_state, NULL);
6290 case 'k': /* Handle \k<NAME> and \k'NAME' */
6292 char ch= RExC_parse[1];
6293 if (ch != '<' && ch != '\'') {
6295 vWARN( RExC_parse + 1,
6296 "Possible broken named back reference treated as literal k");
6300 char* name_start = (RExC_parse += 2);
6302 SV *sv_dat = reg_scan_name(pRExC_state,
6303 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6304 ch= (ch == '<') ? '>' : '\'';
6306 if (RExC_parse == name_start || *RExC_parse != ch)
6307 vFAIL2("Sequence \\k%c... not terminated",
6308 (ch == '>' ? '<' : ch));
6311 ret = reganode(pRExC_state,
6312 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
6318 num = add_data( pRExC_state, 1, "S" );
6320 RExC_rx->data->data[num]=(void*)sv_dat;
6321 SvREFCNT_inc(sv_dat);
6323 /* override incorrect value set in reganode MJD */
6324 Set_Node_Offset(ret, parse_start+1);
6325 Set_Node_Cur_Length(ret); /* MJD */
6326 nextchar(pRExC_state);
6342 case '1': case '2': case '3': case '4':
6343 case '5': case '6': case '7': case '8': case '9':
6346 bool isrel=(*RExC_parse=='R');
6349 num = atoi(RExC_parse);
6351 num = RExC_cpar - num;
6353 vFAIL("Reference to nonexistent or unclosed group");
6355 if (num > 9 && num >= RExC_npar)
6358 char * const parse_start = RExC_parse - 1; /* MJD */
6359 while (isDIGIT(*RExC_parse))
6363 if (num > (I32)RExC_rx->nparens)
6364 vFAIL("Reference to nonexistent group");
6365 /* People make this error all the time apparently.
6366 So we cant fail on it, even though we should
6368 else if (num >= RExC_cpar)
6369 vFAIL("Reference to unclosed group will always match");
6373 ret = reganode(pRExC_state,
6374 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
6378 /* override incorrect value set in reganode MJD */
6379 Set_Node_Offset(ret, parse_start+1);
6380 Set_Node_Cur_Length(ret); /* MJD */
6382 nextchar(pRExC_state);
6387 if (RExC_parse >= RExC_end)
6388 FAIL("Trailing \\");
6391 /* Do not generate "unrecognized" warnings here, we fall
6392 back into the quick-grab loop below */
6399 if (RExC_flags & PMf_EXTENDED) {
6400 while (RExC_parse < RExC_end && *RExC_parse != '\n')
6402 if (RExC_parse < RExC_end)
6408 register STRLEN len;
6413 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
6415 parse_start = RExC_parse - 1;
6421 ret = reg_node(pRExC_state,
6422 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
6424 for (len = 0, p = RExC_parse - 1;
6425 len < 127 && p < RExC_end;
6428 char * const oldp = p;
6430 if (RExC_flags & PMf_EXTENDED)
6431 p = regwhite(p, RExC_end);
6480 ender = ASCII_TO_NATIVE('\033');
6484 ender = ASCII_TO_NATIVE('\007');
6489 char* const e = strchr(p, '}');
6493 vFAIL("Missing right brace on \\x{}");
6496 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6497 | PERL_SCAN_DISALLOW_PREFIX;
6498 STRLEN numlen = e - p - 1;
6499 ender = grok_hex(p + 1, &numlen, &flags, NULL);
6506 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
6508 ender = grok_hex(p, &numlen, &flags, NULL);
6511 if (PL_encoding && ender < 0x100)
6512 goto recode_encoding;
6516 ender = UCHARAT(p++);
6517 ender = toCTRL(ender);
6519 case '0': case '1': case '2': case '3':case '4':
6520 case '5': case '6': case '7': case '8':case '9':
6522 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
6525 ender = grok_oct(p, &numlen, &flags, NULL);
6532 if (PL_encoding && ender < 0x100)
6533 goto recode_encoding;
6537 SV* enc = PL_encoding;
6538 ender = reg_recode((const char)(U8)ender, &enc);
6539 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
6540 vWARN(p, "Invalid escape in the specified encoding");
6546 FAIL("Trailing \\");
6549 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
6550 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
6551 goto normal_default;
6556 if (UTF8_IS_START(*p) && UTF) {
6558 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
6559 &numlen, UTF8_ALLOW_DEFAULT);
6566 if (RExC_flags & PMf_EXTENDED)
6567 p = regwhite(p, RExC_end);
6569 /* Prime the casefolded buffer. */
6570 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
6572 if (ISMULT2(p)) { /* Back off on ?+*. */
6577 /* Emit all the Unicode characters. */
6579 for (foldbuf = tmpbuf;
6581 foldlen -= numlen) {
6582 ender = utf8_to_uvchr(foldbuf, &numlen);
6584 const STRLEN unilen = reguni(pRExC_state, ender, s);
6587 /* In EBCDIC the numlen
6588 * and unilen can differ. */
6590 if (numlen >= foldlen)
6594 break; /* "Can't happen." */
6598 const STRLEN unilen = reguni(pRExC_state, ender, s);
6607 REGC((char)ender, s++);
6613 /* Emit all the Unicode characters. */
6615 for (foldbuf = tmpbuf;
6617 foldlen -= numlen) {
6618 ender = utf8_to_uvchr(foldbuf, &numlen);
6620 const STRLEN unilen = reguni(pRExC_state, ender, s);
6623 /* In EBCDIC the numlen
6624 * and unilen can differ. */
6626 if (numlen >= foldlen)
6634 const STRLEN unilen = reguni(pRExC_state, ender, s);
6643 REGC((char)ender, s++);
6647 Set_Node_Cur_Length(ret); /* MJD */
6648 nextchar(pRExC_state);
6650 /* len is STRLEN which is unsigned, need to copy to signed */
6653 vFAIL("Internal disaster");
6657 if (len == 1 && UNI_IS_INVARIANT(ender))
6661 RExC_size += STR_SZ(len);
6664 RExC_emit += STR_SZ(len);
6674 S_regwhite(char *p, const char *e)
6679 else if (*p == '#') {
6682 } while (p < e && *p != '\n');
6690 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
6691 Character classes ([:foo:]) can also be negated ([:^foo:]).
6692 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
6693 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
6694 but trigger failures because they are currently unimplemented. */
6696 #define POSIXCC_DONE(c) ((c) == ':')
6697 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
6698 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
6701 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
6704 I32 namedclass = OOB_NAMEDCLASS;
6706 if (value == '[' && RExC_parse + 1 < RExC_end &&
6707 /* I smell either [: or [= or [. -- POSIX has been here, right? */
6708 POSIXCC(UCHARAT(RExC_parse))) {
6709 const char c = UCHARAT(RExC_parse);
6710 char* const s = RExC_parse++;
6712 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
6714 if (RExC_parse == RExC_end)
6715 /* Grandfather lone [:, [=, [. */
6718 const char* const t = RExC_parse++; /* skip over the c */
6721 if (UCHARAT(RExC_parse) == ']') {
6722 const char *posixcc = s + 1;
6723 RExC_parse++; /* skip over the ending ] */
6726 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
6727 const I32 skip = t - posixcc;
6729 /* Initially switch on the length of the name. */
6732 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
6733 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
6736 /* Names all of length 5. */
6737 /* alnum alpha ascii blank cntrl digit graph lower
6738 print punct space upper */
6739 /* Offset 4 gives the best switch position. */
6740 switch (posixcc[4]) {
6742 if (memEQ(posixcc, "alph", 4)) /* alpha */
6743 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
6746 if (memEQ(posixcc, "spac", 4)) /* space */
6747 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
6750 if (memEQ(posixcc, "grap", 4)) /* graph */
6751 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
6754 if (memEQ(posixcc, "asci", 4)) /* ascii */
6755 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
6758 if (memEQ(posixcc, "blan", 4)) /* blank */
6759 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
6762 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
6763 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
6766 if (memEQ(posixcc, "alnu", 4)) /* alnum */
6767 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
6770 if (memEQ(posixcc, "lowe", 4)) /* lower */
6771 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
6772 else if (memEQ(posixcc, "uppe", 4)) /* upper */
6773 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
6776 if (memEQ(posixcc, "digi", 4)) /* digit */
6777 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
6778 else if (memEQ(posixcc, "prin", 4)) /* print */
6779 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
6780 else if (memEQ(posixcc, "punc", 4)) /* punct */
6781 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
6786 if (memEQ(posixcc, "xdigit", 6))
6787 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
6791 if (namedclass == OOB_NAMEDCLASS)
6792 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
6794 assert (posixcc[skip] == ':');
6795 assert (posixcc[skip+1] == ']');
6796 } else if (!SIZE_ONLY) {
6797 /* [[=foo=]] and [[.foo.]] are still future. */
6799 /* adjust RExC_parse so the warning shows after
6801 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
6803 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
6806 /* Maternal grandfather:
6807 * "[:" ending in ":" but not in ":]" */
6817 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
6820 if (POSIXCC(UCHARAT(RExC_parse))) {
6821 const char *s = RExC_parse;
6822 const char c = *s++;
6826 if (*s && c == *s && s[1] == ']') {
6827 if (ckWARN(WARN_REGEXP))
6829 "POSIX syntax [%c %c] belongs inside character classes",
6832 /* [[=foo=]] and [[.foo.]] are still future. */
6833 if (POSIXCC_NOTYET(c)) {
6834 /* adjust RExC_parse so the error shows after
6836 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
6838 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
6846 parse a class specification and produce either an ANYOF node that
6847 matches the pattern. If the pattern matches a single char only and
6848 that char is < 256 then we produce an EXACT node instead.
6851 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
6854 register UV value = 0;
6855 register UV nextvalue;
6856 register IV prevvalue = OOB_UNICODE;
6857 register IV range = 0;
6858 register regnode *ret;
6861 char *rangebegin = NULL;
6862 bool need_class = 0;
6865 bool optimize_invert = TRUE;
6866 AV* unicode_alternate = NULL;
6868 UV literal_endpoint = 0;
6870 UV stored = 0; /* number of chars stored in the class */
6872 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
6873 case we need to change the emitted regop to an EXACT. */
6874 const char * orig_parse = RExC_parse;
6875 GET_RE_DEBUG_FLAGS_DECL;
6877 PERL_UNUSED_ARG(depth);
6880 DEBUG_PARSE("clas");
6882 /* Assume we are going to generate an ANYOF node. */
6883 ret = reganode(pRExC_state, ANYOF, 0);
6886 ANYOF_FLAGS(ret) = 0;
6888 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
6892 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
6896 RExC_size += ANYOF_SKIP;
6897 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
6900 RExC_emit += ANYOF_SKIP;
6902 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
6904 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
6905 ANYOF_BITMAP_ZERO(ret);
6906 listsv = newSVpvs("# comment\n");
6909 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
6911 if (!SIZE_ONLY && POSIXCC(nextvalue))
6912 checkposixcc(pRExC_state);
6914 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
6915 if (UCHARAT(RExC_parse) == ']')
6919 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
6923 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
6926 rangebegin = RExC_parse;
6928 value = utf8n_to_uvchr((U8*)RExC_parse,
6929 RExC_end - RExC_parse,
6930 &numlen, UTF8_ALLOW_DEFAULT);
6931 RExC_parse += numlen;
6934 value = UCHARAT(RExC_parse++);
6936 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
6937 if (value == '[' && POSIXCC(nextvalue))
6938 namedclass = regpposixcc(pRExC_state, value);
6939 else if (value == '\\') {
6941 value = utf8n_to_uvchr((U8*)RExC_parse,
6942 RExC_end - RExC_parse,
6943 &numlen, UTF8_ALLOW_DEFAULT);
6944 RExC_parse += numlen;
6947 value = UCHARAT(RExC_parse++);
6948 /* Some compilers cannot handle switching on 64-bit integer
6949 * values, therefore value cannot be an UV. Yes, this will
6950 * be a problem later if we want switch on Unicode.
6951 * A similar issue a little bit later when switching on
6952 * namedclass. --jhi */
6953 switch ((I32)value) {
6954 case 'w': namedclass = ANYOF_ALNUM; break;
6955 case 'W': namedclass = ANYOF_NALNUM; break;
6956 case 's': namedclass = ANYOF_SPACE; break;
6957 case 'S': namedclass = ANYOF_NSPACE; break;
6958 case 'd': namedclass = ANYOF_DIGIT; break;
6959 case 'D': namedclass = ANYOF_NDIGIT; break;
6960 case 'N': /* Handle \N{NAME} in class */
6962 /* We only pay attention to the first char of
6963 multichar strings being returned. I kinda wonder
6964 if this makes sense as it does change the behaviour
6965 from earlier versions, OTOH that behaviour was broken
6967 UV v; /* value is register so we cant & it /grrr */
6968 if (reg_namedseq(pRExC_state, &v)) {
6978 if (RExC_parse >= RExC_end)
6979 vFAIL2("Empty \\%c{}", (U8)value);
6980 if (*RExC_parse == '{') {
6981 const U8 c = (U8)value;
6982 e = strchr(RExC_parse++, '}');
6984 vFAIL2("Missing right brace on \\%c{}", c);
6985 while (isSPACE(UCHARAT(RExC_parse)))
6987 if (e == RExC_parse)
6988 vFAIL2("Empty \\%c{}", c);
6990 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
6998 if (UCHARAT(RExC_parse) == '^') {
7001 value = value == 'p' ? 'P' : 'p'; /* toggle */
7002 while (isSPACE(UCHARAT(RExC_parse))) {
7007 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
7008 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
7011 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
7012 namedclass = ANYOF_MAX; /* no official name, but it's named */
7015 case 'n': value = '\n'; break;
7016 case 'r': value = '\r'; break;
7017 case 't': value = '\t'; break;
7018 case 'f': value = '\f'; break;
7019 case 'b': value = '\b'; break;
7020 case 'e': value = ASCII_TO_NATIVE('\033');break;
7021 case 'a': value = ASCII_TO_NATIVE('\007');break;
7023 if (*RExC_parse == '{') {
7024 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7025 | PERL_SCAN_DISALLOW_PREFIX;
7026 char * const e = strchr(RExC_parse++, '}');
7028 vFAIL("Missing right brace on \\x{}");
7030 numlen = e - RExC_parse;
7031 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
7035 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
7037 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
7038 RExC_parse += numlen;
7040 if (PL_encoding && value < 0x100)
7041 goto recode_encoding;
7044 value = UCHARAT(RExC_parse++);
7045 value = toCTRL(value);
7047 case '0': case '1': case '2': case '3': case '4':
7048 case '5': case '6': case '7': case '8': case '9':
7052 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
7053 RExC_parse += numlen;
7054 if (PL_encoding && value < 0x100)
7055 goto recode_encoding;
7060 SV* enc = PL_encoding;
7061 value = reg_recode((const char)(U8)value, &enc);
7062 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
7064 "Invalid escape in the specified encoding");
7068 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
7070 "Unrecognized escape \\%c in character class passed through",
7074 } /* end of \blah */
7080 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
7082 if (!SIZE_ONLY && !need_class)
7083 ANYOF_CLASS_ZERO(ret);
7087 /* a bad range like a-\d, a-[:digit:] ? */
7090 if (ckWARN(WARN_REGEXP)) {
7092 RExC_parse >= rangebegin ?
7093 RExC_parse - rangebegin : 0;
7095 "False [] range \"%*.*s\"",
7098 if (prevvalue < 256) {
7099 ANYOF_BITMAP_SET(ret, prevvalue);
7100 ANYOF_BITMAP_SET(ret, '-');
7103 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
7104 Perl_sv_catpvf(aTHX_ listsv,
7105 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
7109 range = 0; /* this was not a true range */
7113 const char *what = NULL;
7116 if (namedclass > OOB_NAMEDCLASS)
7117 optimize_invert = FALSE;
7118 /* Possible truncation here but in some 64-bit environments
7119 * the compiler gets heartburn about switch on 64-bit values.
7120 * A similar issue a little earlier when switching on value.
7122 switch ((I32)namedclass) {
7125 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
7127 for (value = 0; value < 256; value++)
7129 ANYOF_BITMAP_SET(ret, value);
7136 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
7138 for (value = 0; value < 256; value++)
7139 if (!isALNUM(value))
7140 ANYOF_BITMAP_SET(ret, value);
7147 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
7149 for (value = 0; value < 256; value++)
7150 if (isALNUMC(value))
7151 ANYOF_BITMAP_SET(ret, value);
7158 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
7160 for (value = 0; value < 256; value++)
7161 if (!isALNUMC(value))
7162 ANYOF_BITMAP_SET(ret, value);
7169 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
7171 for (value = 0; value < 256; value++)
7173 ANYOF_BITMAP_SET(ret, value);
7180 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
7182 for (value = 0; value < 256; value++)
7183 if (!isALPHA(value))
7184 ANYOF_BITMAP_SET(ret, value);
7191 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
7194 for (value = 0; value < 128; value++)
7195 ANYOF_BITMAP_SET(ret, value);
7197 for (value = 0; value < 256; value++) {
7199 ANYOF_BITMAP_SET(ret, value);
7208 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
7211 for (value = 128; value < 256; value++)
7212 ANYOF_BITMAP_SET(ret, value);
7214 for (value = 0; value < 256; value++) {
7215 if (!isASCII(value))
7216 ANYOF_BITMAP_SET(ret, value);
7225 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
7227 for (value = 0; value < 256; value++)
7229 ANYOF_BITMAP_SET(ret, value);
7236 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
7238 for (value = 0; value < 256; value++)
7239 if (!isBLANK(value))
7240 ANYOF_BITMAP_SET(ret, value);
7247 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
7249 for (value = 0; value < 256; value++)
7251 ANYOF_BITMAP_SET(ret, value);
7258 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
7260 for (value = 0; value < 256; value++)
7261 if (!isCNTRL(value))
7262 ANYOF_BITMAP_SET(ret, value);
7269 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
7271 /* consecutive digits assumed */
7272 for (value = '0'; value <= '9'; value++)
7273 ANYOF_BITMAP_SET(ret, value);
7280 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
7282 /* consecutive digits assumed */
7283 for (value = 0; value < '0'; value++)
7284 ANYOF_BITMAP_SET(ret, value);
7285 for (value = '9' + 1; value < 256; value++)
7286 ANYOF_BITMAP_SET(ret, value);
7293 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
7295 for (value = 0; value < 256; value++)
7297 ANYOF_BITMAP_SET(ret, value);
7304 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
7306 for (value = 0; value < 256; value++)
7307 if (!isGRAPH(value))
7308 ANYOF_BITMAP_SET(ret, value);
7315 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
7317 for (value = 0; value < 256; value++)
7319 ANYOF_BITMAP_SET(ret, value);
7326 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
7328 for (value = 0; value < 256; value++)
7329 if (!isLOWER(value))
7330 ANYOF_BITMAP_SET(ret, value);
7337 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
7339 for (value = 0; value < 256; value++)
7341 ANYOF_BITMAP_SET(ret, value);
7348 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
7350 for (value = 0; value < 256; value++)
7351 if (!isPRINT(value))
7352 ANYOF_BITMAP_SET(ret, value);
7359 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
7361 for (value = 0; value < 256; value++)
7362 if (isPSXSPC(value))
7363 ANYOF_BITMAP_SET(ret, value);
7370 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
7372 for (value = 0; value < 256; value++)
7373 if (!isPSXSPC(value))
7374 ANYOF_BITMAP_SET(ret, value);
7381 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
7383 for (value = 0; value < 256; value++)
7385 ANYOF_BITMAP_SET(ret, value);
7392 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
7394 for (value = 0; value < 256; value++)
7395 if (!isPUNCT(value))
7396 ANYOF_BITMAP_SET(ret, value);
7403 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
7405 for (value = 0; value < 256; value++)
7407 ANYOF_BITMAP_SET(ret, value);
7414 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
7416 for (value = 0; value < 256; value++)
7417 if (!isSPACE(value))
7418 ANYOF_BITMAP_SET(ret, value);
7425 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
7427 for (value = 0; value < 256; value++)
7429 ANYOF_BITMAP_SET(ret, value);
7436 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
7438 for (value = 0; value < 256; value++)
7439 if (!isUPPER(value))
7440 ANYOF_BITMAP_SET(ret, value);
7447 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
7449 for (value = 0; value < 256; value++)
7450 if (isXDIGIT(value))
7451 ANYOF_BITMAP_SET(ret, value);
7458 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
7460 for (value = 0; value < 256; value++)
7461 if (!isXDIGIT(value))
7462 ANYOF_BITMAP_SET(ret, value);
7468 /* this is to handle \p and \P */
7471 vFAIL("Invalid [::] class");
7475 /* Strings such as "+utf8::isWord\n" */
7476 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
7479 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
7482 } /* end of namedclass \blah */
7485 if (prevvalue > (IV)value) /* b-a */ {
7486 const int w = RExC_parse - rangebegin;
7487 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
7488 range = 0; /* not a valid range */
7492 prevvalue = value; /* save the beginning of the range */
7493 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
7494 RExC_parse[1] != ']') {
7497 /* a bad range like \w-, [:word:]- ? */
7498 if (namedclass > OOB_NAMEDCLASS) {
7499 if (ckWARN(WARN_REGEXP)) {
7501 RExC_parse >= rangebegin ?
7502 RExC_parse - rangebegin : 0;
7504 "False [] range \"%*.*s\"",
7508 ANYOF_BITMAP_SET(ret, '-');
7510 range = 1; /* yeah, it's a range! */
7511 continue; /* but do it the next time */
7515 /* now is the next time */
7516 /*stored += (value - prevvalue + 1);*/
7518 if (prevvalue < 256) {
7519 const IV ceilvalue = value < 256 ? value : 255;
7522 /* In EBCDIC [\x89-\x91] should include
7523 * the \x8e but [i-j] should not. */
7524 if (literal_endpoint == 2 &&
7525 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
7526 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
7528 if (isLOWER(prevvalue)) {
7529 for (i = prevvalue; i <= ceilvalue; i++)
7531 ANYOF_BITMAP_SET(ret, i);
7533 for (i = prevvalue; i <= ceilvalue; i++)
7535 ANYOF_BITMAP_SET(ret, i);
7540 for (i = prevvalue; i <= ceilvalue; i++) {
7541 if (!ANYOF_BITMAP_TEST(ret,i)) {
7543 ANYOF_BITMAP_SET(ret, i);
7547 if (value > 255 || UTF) {
7548 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
7549 const UV natvalue = NATIVE_TO_UNI(value);
7550 stored+=2; /* can't optimize this class */
7551 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
7552 if (prevnatvalue < natvalue) { /* what about > ? */
7553 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
7554 prevnatvalue, natvalue);
7556 else if (prevnatvalue == natvalue) {
7557 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
7559 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
7561 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
7563 #ifdef EBCDIC /* RD t/uni/fold ff and 6b */
7564 if (RExC_precomp[0] == ':' &&
7565 RExC_precomp[1] == '[' &&
7566 (f == 0xDF || f == 0x92)) {
7567 f = NATIVE_TO_UNI(f);
7570 /* If folding and foldable and a single
7571 * character, insert also the folded version
7572 * to the charclass. */
7574 #ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
7575 if ((RExC_precomp[0] == ':' &&
7576 RExC_precomp[1] == '[' &&
7578 (value == 0xFB05 || value == 0xFB06))) ?
7579 foldlen == ((STRLEN)UNISKIP(f) - 1) :
7580 foldlen == (STRLEN)UNISKIP(f) )
7582 if (foldlen == (STRLEN)UNISKIP(f))
7584 Perl_sv_catpvf(aTHX_ listsv,
7587 /* Any multicharacter foldings
7588 * require the following transform:
7589 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
7590 * where E folds into "pq" and F folds
7591 * into "rst", all other characters
7592 * fold to single characters. We save
7593 * away these multicharacter foldings,
7594 * to be later saved as part of the
7595 * additional "s" data. */
7598 if (!unicode_alternate)
7599 unicode_alternate = newAV();
7600 sv = newSVpvn((char*)foldbuf, foldlen);
7602 av_push(unicode_alternate, sv);
7606 /* If folding and the value is one of the Greek
7607 * sigmas insert a few more sigmas to make the
7608 * folding rules of the sigmas to work right.
7609 * Note that not all the possible combinations
7610 * are handled here: some of them are handled
7611 * by the standard folding rules, and some of
7612 * them (literal or EXACTF cases) are handled
7613 * during runtime in regexec.c:S_find_byclass(). */
7614 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
7615 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
7616 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
7617 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
7618 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
7620 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
7621 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
7622 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
7627 literal_endpoint = 0;
7631 range = 0; /* this range (if it was one) is done now */
7635 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
7637 RExC_size += ANYOF_CLASS_ADD_SKIP;
7639 RExC_emit += ANYOF_CLASS_ADD_SKIP;
7645 /****** !SIZE_ONLY AFTER HERE *********/
7647 if( stored == 1 && value < 256
7648 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
7650 /* optimize single char class to an EXACT node
7651 but *only* when its not a UTF/high char */
7652 const char * cur_parse= RExC_parse;
7653 RExC_emit = (regnode *)orig_emit;
7654 RExC_parse = (char *)orig_parse;
7655 ret = reg_node(pRExC_state,
7656 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
7657 RExC_parse = (char *)cur_parse;
7658 *STRING(ret)= (char)value;
7660 RExC_emit += STR_SZ(1);
7663 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
7664 if ( /* If the only flag is folding (plus possibly inversion). */
7665 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
7667 for (value = 0; value < 256; ++value) {
7668 if (ANYOF_BITMAP_TEST(ret, value)) {
7669 UV fold = PL_fold[value];
7672 ANYOF_BITMAP_SET(ret, fold);
7675 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
7678 /* optimize inverted simple patterns (e.g. [^a-z]) */
7679 if (optimize_invert &&
7680 /* If the only flag is inversion. */
7681 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
7682 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
7683 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
7684 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
7687 AV * const av = newAV();
7689 /* The 0th element stores the character class description
7690 * in its textual form: used later (regexec.c:Perl_regclass_swash())
7691 * to initialize the appropriate swash (which gets stored in
7692 * the 1st element), and also useful for dumping the regnode.
7693 * The 2nd element stores the multicharacter foldings,
7694 * used later (regexec.c:S_reginclass()). */
7695 av_store(av, 0, listsv);
7696 av_store(av, 1, NULL);
7697 av_store(av, 2, (SV*)unicode_alternate);
7698 rv = newRV_noinc((SV*)av);
7699 n = add_data(pRExC_state, 1, "s");
7700 RExC_rx->data->data[n] = (void*)rv;
7707 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
7709 char* const retval = RExC_parse++;
7712 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
7713 RExC_parse[2] == '#') {
7714 while (*RExC_parse != ')') {
7715 if (RExC_parse == RExC_end)
7716 FAIL("Sequence (?#... not terminated");
7722 if (RExC_flags & PMf_EXTENDED) {
7723 if (isSPACE(*RExC_parse)) {
7727 else if (*RExC_parse == '#') {
7728 while (RExC_parse < RExC_end)
7729 if (*RExC_parse++ == '\n') break;
7738 - reg_node - emit a node
7740 STATIC regnode * /* Location. */
7741 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
7744 register regnode *ptr;
7745 regnode * const ret = RExC_emit;
7746 GET_RE_DEBUG_FLAGS_DECL;
7749 SIZE_ALIGN(RExC_size);
7754 if (OP(RExC_emit) == 255)
7755 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %s: %d ",
7756 reg_name[op], OP(RExC_emit));
7758 NODE_ALIGN_FILL(ret);
7760 FILL_ADVANCE_NODE(ptr, op);
7761 if (RExC_offsets) { /* MJD */
7762 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
7763 "reg_node", __LINE__,
7765 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
7766 ? "Overwriting end of array!\n" : "OK",
7767 (UV)(RExC_emit - RExC_emit_start),
7768 (UV)(RExC_parse - RExC_start),
7769 (UV)RExC_offsets[0]));
7770 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
7778 - reganode - emit a node with an argument
7780 STATIC regnode * /* Location. */
7781 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
7784 register regnode *ptr;
7785 regnode * const ret = RExC_emit;
7786 GET_RE_DEBUG_FLAGS_DECL;
7789 SIZE_ALIGN(RExC_size);
7794 assert(2==regarglen[op]+1);
7796 Anything larger than this has to allocate the extra amount.
7797 If we changed this to be:
7799 RExC_size += (1 + regarglen[op]);
7801 then it wouldn't matter. Its not clear what side effect
7802 might come from that so its not done so far.
7808 if (OP(RExC_emit) == 255)
7809 Perl_croak(aTHX_ "panic: reganode overwriting end of allocated program space");
7811 NODE_ALIGN_FILL(ret);
7813 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
7814 if (RExC_offsets) { /* MJD */
7815 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
7819 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
7820 "Overwriting end of array!\n" : "OK",
7821 (UV)(RExC_emit - RExC_emit_start),
7822 (UV)(RExC_parse - RExC_start),
7823 (UV)RExC_offsets[0]));
7824 Set_Cur_Node_Offset;
7832 - reguni - emit (if appropriate) a Unicode character
7835 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
7838 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
7842 - reginsert - insert an operator in front of already-emitted operand
7844 * Means relocating the operand.
7847 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
7850 register regnode *src;
7851 register regnode *dst;
7852 register regnode *place;
7853 const int offset = regarglen[(U8)op];
7854 const int size = NODE_STEP_REGNODE + offset;
7855 GET_RE_DEBUG_FLAGS_DECL;
7856 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
7857 DEBUG_PARSE_FMT("inst"," - %s",reg_name[op]);
7866 if (RExC_open_parens) {
7868 DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);
7869 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
7870 if ( RExC_open_parens[paren] >= opnd ) {
7871 DEBUG_PARSE_FMT("open"," - %d",size);
7872 RExC_open_parens[paren] += size;
7874 DEBUG_PARSE_FMT("open"," - %s","ok");
7876 if ( RExC_close_parens[paren] >= opnd ) {
7877 DEBUG_PARSE_FMT("close"," - %d",size);
7878 RExC_close_parens[paren] += size;
7880 DEBUG_PARSE_FMT("close"," - %s","ok");
7885 while (src > opnd) {
7886 StructCopy(--src, --dst, regnode);
7887 if (RExC_offsets) { /* MJD 20010112 */
7888 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
7892 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
7893 ? "Overwriting end of array!\n" : "OK",
7894 (UV)(src - RExC_emit_start),
7895 (UV)(dst - RExC_emit_start),
7896 (UV)RExC_offsets[0]));
7897 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
7898 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
7903 place = opnd; /* Op node, where operand used to be. */
7904 if (RExC_offsets) { /* MJD */
7905 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
7909 (UV)(place - RExC_emit_start) > RExC_offsets[0]
7910 ? "Overwriting end of array!\n" : "OK",
7911 (UV)(place - RExC_emit_start),
7912 (UV)(RExC_parse - RExC_start),
7913 (UV)RExC_offsets[0]));
7914 Set_Node_Offset(place, RExC_parse);
7915 Set_Node_Length(place, 1);
7917 src = NEXTOPER(place);
7918 FILL_ADVANCE_NODE(place, op);
7919 Zero(src, offset, regnode);
7923 - regtail - set the next-pointer at the end of a node chain of p to val.
7924 - SEE ALSO: regtail_study
7926 /* TODO: All three parms should be const */
7928 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
7931 register regnode *scan;
7932 GET_RE_DEBUG_FLAGS_DECL;
7934 PERL_UNUSED_ARG(depth);
7940 /* Find last node. */
7943 regnode * const temp = regnext(scan);
7945 SV * const mysv=sv_newmortal();
7946 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
7947 regprop(RExC_rx, mysv, scan);
7948 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
7949 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
7950 (temp == NULL ? "->" : ""),
7951 (temp == NULL ? reg_name[OP(val)] : "")
7959 if (reg_off_by_arg[OP(scan)]) {
7960 ARG_SET(scan, val - scan);
7963 NEXT_OFF(scan) = val - scan;
7969 - regtail_study - set the next-pointer at the end of a node chain of p to val.
7970 - Look for optimizable sequences at the same time.
7971 - currently only looks for EXACT chains.
7973 This is expermental code. The idea is to use this routine to perform
7974 in place optimizations on branches and groups as they are constructed,
7975 with the long term intention of removing optimization from study_chunk so
7976 that it is purely analytical.
7978 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
7979 to control which is which.
7982 /* TODO: All four parms should be const */
7985 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
7988 register regnode *scan;
7990 #ifdef EXPERIMENTAL_INPLACESCAN
7994 GET_RE_DEBUG_FLAGS_DECL;
8000 /* Find last node. */
8004 regnode * const temp = regnext(scan);
8005 #ifdef EXPERIMENTAL_INPLACESCAN
8006 if (PL_regkind[OP(scan)] == EXACT)
8007 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
8015 if( exact == PSEUDO )
8017 else if ( exact != OP(scan) )
8026 SV * const mysv=sv_newmortal();
8027 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
8028 regprop(RExC_rx, mysv, scan);
8029 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
8030 SvPV_nolen_const(mysv),
8039 SV * const mysv_val=sv_newmortal();
8040 DEBUG_PARSE_MSG("");
8041 regprop(RExC_rx, mysv_val, val);
8042 PerlIO_printf(Perl_debug_log, "~ attach to %s (%d) offset to %d\n",
8043 SvPV_nolen_const(mysv_val),
8048 if (reg_off_by_arg[OP(scan)]) {
8049 ARG_SET(scan, val - scan);
8052 NEXT_OFF(scan) = val - scan;
8060 - regcurly - a little FSA that accepts {\d+,?\d*}
8063 S_regcurly(register const char *s)
8082 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
8085 Perl_regdump(pTHX_ const regexp *r)
8089 SV * const sv = sv_newmortal();
8090 SV *dsv= sv_newmortal();
8092 (void)dumpuntil(r, r->program, r->program + 1, NULL, NULL, sv, 0, 0);
8094 /* Header fields of interest. */
8095 if (r->anchored_substr) {
8096 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
8097 RE_SV_DUMPLEN(r->anchored_substr), 30);
8098 PerlIO_printf(Perl_debug_log,
8099 "anchored %s%s at %"IVdf" ",
8100 s, RE_SV_TAIL(r->anchored_substr),
8101 (IV)r->anchored_offset);
8102 } else if (r->anchored_utf8) {
8103 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
8104 RE_SV_DUMPLEN(r->anchored_utf8), 30);
8105 PerlIO_printf(Perl_debug_log,
8106 "anchored utf8 %s%s at %"IVdf" ",
8107 s, RE_SV_TAIL(r->anchored_utf8),
8108 (IV)r->anchored_offset);
8110 if (r->float_substr) {
8111 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
8112 RE_SV_DUMPLEN(r->float_substr), 30);
8113 PerlIO_printf(Perl_debug_log,
8114 "floating %s%s at %"IVdf"..%"UVuf" ",
8115 s, RE_SV_TAIL(r->float_substr),
8116 (IV)r->float_min_offset, (UV)r->float_max_offset);
8117 } else if (r->float_utf8) {
8118 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
8119 RE_SV_DUMPLEN(r->float_utf8), 30);
8120 PerlIO_printf(Perl_debug_log,
8121 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
8122 s, RE_SV_TAIL(r->float_utf8),
8123 (IV)r->float_min_offset, (UV)r->float_max_offset);
8125 if (r->check_substr || r->check_utf8)
8126 PerlIO_printf(Perl_debug_log,
8128 (r->check_substr == r->float_substr
8129 && r->check_utf8 == r->float_utf8
8130 ? "(checking floating" : "(checking anchored"));
8131 if (r->reganch & ROPT_NOSCAN)
8132 PerlIO_printf(Perl_debug_log, " noscan");
8133 if (r->reganch & ROPT_CHECK_ALL)
8134 PerlIO_printf(Perl_debug_log, " isall");
8135 if (r->check_substr || r->check_utf8)
8136 PerlIO_printf(Perl_debug_log, ") ");
8138 if (r->regstclass) {
8139 regprop(r, sv, r->regstclass);
8140 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
8142 if (r->reganch & ROPT_ANCH) {
8143 PerlIO_printf(Perl_debug_log, "anchored");
8144 if (r->reganch & ROPT_ANCH_BOL)
8145 PerlIO_printf(Perl_debug_log, "(BOL)");
8146 if (r->reganch & ROPT_ANCH_MBOL)
8147 PerlIO_printf(Perl_debug_log, "(MBOL)");
8148 if (r->reganch & ROPT_ANCH_SBOL)
8149 PerlIO_printf(Perl_debug_log, "(SBOL)");
8150 if (r->reganch & ROPT_ANCH_GPOS)
8151 PerlIO_printf(Perl_debug_log, "(GPOS)");
8152 PerlIO_putc(Perl_debug_log, ' ');
8154 if (r->reganch & ROPT_GPOS_SEEN)
8155 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", r->gofs);
8156 if (r->reganch & ROPT_SKIP)
8157 PerlIO_printf(Perl_debug_log, "plus ");
8158 if (r->reganch & ROPT_IMPLICIT)
8159 PerlIO_printf(Perl_debug_log, "implicit ");
8160 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
8161 if (r->reganch & ROPT_EVAL_SEEN)
8162 PerlIO_printf(Perl_debug_log, "with eval ");
8163 PerlIO_printf(Perl_debug_log, "\n");
8165 PERL_UNUSED_CONTEXT;
8167 #endif /* DEBUGGING */
8171 - regprop - printable representation of opcode
8174 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
8179 GET_RE_DEBUG_FLAGS_DECL;
8181 sv_setpvn(sv, "", 0);
8183 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
8184 /* It would be nice to FAIL() here, but this may be called from
8185 regexec.c, and it would be hard to supply pRExC_state. */
8186 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
8187 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
8189 k = PL_regkind[OP(o)];
8192 SV * const dsv = sv_2mortal(newSVpvs(""));
8193 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
8194 * is a crude hack but it may be the best for now since
8195 * we have no flag "this EXACTish node was UTF-8"
8197 const char * const s =
8198 pv_pretty(dsv, STRING(o), STR_LEN(o), 60,
8199 PL_colors[0], PL_colors[1],
8200 PERL_PV_ESCAPE_UNI_DETECT |
8201 PERL_PV_PRETTY_ELIPSES |
8204 Perl_sv_catpvf(aTHX_ sv, " %s", s );
8205 } else if (k == TRIE) {
8206 /* print the details of the trie in dumpuntil instead, as
8207 * prog->data isn't available here */
8208 const char op = OP(o);
8209 const I32 n = ARG(o);
8210 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
8211 (reg_ac_data *)prog->data->data[n] :
8213 const reg_trie_data * const trie = !IS_TRIE_AC(op) ?
8214 (reg_trie_data*)prog->data->data[n] :
8217 Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]);
8218 DEBUG_TRIE_COMPILE_r(
8219 Perl_sv_catpvf(aTHX_ sv,
8220 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
8221 (UV)trie->startstate,
8222 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
8223 (UV)trie->wordcount,
8226 (UV)TRIE_CHARCOUNT(trie),
8227 (UV)trie->uniquecharcount
8230 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
8232 int rangestart = -1;
8233 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
8234 Perl_sv_catpvf(aTHX_ sv, "[");
8235 for (i = 0; i <= 256; i++) {
8236 if (i < 256 && BITMAP_TEST(bitmap,i)) {
8237 if (rangestart == -1)
8239 } else if (rangestart != -1) {
8240 if (i <= rangestart + 3)
8241 for (; rangestart < i; rangestart++)
8242 put_byte(sv, rangestart);
8244 put_byte(sv, rangestart);
8246 put_byte(sv, i - 1);
8251 Perl_sv_catpvf(aTHX_ sv, "]");
8254 } else if (k == CURLY) {
8255 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
8256 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
8257 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
8259 else if (k == WHILEM && o->flags) /* Ordinal/of */
8260 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
8261 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT)
8262 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
8263 else if (k == GOSUB)
8264 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
8265 else if (k == VERB) {
8267 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
8268 (SV*)prog->data->data[ ARG( o ) ]);
8269 } else if (k == LOGICAL)
8270 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
8271 else if (k == ANYOF) {
8272 int i, rangestart = -1;
8273 const U8 flags = ANYOF_FLAGS(o);
8275 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
8276 static const char * const anyofs[] = {
8309 if (flags & ANYOF_LOCALE)
8310 sv_catpvs(sv, "{loc}");
8311 if (flags & ANYOF_FOLD)
8312 sv_catpvs(sv, "{i}");
8313 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
8314 if (flags & ANYOF_INVERT)
8316 for (i = 0; i <= 256; i++) {
8317 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
8318 if (rangestart == -1)
8320 } else if (rangestart != -1) {
8321 if (i <= rangestart + 3)
8322 for (; rangestart < i; rangestart++)
8323 put_byte(sv, rangestart);
8325 put_byte(sv, rangestart);
8327 put_byte(sv, i - 1);
8333 if (o->flags & ANYOF_CLASS)
8334 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
8335 if (ANYOF_CLASS_TEST(o,i))
8336 sv_catpv(sv, anyofs[i]);
8338 if (flags & ANYOF_UNICODE)
8339 sv_catpvs(sv, "{unicode}");
8340 else if (flags & ANYOF_UNICODE_ALL)
8341 sv_catpvs(sv, "{unicode_all}");
8345 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
8349 U8 s[UTF8_MAXBYTES_CASE+1];
8351 for (i = 0; i <= 256; i++) { /* just the first 256 */
8352 uvchr_to_utf8(s, i);
8354 if (i < 256 && swash_fetch(sw, s, TRUE)) {
8355 if (rangestart == -1)
8357 } else if (rangestart != -1) {
8358 if (i <= rangestart + 3)
8359 for (; rangestart < i; rangestart++) {
8360 const U8 * const e = uvchr_to_utf8(s,rangestart);
8362 for(p = s; p < e; p++)
8366 const U8 *e = uvchr_to_utf8(s,rangestart);
8368 for (p = s; p < e; p++)
8371 e = uvchr_to_utf8(s, i-1);
8372 for (p = s; p < e; p++)
8379 sv_catpvs(sv, "..."); /* et cetera */
8383 char *s = savesvpv(lv);
8384 char * const origs = s;
8386 while (*s && *s != '\n')
8390 const char * const t = ++s;
8408 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
8410 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
8411 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
8413 PERL_UNUSED_CONTEXT;
8414 PERL_UNUSED_ARG(sv);
8416 PERL_UNUSED_ARG(prog);
8417 #endif /* DEBUGGING */
8421 Perl_re_intuit_string(pTHX_ regexp *prog)
8422 { /* Assume that RE_INTUIT is set */
8424 GET_RE_DEBUG_FLAGS_DECL;
8425 PERL_UNUSED_CONTEXT;
8429 const char * const s = SvPV_nolen_const(prog->check_substr
8430 ? prog->check_substr : prog->check_utf8);
8432 if (!PL_colorset) reginitcolors();
8433 PerlIO_printf(Perl_debug_log,
8434 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
8436 prog->check_substr ? "" : "utf8 ",
8437 PL_colors[5],PL_colors[0],
8440 (strlen(s) > 60 ? "..." : ""));
8443 return prog->check_substr ? prog->check_substr : prog->check_utf8;
8447 pregfree - free a regexp
8449 See regdupe below if you change anything here.
8453 Perl_pregfree(pTHX_ struct regexp *r)
8457 GET_RE_DEBUG_FLAGS_DECL;
8459 if (!r || (--r->refcnt > 0))
8465 SV *dsv= sv_newmortal();
8466 RE_PV_QUOTED_DECL(s, (r->reganch & ROPT_UTF8),
8467 dsv, r->precomp, r->prelen, 60);
8468 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
8469 PL_colors[4],PL_colors[5],s);
8473 /* gcov results gave these as non-null 100% of the time, so there's no
8474 optimisation in checking them before calling Safefree */
8475 Safefree(r->precomp);
8476 Safefree(r->offsets); /* 20010421 MJD */
8477 RX_MATCH_COPY_FREE(r);
8478 #ifdef PERL_OLD_COPY_ON_WRITE
8480 SvREFCNT_dec(r->saved_copy);
8483 if (r->anchored_substr)
8484 SvREFCNT_dec(r->anchored_substr);
8485 if (r->anchored_utf8)
8486 SvREFCNT_dec(r->anchored_utf8);
8487 if (r->float_substr)
8488 SvREFCNT_dec(r->float_substr);
8490 SvREFCNT_dec(r->float_utf8);
8491 Safefree(r->substrs);
8494 SvREFCNT_dec(r->paren_names);
8496 int n = r->data->count;
8497 PAD* new_comppad = NULL;
8502 /* If you add a ->what type here, update the comment in regcomp.h */
8503 switch (r->data->what[n]) {
8506 SvREFCNT_dec((SV*)r->data->data[n]);
8509 Safefree(r->data->data[n]);
8512 new_comppad = (AV*)r->data->data[n];
8515 if (new_comppad == NULL)
8516 Perl_croak(aTHX_ "panic: pregfree comppad");
8517 PAD_SAVE_LOCAL(old_comppad,
8518 /* Watch out for global destruction's random ordering. */
8519 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
8522 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
8525 op_free((OP_4tree*)r->data->data[n]);
8527 PAD_RESTORE_LOCAL(old_comppad);
8528 SvREFCNT_dec((SV*)new_comppad);
8534 { /* Aho Corasick add-on structure for a trie node.
8535 Used in stclass optimization only */
8537 reg_ac_data *aho=(reg_ac_data*)r->data->data[n];
8539 refcount = --aho->refcount;
8542 Safefree(aho->states);
8543 Safefree(aho->fail);
8544 aho->trie=NULL; /* not necessary to free this as it is
8545 handled by the 't' case */
8546 Safefree(r->data->data[n]); /* do this last!!!! */
8547 Safefree(r->regstclass);
8553 /* trie structure. */
8555 reg_trie_data *trie=(reg_trie_data*)r->data->data[n];
8557 refcount = --trie->refcount;
8560 Safefree(trie->charmap);
8561 if (trie->widecharmap)
8562 SvREFCNT_dec((SV*)trie->widecharmap);
8563 Safefree(trie->states);
8564 Safefree(trie->trans);
8566 Safefree(trie->bitmap);
8568 Safefree(trie->wordlen);
8570 Safefree(trie->jump);
8572 Safefree(trie->nextword);
8575 SvREFCNT_dec((SV*)trie->words);
8576 if (trie->revcharmap)
8577 SvREFCNT_dec((SV*)trie->revcharmap);
8579 Safefree(r->data->data[n]); /* do this last!!!! */
8584 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
8587 Safefree(r->data->what);
8590 Safefree(r->startp);
8593 Safefree(r->swap->startp);
8594 Safefree(r->swap->endp);
8600 #define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t))
8601 #define av_dup_inc(s,t) (AV*)SvREFCNT_inc(sv_dup((SV*)s,t))
8602 #define hv_dup_inc(s,t) (HV*)SvREFCNT_inc(sv_dup((SV*)s,t))
8603 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
8606 regdupe - duplicate a regexp.
8608 This routine is called by sv.c's re_dup and is expected to clone a
8609 given regexp structure. It is a no-op when not under USE_ITHREADS.
8610 (Originally this *was* re_dup() for change history see sv.c)
8612 See pregfree() above if you change anything here.
8614 #if defined(USE_ITHREADS)
8616 Perl_regdupe(pTHX_ const regexp *r, CLONE_PARAMS *param)
8621 struct reg_substr_datum *s;
8624 return (REGEXP *)NULL;
8626 if ((ret = (REGEXP *)ptr_table_fetch(PL_ptr_table, r)))
8629 len = r->offsets[0];
8630 npar = r->nparens+1;
8632 Newxc(ret, sizeof(regexp) + (len+1)*sizeof(regnode), char, regexp);
8633 Copy(r->program, ret->program, len+1, regnode);
8635 Newx(ret->startp, npar, I32);
8636 Copy(r->startp, ret->startp, npar, I32);
8637 Newx(ret->endp, npar, I32);
8638 Copy(r->startp, ret->startp, npar, I32);
8640 Newx(ret->swap, 1, regexp_paren_ofs);
8641 /* no need to copy these */
8642 Newx(ret->swap->startp, npar, I32);
8643 Newx(ret->swap->endp, npar, I32);
8648 Newx(ret->substrs, 1, struct reg_substr_data);
8649 for (s = ret->substrs->data, i = 0; i < 3; i++, s++) {
8650 s->min_offset = r->substrs->data[i].min_offset;
8651 s->max_offset = r->substrs->data[i].max_offset;
8652 s->end_shift = r->substrs->data[i].end_shift;
8653 s->substr = sv_dup_inc(r->substrs->data[i].substr, param);
8654 s->utf8_substr = sv_dup_inc(r->substrs->data[i].utf8_substr, param);
8657 ret->regstclass = NULL;
8660 const int count = r->data->count;
8663 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
8664 char, struct reg_data);
8665 Newx(d->what, count, U8);
8668 for (i = 0; i < count; i++) {
8669 d->what[i] = r->data->what[i];
8670 switch (d->what[i]) {
8671 /* legal options are one of: sSfpont
8672 see also regcomp.h and pregfree() */
8675 d->data[i] = sv_dup_inc((SV *)r->data->data[i], param);
8678 d->data[i] = av_dup_inc((AV *)r->data->data[i], param);
8681 /* This is cheating. */
8682 Newx(d->data[i], 1, struct regnode_charclass_class);
8683 StructCopy(r->data->data[i], d->data[i],
8684 struct regnode_charclass_class);
8685 ret->regstclass = (regnode*)d->data[i];
8688 /* Compiled op trees are readonly, and can thus be
8689 shared without duplication. */
8691 d->data[i] = (void*)OpREFCNT_inc((OP*)r->data->data[i]);
8695 d->data[i] = r->data->data[i];
8698 d->data[i] = r->data->data[i];
8700 ((reg_trie_data*)d->data[i])->refcount++;
8704 d->data[i] = r->data->data[i];
8706 ((reg_ac_data*)d->data[i])->refcount++;
8708 /* Trie stclasses are readonly and can thus be shared
8709 * without duplication. We free the stclass in pregfree
8710 * when the corresponding reg_ac_data struct is freed.
8712 ret->regstclass= r->regstclass;
8715 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", r->data->what[i]);
8724 Newx(ret->offsets, 2*len+1, U32);
8725 Copy(r->offsets, ret->offsets, 2*len+1, U32);
8727 ret->precomp = SAVEPVN(r->precomp, r->prelen);
8728 ret->refcnt = r->refcnt;
8729 ret->minlen = r->minlen;
8730 ret->minlenret = r->minlenret;
8731 ret->prelen = r->prelen;
8732 ret->nparens = r->nparens;
8733 ret->lastparen = r->lastparen;
8734 ret->lastcloseparen = r->lastcloseparen;
8735 ret->reganch = r->reganch;
8737 ret->sublen = r->sublen;
8739 ret->engine = r->engine;
8741 ret->paren_names = hv_dup_inc(r->paren_names, param);
8743 if (RX_MATCH_COPIED(ret))
8744 ret->subbeg = SAVEPVN(r->subbeg, r->sublen);
8747 #ifdef PERL_OLD_COPY_ON_WRITE
8748 ret->saved_copy = NULL;
8751 ptr_table_store(PL_ptr_table, r, ret);
8759 converts a regexp embedded in a MAGIC struct to its stringified form,
8760 caching the converted form in the struct and returns the cached
8763 If lp is nonnull then it is used to return the length of the
8766 If flags is nonnull and the returned string contains UTF8 then
8767 (flags & 1) will be true.
8769 If haseval is nonnull then it is used to return whether the pattern
8772 Normally called via macro:
8774 CALLREG_STRINGIFY(mg,0,0);
8778 CALLREG_AS_STR(mg,lp,flags,haseval)
8780 See sv_2pv_flags() in sv.c for an example of internal usage.
8785 Perl_reg_stringify(pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval ) {
8787 const regexp * const re = (regexp *)mg->mg_obj;
8790 const char *fptr = "msix";
8795 bool need_newline = 0;
8796 U16 reganch = (U16)((re->reganch & PMf_COMPILETIME) >> 12);
8798 while((ch = *fptr++)) {
8800 reflags[left++] = ch;
8803 reflags[right--] = ch;
8808 reflags[left] = '-';
8812 mg->mg_len = re->prelen + 4 + left;
8814 * If /x was used, we have to worry about a regex ending with a
8815 * comment later being embedded within another regex. If so, we don't
8816 * want this regex's "commentization" to leak out to the right part of
8817 * the enclosing regex, we must cap it with a newline.
8819 * So, if /x was used, we scan backwards from the end of the regex. If
8820 * we find a '#' before we find a newline, we need to add a newline
8821 * ourself. If we find a '\n' first (or if we don't find '#' or '\n'),
8822 * we don't need to add anything. -jfriedl
8824 if (PMf_EXTENDED & re->reganch) {
8825 const char *endptr = re->precomp + re->prelen;
8826 while (endptr >= re->precomp) {
8827 const char c = *(endptr--);
8829 break; /* don't need another */
8831 /* we end while in a comment, so we need a newline */
8832 mg->mg_len++; /* save space for it */
8833 need_newline = 1; /* note to add it */
8839 Newx(mg->mg_ptr, mg->mg_len + 1 + left, char);
8840 mg->mg_ptr[0] = '(';
8841 mg->mg_ptr[1] = '?';
8842 Copy(reflags, mg->mg_ptr+2, left, char);
8843 *(mg->mg_ptr+left+2) = ':';
8844 Copy(re->precomp, mg->mg_ptr+3+left, re->prelen, char);
8846 mg->mg_ptr[mg->mg_len - 2] = '\n';
8847 mg->mg_ptr[mg->mg_len - 1] = ')';
8848 mg->mg_ptr[mg->mg_len] = 0;
8851 *haseval = re->program[0].next_off;
8853 *flags = ((re->reganch & ROPT_UTF8) ? 1 : 0);
8861 #ifndef PERL_IN_XSUB_RE
8863 - regnext - dig the "next" pointer out of a node
8866 Perl_regnext(pTHX_ register regnode *p)
8869 register I32 offset;
8871 if (p == &PL_regdummy)
8874 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
8883 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
8886 STRLEN l1 = strlen(pat1);
8887 STRLEN l2 = strlen(pat2);
8890 const char *message;
8896 Copy(pat1, buf, l1 , char);
8897 Copy(pat2, buf + l1, l2 , char);
8898 buf[l1 + l2] = '\n';
8899 buf[l1 + l2 + 1] = '\0';
8901 /* ANSI variant takes additional second argument */
8902 va_start(args, pat2);
8906 msv = vmess(buf, &args);
8908 message = SvPV_const(msv,l1);
8911 Copy(message, buf, l1 , char);
8912 buf[l1-1] = '\0'; /* Overwrite \n */
8913 Perl_croak(aTHX_ "%s", buf);
8916 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
8918 #ifndef PERL_IN_XSUB_RE
8920 Perl_save_re_context(pTHX)
8924 struct re_save_state *state;
8926 SAVEVPTR(PL_curcop);
8927 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
8929 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
8930 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
8931 SSPUSHINT(SAVEt_RE_STATE);
8933 Copy(&PL_reg_state, state, 1, struct re_save_state);
8935 PL_reg_start_tmp = 0;
8936 PL_reg_start_tmpl = 0;
8937 PL_reg_oldsaved = NULL;
8938 PL_reg_oldsavedlen = 0;
8940 PL_reg_leftiter = 0;
8941 PL_reg_poscache = NULL;
8942 PL_reg_poscache_size = 0;
8943 #ifdef PERL_OLD_COPY_ON_WRITE
8947 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
8949 const REGEXP * const rx = PM_GETRE(PL_curpm);
8952 for (i = 1; i <= rx->nparens; i++) {
8953 char digits[TYPE_CHARS(long)];
8954 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
8955 GV *const *const gvp
8956 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
8959 GV * const gv = *gvp;
8960 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
8970 clear_re(pTHX_ void *r)
8973 ReREFCNT_dec((regexp *)r);
8979 S_put_byte(pTHX_ SV *sv, int c)
8981 if (isCNTRL(c) || c == 255 || !isPRINT(c))
8982 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
8983 else if (c == '-' || c == ']' || c == '\\' || c == '^')
8984 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
8986 Perl_sv_catpvf(aTHX_ sv, "%c", c);
8990 #define CLEAR_OPTSTART \
8991 if (optstart) STMT_START { \
8992 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%d nodes)\n", node - optstart)); \
8996 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
8998 STATIC const regnode *
8999 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
9000 const regnode *last, const regnode *plast,
9001 SV* sv, I32 indent, U32 depth)
9004 register U8 op = PSEUDO; /* Arbitrary non-END op. */
9005 register const regnode *next;
9006 const regnode *optstart= NULL;
9007 GET_RE_DEBUG_FLAGS_DECL;
9009 #ifdef DEBUG_DUMPUNTIL
9010 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
9011 last ? last-start : 0,plast ? plast-start : 0);
9014 if (plast && plast < last)
9017 while (PL_regkind[op] != END && (!last || node < last)) {
9018 /* While that wasn't END last time... */
9022 if (op == CLOSE || op == WHILEM)
9024 next = regnext((regnode *)node);
9027 if (OP(node) == OPTIMIZED) {
9028 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
9035 regprop(r, sv, node);
9036 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
9037 (int)(2*indent + 1), "", SvPVX_const(sv));
9039 if (OP(node) != OPTIMIZED) {
9040 if (next == NULL) /* Next ptr. */
9041 PerlIO_printf(Perl_debug_log, "(0)");
9042 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
9043 PerlIO_printf(Perl_debug_log, "(FAIL)");
9045 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
9047 /*if (PL_regkind[(U8)op] != TRIE)*/
9048 (void)PerlIO_putc(Perl_debug_log, '\n');
9052 if (PL_regkind[(U8)op] == BRANCHJ) {
9055 register const regnode *nnode = (OP(next) == LONGJMP
9056 ? regnext((regnode *)next)
9058 if (last && nnode > last)
9060 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
9063 else if (PL_regkind[(U8)op] == BRANCH) {
9065 DUMPUNTIL(NEXTOPER(node), next);
9067 else if ( PL_regkind[(U8)op] == TRIE ) {
9068 const regnode *this_trie = node;
9069 const char op = OP(node);
9070 const I32 n = ARG(node);
9071 const reg_ac_data * const ac = op>=AHOCORASICK ?
9072 (reg_ac_data *)r->data->data[n] :
9074 const reg_trie_data * const trie = op<AHOCORASICK ?
9075 (reg_trie_data*)r->data->data[n] :
9077 const regnode *nextbranch= NULL;
9079 sv_setpvn(sv, "", 0);
9080 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
9081 SV ** const elem_ptr = av_fetch(trie->words,word_idx,0);
9083 PerlIO_printf(Perl_debug_log, "%*s%s ",
9084 (int)(2*(indent+3)), "",
9085 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
9086 PL_colors[0], PL_colors[1],
9087 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
9088 PERL_PV_PRETTY_ELIPSES |
9094 U16 dist= trie->jump[word_idx+1];
9095 PerlIO_printf(Perl_debug_log, "(%u)\n",
9096 (dist ? this_trie + dist : next) - start);
9099 nextbranch= this_trie + trie->jump[0];
9100 DUMPUNTIL(this_trie + dist, nextbranch);
9102 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
9103 nextbranch= regnext((regnode *)nextbranch);
9105 PerlIO_printf(Perl_debug_log, "\n");
9108 if (last && next > last)
9113 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
9114 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
9115 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
9117 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
9119 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
9121 else if ( op == PLUS || op == STAR) {
9122 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
9124 else if (op == ANYOF) {
9125 /* arglen 1 + class block */
9126 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
9127 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
9128 node = NEXTOPER(node);
9130 else if (PL_regkind[(U8)op] == EXACT) {
9131 /* Literal string, where present. */
9132 node += NODE_SZ_STR(node) - 1;
9133 node = NEXTOPER(node);
9136 node = NEXTOPER(node);
9137 node += regarglen[(U8)op];
9139 if (op == CURLYX || op == OPEN)
9143 #ifdef DEBUG_DUMPUNTIL
9144 PerlIO_printf(Perl_debug_log, "--- %d\n",indent);
9149 #endif /* DEBUGGING */
9153 * c-indentation-style: bsd
9155 * indent-tabs-mode: t
9158 * ex: set ts=8 sts=4 sw=4 noet: