5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
8 /* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
37 * pregcomp and pregexec -- regsub and regerror are not used in perl
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
57 **** Alterations to Henry's code are...
59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
71 #define PERL_IN_REGCOMP_C
74 #ifndef PERL_IN_XSUB_RE
79 #ifdef PERL_IN_XSUB_RE
90 # if defined(BUGGY_MSC6)
91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
92 # pragma optimize("a",off)
93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
94 # pragma optimize("w",on )
95 # endif /* BUGGY_MSC6 */
102 typedef struct RExC_state_t {
103 U32 flags; /* are we folding, multilining? */
104 char *precomp; /* uncompiled string. */
106 char *start; /* Start of input for compile */
107 char *end; /* End of input for compile */
108 char *parse; /* Input-scan pointer. */
109 I32 whilem_seen; /* number of WHILEM in this expr */
110 regnode *emit_start; /* Start of emitted-code area */
111 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
112 I32 naughty; /* How bad is this pattern? */
113 I32 sawback; /* Did we see \1, ...? */
115 I32 size; /* Code size. */
116 I32 npar; /* Capture buffer count, (OPEN). */
117 I32 cpar; /* Capture buffer count, (CLOSE). */
118 I32 nestroot; /* root parens we are in - used by accept */
122 regnode **open_parens; /* pointers to open parens */
123 regnode **close_parens; /* pointers to close parens */
124 regnode *opend; /* END node in program */
126 HV *charnames; /* cache of named sequences */
127 HV *paren_names; /* Paren names */
128 regnode **recurse; /* Recurse regops */
129 I32 recurse_count; /* Number of recurse regops */
131 char *starttry; /* -Dr: where regtry was called. */
132 #define RExC_starttry (pRExC_state->starttry)
135 const char *lastparse;
137 #define RExC_lastparse (pRExC_state->lastparse)
138 #define RExC_lastnum (pRExC_state->lastnum)
142 #define RExC_flags (pRExC_state->flags)
143 #define RExC_precomp (pRExC_state->precomp)
144 #define RExC_rx (pRExC_state->rx)
145 #define RExC_start (pRExC_state->start)
146 #define RExC_end (pRExC_state->end)
147 #define RExC_parse (pRExC_state->parse)
148 #define RExC_whilem_seen (pRExC_state->whilem_seen)
149 #define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
150 #define RExC_emit (pRExC_state->emit)
151 #define RExC_emit_start (pRExC_state->emit_start)
152 #define RExC_naughty (pRExC_state->naughty)
153 #define RExC_sawback (pRExC_state->sawback)
154 #define RExC_seen (pRExC_state->seen)
155 #define RExC_size (pRExC_state->size)
156 #define RExC_npar (pRExC_state->npar)
157 #define RExC_cpar (pRExC_state->cpar)
158 #define RExC_nestroot (pRExC_state->nestroot)
159 #define RExC_extralen (pRExC_state->extralen)
160 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
161 #define RExC_seen_evals (pRExC_state->seen_evals)
162 #define RExC_utf8 (pRExC_state->utf8)
163 #define RExC_charnames (pRExC_state->charnames)
164 #define RExC_open_parens (pRExC_state->open_parens)
165 #define RExC_close_parens (pRExC_state->close_parens)
166 #define RExC_opend (pRExC_state->opend)
167 #define RExC_paren_names (pRExC_state->paren_names)
168 #define RExC_recurse (pRExC_state->recurse)
169 #define RExC_recurse_count (pRExC_state->recurse_count)
171 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
172 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
173 ((*s) == '{' && regcurly(s)))
176 #undef SPSTART /* dratted cpp namespace... */
179 * Flags to be passed up and down.
181 #define WORST 0 /* Worst case. */
182 #define HASWIDTH 0x1 /* Known to match non-null strings. */
183 #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
184 #define SPSTART 0x4 /* Starts with * or +. */
185 #define TRYAGAIN 0x8 /* Weeded out a declaration. */
187 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
189 /* whether trie related optimizations are enabled */
190 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
191 #define TRIE_STUDY_OPT
192 #define FULL_TRIE_STUDY
198 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
199 #define PBITVAL(paren) (1 << ((paren) & 7))
200 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
201 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
202 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
205 /* About scan_data_t.
207 During optimisation we recurse through the regexp program performing
208 various inplace (keyhole style) optimisations. In addition study_chunk
209 and scan_commit populate this data structure with information about
210 what strings MUST appear in the pattern. We look for the longest
211 string that must appear for at a fixed location, and we look for the
212 longest string that may appear at a floating location. So for instance
217 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
218 strings (because they follow a .* construct). study_chunk will identify
219 both FOO and BAR as being the longest fixed and floating strings respectively.
221 The strings can be composites, for instance
225 will result in a composite fixed substring 'foo'.
227 For each string some basic information is maintained:
229 - offset or min_offset
230 This is the position the string must appear at, or not before.
231 It also implicitly (when combined with minlenp) tells us how many
232 character must match before the string we are searching.
233 Likewise when combined with minlenp and the length of the string
234 tells us how many characters must appear after the string we have
238 Only used for floating strings. This is the rightmost point that
239 the string can appear at. Ifset to I32 max it indicates that the
240 string can occur infinitely far to the right.
243 A pointer to the minimum length of the pattern that the string
244 was found inside. This is important as in the case of positive
245 lookahead or positive lookbehind we can have multiple patterns
250 The minimum length of the pattern overall is 3, the minimum length
251 of the lookahead part is 3, but the minimum length of the part that
252 will actually match is 1. So 'FOO's minimum length is 3, but the
253 minimum length for the F is 1. This is important as the minimum length
254 is used to determine offsets in front of and behind the string being
255 looked for. Since strings can be composites this is the length of the
256 pattern at the time it was commited with a scan_commit. Note that
257 the length is calculated by study_chunk, so that the minimum lengths
258 are not known until the full pattern has been compiled, thus the
259 pointer to the value.
263 In the case of lookbehind the string being searched for can be
264 offset past the start point of the final matching string.
265 If this value was just blithely removed from the min_offset it would
266 invalidate some of the calculations for how many chars must match
267 before or after (as they are derived from min_offset and minlen and
268 the length of the string being searched for).
269 When the final pattern is compiled and the data is moved from the
270 scan_data_t structure into the regexp structure the information
271 about lookbehind is factored in, with the information that would
272 have been lost precalculated in the end_shift field for the
275 The fields pos_min and pos_delta are used to store the minimum offset
276 and the delta to the maximum offset at the current point in the pattern.
280 typedef struct scan_data_t {
281 /*I32 len_min; unused */
282 /*I32 len_delta; unused */
286 I32 last_end; /* min value, <0 unless valid. */
289 SV **longest; /* Either &l_fixed, or &l_float. */
290 SV *longest_fixed; /* longest fixed string found in pattern */
291 I32 offset_fixed; /* offset where it starts */
292 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
293 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
294 SV *longest_float; /* longest floating string found in pattern */
295 I32 offset_float_min; /* earliest point in string it can appear */
296 I32 offset_float_max; /* latest point in string it can appear */
297 I32 *minlen_float; /* pointer to the minlen relevent to the string */
298 I32 lookbehind_float; /* is the position of the string modified by LB */
302 struct regnode_charclass_class *start_class;
306 * Forward declarations for pregcomp()'s friends.
309 static const scan_data_t zero_scan_data =
310 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
312 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
313 #define SF_BEFORE_SEOL 0x0001
314 #define SF_BEFORE_MEOL 0x0002
315 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
316 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
319 # define SF_FIX_SHIFT_EOL (0+2)
320 # define SF_FL_SHIFT_EOL (0+4)
322 # define SF_FIX_SHIFT_EOL (+2)
323 # define SF_FL_SHIFT_EOL (+4)
326 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
327 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
329 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
330 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
331 #define SF_IS_INF 0x0040
332 #define SF_HAS_PAR 0x0080
333 #define SF_IN_PAR 0x0100
334 #define SF_HAS_EVAL 0x0200
335 #define SCF_DO_SUBSTR 0x0400
336 #define SCF_DO_STCLASS_AND 0x0800
337 #define SCF_DO_STCLASS_OR 0x1000
338 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
339 #define SCF_WHILEM_VISITED_POS 0x2000
341 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
342 #define SCF_SEEN_ACCEPT 0x8000
344 #define UTF (RExC_utf8 != 0)
345 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
346 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
348 #define OOB_UNICODE 12345678
349 #define OOB_NAMEDCLASS -1
351 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
352 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
355 /* length of regex to show in messages that don't mark a position within */
356 #define RegexLengthToShowInErrorMessages 127
359 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
360 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
361 * op/pragma/warn/regcomp.
363 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
364 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
366 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
369 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
370 * arg. Show regex, up to a maximum length. If it's too long, chop and add
373 #define FAIL(msg) STMT_START { \
374 const char *ellipses = ""; \
375 IV len = RExC_end - RExC_precomp; \
378 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
379 if (len > RegexLengthToShowInErrorMessages) { \
380 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
381 len = RegexLengthToShowInErrorMessages - 10; \
384 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
385 msg, (int)len, RExC_precomp, ellipses); \
389 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
391 #define Simple_vFAIL(m) STMT_START { \
392 const IV offset = RExC_parse - RExC_precomp; \
393 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
394 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
398 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
400 #define vFAIL(m) STMT_START { \
402 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
407 * Like Simple_vFAIL(), but accepts two arguments.
409 #define Simple_vFAIL2(m,a1) STMT_START { \
410 const IV offset = RExC_parse - RExC_precomp; \
411 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
412 (int)offset, RExC_precomp, RExC_precomp + offset); \
416 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
418 #define vFAIL2(m,a1) STMT_START { \
420 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
421 Simple_vFAIL2(m, a1); \
426 * Like Simple_vFAIL(), but accepts three arguments.
428 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
429 const IV offset = RExC_parse - RExC_precomp; \
430 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
431 (int)offset, RExC_precomp, RExC_precomp + offset); \
435 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
437 #define vFAIL3(m,a1,a2) STMT_START { \
439 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
440 Simple_vFAIL3(m, a1, a2); \
444 * Like Simple_vFAIL(), but accepts four arguments.
446 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
447 const IV offset = RExC_parse - RExC_precomp; \
448 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
449 (int)offset, RExC_precomp, RExC_precomp + offset); \
452 #define vWARN(loc,m) STMT_START { \
453 const IV offset = loc - RExC_precomp; \
454 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
455 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
458 #define vWARNdep(loc,m) STMT_START { \
459 const IV offset = loc - RExC_precomp; \
460 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
461 "%s" REPORT_LOCATION, \
462 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
466 #define vWARN2(loc, m, a1) STMT_START { \
467 const IV offset = loc - RExC_precomp; \
468 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
469 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
472 #define vWARN3(loc, m, a1, a2) STMT_START { \
473 const IV offset = loc - RExC_precomp; \
474 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
475 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
478 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
479 const IV offset = loc - RExC_precomp; \
480 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
481 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
484 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
485 const IV offset = loc - RExC_precomp; \
486 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
487 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
491 /* Allow for side effects in s */
492 #define REGC(c,s) STMT_START { \
493 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
496 /* Macros for recording node offsets. 20001227 mjd@plover.com
497 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
498 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
499 * Element 0 holds the number n.
500 * Position is 1 indexed.
503 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
505 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
506 __LINE__, (int)(node), (int)(byte))); \
508 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
510 RExC_offsets[2*(node)-1] = (byte); \
515 #define Set_Node_Offset(node,byte) \
516 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
517 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
519 #define Set_Node_Length_To_R(node,len) STMT_START { \
521 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
522 __LINE__, (int)(node), (int)(len))); \
524 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
526 RExC_offsets[2*(node)] = (len); \
531 #define Set_Node_Length(node,len) \
532 Set_Node_Length_To_R((node)-RExC_emit_start, len)
533 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
534 #define Set_Node_Cur_Length(node) \
535 Set_Node_Length(node, RExC_parse - parse_start)
537 /* Get offsets and lengths */
538 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
539 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
541 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
542 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
543 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
547 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
548 #define EXPERIMENTAL_INPLACESCAN
551 #define DEBUG_STUDYDATA(data,depth) \
552 DEBUG_OPTIMISE_MORE_r(if(data){ \
553 PerlIO_printf(Perl_debug_log, \
554 "%*s"/* Len:%"IVdf"/%"IVdf" */"Pos:%"IVdf"/%"IVdf \
555 " Flags: %"IVdf" Whilem_c: %"IVdf" Lcp: %"IVdf" ", \
556 (int)(depth)*2, "", \
557 (IV)((data)->pos_min), \
558 (IV)((data)->pos_delta), \
559 (IV)((data)->flags), \
560 (IV)((data)->whilem_c), \
561 (IV)((data)->last_closep ? *((data)->last_closep) : -1) \
563 if ((data)->last_found) \
564 PerlIO_printf(Perl_debug_log, \
565 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
566 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
567 SvPVX_const((data)->last_found), \
568 (IV)((data)->last_end), \
569 (IV)((data)->last_start_min), \
570 (IV)((data)->last_start_max), \
571 ((data)->longest && \
572 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
573 SvPVX_const((data)->longest_fixed), \
574 (IV)((data)->offset_fixed), \
575 ((data)->longest && \
576 (data)->longest==&((data)->longest_float)) ? "*" : "", \
577 SvPVX_const((data)->longest_float), \
578 (IV)((data)->offset_float_min), \
579 (IV)((data)->offset_float_max) \
581 PerlIO_printf(Perl_debug_log,"\n"); \
584 static void clear_re(pTHX_ void *r);
586 /* Mark that we cannot extend a found fixed substring at this point.
587 Update the longest found anchored substring and the longest found
588 floating substrings if needed. */
591 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp)
593 const STRLEN l = CHR_SVLEN(data->last_found);
594 const STRLEN old_l = CHR_SVLEN(*data->longest);
595 GET_RE_DEBUG_FLAGS_DECL;
597 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
598 SvSetMagicSV(*data->longest, data->last_found);
599 if (*data->longest == data->longest_fixed) {
600 data->offset_fixed = l ? data->last_start_min : data->pos_min;
601 if (data->flags & SF_BEFORE_EOL)
603 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
605 data->flags &= ~SF_FIX_BEFORE_EOL;
606 data->minlen_fixed=minlenp;
607 data->lookbehind_fixed=0;
610 data->offset_float_min = l ? data->last_start_min : data->pos_min;
611 data->offset_float_max = (l
612 ? data->last_start_max
613 : data->pos_min + data->pos_delta);
614 if ((U32)data->offset_float_max > (U32)I32_MAX)
615 data->offset_float_max = I32_MAX;
616 if (data->flags & SF_BEFORE_EOL)
618 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
620 data->flags &= ~SF_FL_BEFORE_EOL;
621 data->minlen_float=minlenp;
622 data->lookbehind_float=0;
625 SvCUR_set(data->last_found, 0);
627 SV * const sv = data->last_found;
628 if (SvUTF8(sv) && SvMAGICAL(sv)) {
629 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
635 data->flags &= ~SF_BEFORE_EOL;
636 DEBUG_STUDYDATA(data,0);
639 /* Can match anything (initialization) */
641 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
643 ANYOF_CLASS_ZERO(cl);
644 ANYOF_BITMAP_SETALL(cl);
645 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
647 cl->flags |= ANYOF_LOCALE;
650 /* Can match anything (initialization) */
652 S_cl_is_anything(const struct regnode_charclass_class *cl)
656 for (value = 0; value <= ANYOF_MAX; value += 2)
657 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
659 if (!(cl->flags & ANYOF_UNICODE_ALL))
661 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
666 /* Can match anything (initialization) */
668 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
670 Zero(cl, 1, struct regnode_charclass_class);
672 cl_anything(pRExC_state, cl);
676 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
678 Zero(cl, 1, struct regnode_charclass_class);
680 cl_anything(pRExC_state, cl);
682 cl->flags |= ANYOF_LOCALE;
685 /* 'And' a given class with another one. Can create false positives */
686 /* We assume that cl is not inverted */
688 S_cl_and(struct regnode_charclass_class *cl,
689 const struct regnode_charclass_class *and_with)
692 assert(and_with->type == ANYOF);
693 if (!(and_with->flags & ANYOF_CLASS)
694 && !(cl->flags & ANYOF_CLASS)
695 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
696 && !(and_with->flags & ANYOF_FOLD)
697 && !(cl->flags & ANYOF_FOLD)) {
700 if (and_with->flags & ANYOF_INVERT)
701 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
702 cl->bitmap[i] &= ~and_with->bitmap[i];
704 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
705 cl->bitmap[i] &= and_with->bitmap[i];
706 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
707 if (!(and_with->flags & ANYOF_EOS))
708 cl->flags &= ~ANYOF_EOS;
710 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
711 !(and_with->flags & ANYOF_INVERT)) {
712 cl->flags &= ~ANYOF_UNICODE_ALL;
713 cl->flags |= ANYOF_UNICODE;
714 ARG_SET(cl, ARG(and_with));
716 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
717 !(and_with->flags & ANYOF_INVERT))
718 cl->flags &= ~ANYOF_UNICODE_ALL;
719 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
720 !(and_with->flags & ANYOF_INVERT))
721 cl->flags &= ~ANYOF_UNICODE;
724 /* 'OR' a given class with another one. Can create false positives */
725 /* We assume that cl is not inverted */
727 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
729 if (or_with->flags & ANYOF_INVERT) {
731 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
732 * <= (B1 | !B2) | (CL1 | !CL2)
733 * which is wasteful if CL2 is small, but we ignore CL2:
734 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
735 * XXXX Can we handle case-fold? Unclear:
736 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
737 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
739 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
740 && !(or_with->flags & ANYOF_FOLD)
741 && !(cl->flags & ANYOF_FOLD) ) {
744 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
745 cl->bitmap[i] |= ~or_with->bitmap[i];
746 } /* XXXX: logic is complicated otherwise */
748 cl_anything(pRExC_state, cl);
751 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
752 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
753 && (!(or_with->flags & ANYOF_FOLD)
754 || (cl->flags & ANYOF_FOLD)) ) {
757 /* OR char bitmap and class bitmap separately */
758 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
759 cl->bitmap[i] |= or_with->bitmap[i];
760 if (or_with->flags & ANYOF_CLASS) {
761 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
762 cl->classflags[i] |= or_with->classflags[i];
763 cl->flags |= ANYOF_CLASS;
766 else { /* XXXX: logic is complicated, leave it along for a moment. */
767 cl_anything(pRExC_state, cl);
770 if (or_with->flags & ANYOF_EOS)
771 cl->flags |= ANYOF_EOS;
773 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
774 ARG(cl) != ARG(or_with)) {
775 cl->flags |= ANYOF_UNICODE_ALL;
776 cl->flags &= ~ANYOF_UNICODE;
778 if (or_with->flags & ANYOF_UNICODE_ALL) {
779 cl->flags |= ANYOF_UNICODE_ALL;
780 cl->flags &= ~ANYOF_UNICODE;
784 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
785 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
786 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
787 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
793 dump_trie_interim_list(trie,next_alloc)
794 dump_trie_interim_table(trie,next_alloc)
796 These routines dump out a trie in a somewhat readable format.
797 The _interim_ variants are used for debugging the interim
798 tables that are used to generate the final compressed
799 representation which is what dump_trie expects.
801 Part of the reason for their existance is to provide a form
802 of documentation as to how the different representations function.
808 Dumps the final compressed table form of the trie to Perl_debug_log.
809 Used for debugging make_trie().
813 S_dump_trie(pTHX_ const struct _reg_trie_data *trie,U32 depth)
816 SV *sv=sv_newmortal();
817 int colwidth= trie->widecharmap ? 6 : 4;
818 GET_RE_DEBUG_FLAGS_DECL;
821 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
822 (int)depth * 2 + 2,"",
823 "Match","Base","Ofs" );
825 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
826 SV ** const tmp = av_fetch( trie->revcharmap, state, 0);
828 PerlIO_printf( Perl_debug_log, "%*s",
830 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
831 PL_colors[0], PL_colors[1],
832 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
833 PERL_PV_ESCAPE_FIRSTCHAR
838 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
839 (int)depth * 2 + 2,"");
841 for( state = 0 ; state < trie->uniquecharcount ; state++ )
842 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
843 PerlIO_printf( Perl_debug_log, "\n");
845 for( state = 1 ; state < trie->statecount ; state++ ) {
846 const U32 base = trie->states[ state ].trans.base;
848 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
850 if ( trie->states[ state ].wordnum ) {
851 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
853 PerlIO_printf( Perl_debug_log, "%6s", "" );
856 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
861 while( ( base + ofs < trie->uniquecharcount ) ||
862 ( base + ofs - trie->uniquecharcount < trie->lasttrans
863 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
866 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
868 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
869 if ( ( base + ofs >= trie->uniquecharcount ) &&
870 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
871 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
873 PerlIO_printf( Perl_debug_log, "%*"UVXf,
875 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
877 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
881 PerlIO_printf( Perl_debug_log, "]");
884 PerlIO_printf( Perl_debug_log, "\n" );
888 dump_trie_interim_list(trie,next_alloc)
889 Dumps a fully constructed but uncompressed trie in list form.
890 List tries normally only are used for construction when the number of
891 possible chars (trie->uniquecharcount) is very high.
892 Used for debugging make_trie().
895 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc,U32 depth)
898 SV *sv=sv_newmortal();
899 int colwidth= trie->widecharmap ? 6 : 4;
900 GET_RE_DEBUG_FLAGS_DECL;
901 /* print out the table precompression. */
902 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
903 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
904 "------:-----+-----------------\n" );
906 for( state=1 ; state < next_alloc ; state ++ ) {
909 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
910 (int)depth * 2 + 2,"", (UV)state );
911 if ( ! trie->states[ state ].wordnum ) {
912 PerlIO_printf( Perl_debug_log, "%5s| ","");
914 PerlIO_printf( Perl_debug_log, "W%4x| ",
915 trie->states[ state ].wordnum
918 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
919 SV ** const tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
921 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
923 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
924 PL_colors[0], PL_colors[1],
925 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
926 PERL_PV_ESCAPE_FIRSTCHAR
928 TRIE_LIST_ITEM(state,charid).forid,
929 (UV)TRIE_LIST_ITEM(state,charid).newstate
932 PerlIO_printf(Perl_debug_log, "\n%*s| ",
933 (int)((depth * 2) + 14), "");
936 PerlIO_printf( Perl_debug_log, "\n");
941 dump_trie_interim_table(trie,next_alloc)
942 Dumps a fully constructed but uncompressed trie in table form.
943 This is the normal DFA style state transition table, with a few
944 twists to facilitate compression later.
945 Used for debugging make_trie().
948 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc, U32 depth)
952 SV *sv=sv_newmortal();
953 int colwidth= trie->widecharmap ? 6 : 4;
954 GET_RE_DEBUG_FLAGS_DECL;
957 print out the table precompression so that we can do a visual check
958 that they are identical.
961 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
963 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
964 SV ** const tmp = av_fetch( trie->revcharmap, charid, 0);
966 PerlIO_printf( Perl_debug_log, "%*s",
968 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
969 PL_colors[0], PL_colors[1],
970 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
971 PERL_PV_ESCAPE_FIRSTCHAR
977 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
979 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
980 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
983 PerlIO_printf( Perl_debug_log, "\n" );
985 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
987 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
988 (int)depth * 2 + 2,"",
989 (UV)TRIE_NODENUM( state ) );
991 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
992 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
994 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
996 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
998 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
999 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1001 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1002 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1009 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1010 startbranch: the first branch in the whole branch sequence
1011 first : start branch of sequence of branch-exact nodes.
1012 May be the same as startbranch
1013 last : Thing following the last branch.
1014 May be the same as tail.
1015 tail : item following the branch sequence
1016 count : words in the sequence
1017 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1018 depth : indent depth
1020 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1022 A trie is an N'ary tree where the branches are determined by digital
1023 decomposition of the key. IE, at the root node you look up the 1st character and
1024 follow that branch repeat until you find the end of the branches. Nodes can be
1025 marked as "accepting" meaning they represent a complete word. Eg:
1029 would convert into the following structure. Numbers represent states, letters
1030 following numbers represent valid transitions on the letter from that state, if
1031 the number is in square brackets it represents an accepting state, otherwise it
1032 will be in parenthesis.
1034 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1038 (1) +-i->(6)-+-s->[7]
1040 +-s->(3)-+-h->(4)-+-e->[5]
1042 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1044 This shows that when matching against the string 'hers' we will begin at state 1
1045 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1046 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1047 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1048 single traverse. We store a mapping from accepting to state to which word was
1049 matched, and then when we have multiple possibilities we try to complete the
1050 rest of the regex in the order in which they occured in the alternation.
1052 The only prior NFA like behaviour that would be changed by the TRIE support is
1053 the silent ignoring of duplicate alternations which are of the form:
1055 / (DUPE|DUPE) X? (?{ ... }) Y /x
1057 Thus EVAL blocks follwing a trie may be called a different number of times with
1058 and without the optimisation. With the optimisations dupes will be silently
1059 ignored. This inconsistant behaviour of EVAL type nodes is well established as
1060 the following demonstrates:
1062 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1064 which prints out 'word' three times, but
1066 'words'=~/(word|word|word)(?{ print $1 })S/
1068 which doesnt print it out at all. This is due to other optimisations kicking in.
1070 Example of what happens on a structural level:
1072 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1074 1: CURLYM[1] {1,32767}(18)
1085 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1086 and should turn into:
1088 1: CURLYM[1] {1,32767}(18)
1090 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1098 Cases where tail != last would be like /(?foo|bar)baz/:
1108 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1109 and would end up looking like:
1112 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1119 d = uvuni_to_utf8_flags(d, uv, 0);
1121 is the recommended Unicode-aware way of saying
1126 #define TRIE_STORE_REVCHAR \
1128 SV *tmp = newSVpvs(""); \
1129 if (UTF) SvUTF8_on(tmp); \
1130 Perl_sv_catpvf( aTHX_ tmp, "%c", (int)uvc ); \
1131 av_push( TRIE_REVCHARMAP(trie), tmp ); \
1134 #define TRIE_READ_CHAR STMT_START { \
1138 if ( foldlen > 0 ) { \
1139 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1144 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1145 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1146 foldlen -= UNISKIP( uvc ); \
1147 scan = foldbuf + UNISKIP( uvc ); \
1150 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1160 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1161 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1162 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1163 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1165 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1166 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1167 TRIE_LIST_CUR( state )++; \
1170 #define TRIE_LIST_NEW(state) STMT_START { \
1171 Newxz( trie->states[ state ].trans.list, \
1172 4, reg_trie_trans_le ); \
1173 TRIE_LIST_CUR( state ) = 1; \
1174 TRIE_LIST_LEN( state ) = 4; \
1177 #define TRIE_HANDLE_WORD(state) STMT_START { \
1178 U16 dupe= trie->states[ state ].wordnum; \
1179 regnode * const noper_next = regnext( noper ); \
1181 if (trie->wordlen) \
1182 trie->wordlen[ curword ] = wordlen; \
1184 /* store the word for dumping */ \
1186 if (OP(noper) != NOTHING) \
1187 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
1189 tmp = newSVpvn( "", 0 ); \
1190 if ( UTF ) SvUTF8_on( tmp ); \
1191 av_push( trie->words, tmp ); \
1196 if ( noper_next < tail ) { \
1198 Newxz( trie->jump, word_count + 1, U16); \
1199 trie->jump[curword] = (U16)(noper_next - convert); \
1201 jumper = noper_next; \
1203 nextbranch= regnext(cur); \
1207 /* So it's a dupe. This means we need to maintain a */\
1208 /* linked-list from the first to the next. */\
1209 /* we only allocate the nextword buffer when there */\
1210 /* a dupe, so first time we have to do the allocation */\
1211 if (!trie->nextword) \
1212 Newxz( trie->nextword, word_count + 1, U16); \
1213 while ( trie->nextword[dupe] ) \
1214 dupe= trie->nextword[dupe]; \
1215 trie->nextword[dupe]= curword; \
1217 /* we haven't inserted this word yet. */ \
1218 trie->states[ state ].wordnum = curword; \
1223 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1224 ( ( base + charid >= ucharcount \
1225 && base + charid < ubound \
1226 && state == trie->trans[ base - ucharcount + charid ].check \
1227 && trie->trans[ base - ucharcount + charid ].next ) \
1228 ? trie->trans[ base - ucharcount + charid ].next \
1229 : ( state==1 ? special : 0 ) \
1233 #define MADE_JUMP_TRIE 2
1234 #define MADE_EXACT_TRIE 4
1237 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1240 /* first pass, loop through and scan words */
1241 reg_trie_data *trie;
1243 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1248 regnode *jumper = NULL;
1249 regnode *nextbranch = NULL;
1250 regnode *convert = NULL;
1251 /* we just use folder as a flag in utf8 */
1252 const U8 * const folder = ( flags == EXACTF
1254 : ( flags == EXACTFL
1260 const U32 data_slot = add_data( pRExC_state, 1, "t" );
1261 SV *re_trie_maxbuff;
1263 /* these are only used during construction but are useful during
1264 * debugging so we store them in the struct when debugging.
1266 STRLEN trie_charcount=0;
1267 AV *trie_revcharmap;
1269 GET_RE_DEBUG_FLAGS_DECL;
1271 PERL_UNUSED_ARG(depth);
1274 Newxz( trie, 1, reg_trie_data );
1276 trie->startstate = 1;
1277 trie->wordcount = word_count;
1278 RExC_rx->data->data[ data_slot ] = (void*)trie;
1279 Newxz( trie->charmap, 256, U16 );
1280 if (!(UTF && folder))
1281 Newxz( trie->bitmap, ANYOF_BITMAP_SIZE, char );
1283 trie->words = newAV();
1285 TRIE_REVCHARMAP(trie) = newAV();
1287 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1288 if (!SvIOK(re_trie_maxbuff)) {
1289 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1292 PerlIO_printf( Perl_debug_log,
1293 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1294 (int)depth * 2 + 2, "",
1295 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1296 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1300 /* Find the node we are going to overwrite */
1301 if ( first == startbranch && OP( last ) != BRANCH ) {
1302 /* whole branch chain */
1305 /* branch sub-chain */
1306 convert = NEXTOPER( first );
1309 /* -- First loop and Setup --
1311 We first traverse the branches and scan each word to determine if it
1312 contains widechars, and how many unique chars there are, this is
1313 important as we have to build a table with at least as many columns as we
1316 We use an array of integers to represent the character codes 0..255
1317 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1318 native representation of the character value as the key and IV's for the
1321 *TODO* If we keep track of how many times each character is used we can
1322 remap the columns so that the table compression later on is more
1323 efficient in terms of memory by ensuring most common value is in the
1324 middle and the least common are on the outside. IMO this would be better
1325 than a most to least common mapping as theres a decent chance the most
1326 common letter will share a node with the least common, meaning the node
1327 will not be compressable. With a middle is most common approach the worst
1328 case is when we have the least common nodes twice.
1332 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1333 regnode * const noper = NEXTOPER( cur );
1334 const U8 *uc = (U8*)STRING( noper );
1335 const U8 * const e = uc + STR_LEN( noper );
1337 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1338 const U8 *scan = (U8*)NULL;
1339 U32 wordlen = 0; /* required init */
1342 if (OP(noper) == NOTHING) {
1347 TRIE_BITMAP_SET(trie,*uc);
1348 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1350 for ( ; uc < e ; uc += len ) {
1351 TRIE_CHARCOUNT(trie)++;
1355 if ( !trie->charmap[ uvc ] ) {
1356 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1358 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1363 if ( !trie->widecharmap )
1364 trie->widecharmap = newHV();
1366 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1369 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1371 if ( !SvTRUE( *svpp ) ) {
1372 sv_setiv( *svpp, ++trie->uniquecharcount );
1377 if( cur == first ) {
1380 } else if (chars < trie->minlen) {
1382 } else if (chars > trie->maxlen) {
1386 } /* end first pass */
1387 DEBUG_TRIE_COMPILE_r(
1388 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1389 (int)depth * 2 + 2,"",
1390 ( trie->widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1391 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1392 (int)trie->minlen, (int)trie->maxlen )
1394 Newxz( trie->wordlen, word_count, U32 );
1397 We now know what we are dealing with in terms of unique chars and
1398 string sizes so we can calculate how much memory a naive
1399 representation using a flat table will take. If it's over a reasonable
1400 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1401 conservative but potentially much slower representation using an array
1404 At the end we convert both representations into the same compressed
1405 form that will be used in regexec.c for matching with. The latter
1406 is a form that cannot be used to construct with but has memory
1407 properties similar to the list form and access properties similar
1408 to the table form making it both suitable for fast searches and
1409 small enough that its feasable to store for the duration of a program.
1411 See the comment in the code where the compressed table is produced
1412 inplace from the flat tabe representation for an explanation of how
1413 the compression works.
1418 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1420 Second Pass -- Array Of Lists Representation
1422 Each state will be represented by a list of charid:state records
1423 (reg_trie_trans_le) the first such element holds the CUR and LEN
1424 points of the allocated array. (See defines above).
1426 We build the initial structure using the lists, and then convert
1427 it into the compressed table form which allows faster lookups
1428 (but cant be modified once converted).
1431 STRLEN transcount = 1;
1433 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1434 "%*sCompiling trie using list compiler\n",
1435 (int)depth * 2 + 2, ""));
1437 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
1441 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1443 regnode * const noper = NEXTOPER( cur );
1444 U8 *uc = (U8*)STRING( noper );
1445 const U8 * const e = uc + STR_LEN( noper );
1446 U32 state = 1; /* required init */
1447 U16 charid = 0; /* sanity init */
1448 U8 *scan = (U8*)NULL; /* sanity init */
1449 STRLEN foldlen = 0; /* required init */
1450 U32 wordlen = 0; /* required init */
1451 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1453 if (OP(noper) != NOTHING) {
1454 for ( ; uc < e ; uc += len ) {
1459 charid = trie->charmap[ uvc ];
1461 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1465 charid=(U16)SvIV( *svpp );
1468 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1475 if ( !trie->states[ state ].trans.list ) {
1476 TRIE_LIST_NEW( state );
1478 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1479 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1480 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1485 newstate = next_alloc++;
1486 TRIE_LIST_PUSH( state, charid, newstate );
1491 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1495 TRIE_HANDLE_WORD(state);
1497 } /* end second pass */
1499 /* next alloc is the NEXT state to be allocated */
1500 trie->statecount = next_alloc;
1501 Renew( trie->states, next_alloc, reg_trie_state );
1503 /* and now dump it out before we compress it */
1504 DEBUG_TRIE_COMPILE_MORE_r(
1505 dump_trie_interim_list(trie,next_alloc,depth+1)
1508 Newxz( trie->trans, transcount ,reg_trie_trans );
1515 for( state=1 ; state < next_alloc ; state ++ ) {
1519 DEBUG_TRIE_COMPILE_MORE_r(
1520 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1524 if (trie->states[state].trans.list) {
1525 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1529 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1530 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1531 if ( forid < minid ) {
1533 } else if ( forid > maxid ) {
1537 if ( transcount < tp + maxid - minid + 1) {
1539 Renew( trie->trans, transcount, reg_trie_trans );
1540 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1542 base = trie->uniquecharcount + tp - minid;
1543 if ( maxid == minid ) {
1545 for ( ; zp < tp ; zp++ ) {
1546 if ( ! trie->trans[ zp ].next ) {
1547 base = trie->uniquecharcount + zp - minid;
1548 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1549 trie->trans[ zp ].check = state;
1555 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1556 trie->trans[ tp ].check = state;
1561 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1562 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1563 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1564 trie->trans[ tid ].check = state;
1566 tp += ( maxid - minid + 1 );
1568 Safefree(trie->states[ state ].trans.list);
1571 DEBUG_TRIE_COMPILE_MORE_r(
1572 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1575 trie->states[ state ].trans.base=base;
1577 trie->lasttrans = tp + 1;
1581 Second Pass -- Flat Table Representation.
1583 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1584 We know that we will need Charcount+1 trans at most to store the data
1585 (one row per char at worst case) So we preallocate both structures
1586 assuming worst case.
1588 We then construct the trie using only the .next slots of the entry
1591 We use the .check field of the first entry of the node temporarily to
1592 make compression both faster and easier by keeping track of how many non
1593 zero fields are in the node.
1595 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1598 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1599 number representing the first entry of the node, and state as a
1600 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1601 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1602 are 2 entrys per node. eg:
1610 The table is internally in the right hand, idx form. However as we also
1611 have to deal with the states array which is indexed by nodenum we have to
1612 use TRIE_NODENUM() to convert.
1615 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1616 "%*sCompiling trie using table compiler\n",
1617 (int)depth * 2 + 2, ""));
1619 Newxz( trie->trans, ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1,
1621 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
1622 next_alloc = trie->uniquecharcount + 1;
1625 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1627 regnode * const noper = NEXTOPER( cur );
1628 const U8 *uc = (U8*)STRING( noper );
1629 const U8 * const e = uc + STR_LEN( noper );
1631 U32 state = 1; /* required init */
1633 U16 charid = 0; /* sanity init */
1634 U32 accept_state = 0; /* sanity init */
1635 U8 *scan = (U8*)NULL; /* sanity init */
1637 STRLEN foldlen = 0; /* required init */
1638 U32 wordlen = 0; /* required init */
1639 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1641 if ( OP(noper) != NOTHING ) {
1642 for ( ; uc < e ; uc += len ) {
1647 charid = trie->charmap[ uvc ];
1649 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1650 charid = svpp ? (U16)SvIV(*svpp) : 0;
1654 if ( !trie->trans[ state + charid ].next ) {
1655 trie->trans[ state + charid ].next = next_alloc;
1656 trie->trans[ state ].check++;
1657 next_alloc += trie->uniquecharcount;
1659 state = trie->trans[ state + charid ].next;
1661 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1663 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1666 accept_state = TRIE_NODENUM( state );
1667 TRIE_HANDLE_WORD(accept_state);
1669 } /* end second pass */
1671 /* and now dump it out before we compress it */
1672 DEBUG_TRIE_COMPILE_MORE_r(
1673 dump_trie_interim_table(trie,next_alloc,depth+1)
1678 * Inplace compress the table.*
1680 For sparse data sets the table constructed by the trie algorithm will
1681 be mostly 0/FAIL transitions or to put it another way mostly empty.
1682 (Note that leaf nodes will not contain any transitions.)
1684 This algorithm compresses the tables by eliminating most such
1685 transitions, at the cost of a modest bit of extra work during lookup:
1687 - Each states[] entry contains a .base field which indicates the
1688 index in the state[] array wheres its transition data is stored.
1690 - If .base is 0 there are no valid transitions from that node.
1692 - If .base is nonzero then charid is added to it to find an entry in
1695 -If trans[states[state].base+charid].check!=state then the
1696 transition is taken to be a 0/Fail transition. Thus if there are fail
1697 transitions at the front of the node then the .base offset will point
1698 somewhere inside the previous nodes data (or maybe even into a node
1699 even earlier), but the .check field determines if the transition is
1703 The following process inplace converts the table to the compressed
1704 table: We first do not compress the root node 1,and mark its all its
1705 .check pointers as 1 and set its .base pointer as 1 as well. This
1706 allows to do a DFA construction from the compressed table later, and
1707 ensures that any .base pointers we calculate later are greater than
1710 - We set 'pos' to indicate the first entry of the second node.
1712 - We then iterate over the columns of the node, finding the first and
1713 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1714 and set the .check pointers accordingly, and advance pos
1715 appropriately and repreat for the next node. Note that when we copy
1716 the next pointers we have to convert them from the original
1717 NODEIDX form to NODENUM form as the former is not valid post
1720 - If a node has no transitions used we mark its base as 0 and do not
1721 advance the pos pointer.
1723 - If a node only has one transition we use a second pointer into the
1724 structure to fill in allocated fail transitions from other states.
1725 This pointer is independent of the main pointer and scans forward
1726 looking for null transitions that are allocated to a state. When it
1727 finds one it writes the single transition into the "hole". If the
1728 pointer doesnt find one the single transition is appended as normal.
1730 - Once compressed we can Renew/realloc the structures to release the
1733 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1734 specifically Fig 3.47 and the associated pseudocode.
1738 const U32 laststate = TRIE_NODENUM( next_alloc );
1741 trie->statecount = laststate;
1743 for ( state = 1 ; state < laststate ; state++ ) {
1745 const U32 stateidx = TRIE_NODEIDX( state );
1746 const U32 o_used = trie->trans[ stateidx ].check;
1747 U32 used = trie->trans[ stateidx ].check;
1748 trie->trans[ stateidx ].check = 0;
1750 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1751 if ( flag || trie->trans[ stateidx + charid ].next ) {
1752 if ( trie->trans[ stateidx + charid ].next ) {
1754 for ( ; zp < pos ; zp++ ) {
1755 if ( ! trie->trans[ zp ].next ) {
1759 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1760 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1761 trie->trans[ zp ].check = state;
1762 if ( ++zp > pos ) pos = zp;
1769 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1771 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1772 trie->trans[ pos ].check = state;
1777 trie->lasttrans = pos + 1;
1778 Renew( trie->states, laststate, reg_trie_state);
1779 DEBUG_TRIE_COMPILE_MORE_r(
1780 PerlIO_printf( Perl_debug_log,
1781 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1782 (int)depth * 2 + 2,"",
1783 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1786 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1789 } /* end table compress */
1791 DEBUG_TRIE_COMPILE_MORE_r(
1792 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1793 (int)depth * 2 + 2, "",
1794 (UV)trie->statecount,
1795 (UV)trie->lasttrans)
1797 /* resize the trans array to remove unused space */
1798 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
1800 /* and now dump out the compressed format */
1801 DEBUG_TRIE_COMPILE_r(
1802 dump_trie(trie,depth+1)
1805 { /* Modify the program and insert the new TRIE node*/
1806 U8 nodetype =(U8)(flags & 0xFF);
1810 regnode *optimize = NULL;
1812 U32 mjd_nodelen = 0;
1815 This means we convert either the first branch or the first Exact,
1816 depending on whether the thing following (in 'last') is a branch
1817 or not and whther first is the startbranch (ie is it a sub part of
1818 the alternation or is it the whole thing.)
1819 Assuming its a sub part we conver the EXACT otherwise we convert
1820 the whole branch sequence, including the first.
1822 /* Find the node we are going to overwrite */
1823 if ( first != startbranch || OP( last ) == BRANCH ) {
1824 /* branch sub-chain */
1825 NEXT_OFF( first ) = (U16)(last - first);
1827 mjd_offset= Node_Offset((convert));
1828 mjd_nodelen= Node_Length((convert));
1830 /* whole branch chain */
1833 const regnode *nop = NEXTOPER( convert );
1834 mjd_offset= Node_Offset((nop));
1835 mjd_nodelen= Node_Length((nop));
1840 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1841 (int)depth * 2 + 2, "",
1842 (UV)mjd_offset, (UV)mjd_nodelen)
1845 /* But first we check to see if there is a common prefix we can
1846 split out as an EXACT and put in front of the TRIE node. */
1847 trie->startstate= 1;
1848 if ( trie->bitmap && !trie->widecharmap && !trie->jump ) {
1850 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
1854 const U32 base = trie->states[ state ].trans.base;
1856 if ( trie->states[state].wordnum )
1859 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1860 if ( ( base + ofs >= trie->uniquecharcount ) &&
1861 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1862 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1864 if ( ++count > 1 ) {
1865 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), ofs, 0);
1866 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
1867 if ( state == 1 ) break;
1869 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1871 PerlIO_printf(Perl_debug_log,
1872 "%*sNew Start State=%"UVuf" Class: [",
1873 (int)depth * 2 + 2, "",
1876 SV ** const tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1877 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
1879 TRIE_BITMAP_SET(trie,*ch);
1881 TRIE_BITMAP_SET(trie, folder[ *ch ]);
1883 PerlIO_printf(Perl_debug_log, (char*)ch)
1887 TRIE_BITMAP_SET(trie,*ch);
1889 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1890 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1896 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1897 const char *ch = SvPV_nolen_const( *tmp );
1899 PerlIO_printf( Perl_debug_log,
1900 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1901 (int)depth * 2 + 2, "",
1902 (UV)state, (UV)idx, ch)
1905 OP( convert ) = nodetype;
1906 str=STRING(convert);
1915 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
1921 regnode *n = convert+NODE_SZ_STR(convert);
1922 NEXT_OFF(convert) = NODE_SZ_STR(convert);
1923 trie->startstate = state;
1924 trie->minlen -= (state - 1);
1925 trie->maxlen -= (state - 1);
1927 regnode *fix = convert;
1929 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1930 while( ++fix < n ) {
1931 Set_Node_Offset_Length(fix, 0, 0);
1937 NEXT_OFF(convert) = (U16)(tail - convert);
1938 DEBUG_r(optimize= n);
1944 if ( trie->maxlen ) {
1945 NEXT_OFF( convert ) = (U16)(tail - convert);
1946 ARG_SET( convert, data_slot );
1947 /* Store the offset to the first unabsorbed branch in
1948 jump[0], which is otherwise unused by the jump logic.
1949 We use this when dumping a trie and during optimisation. */
1951 trie->jump[0] = (U16)(nextbranch - convert);
1954 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1955 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
1957 OP( convert ) = TRIEC;
1958 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
1959 Safefree(trie->bitmap);
1962 OP( convert ) = TRIE;
1964 /* store the type in the flags */
1965 convert->flags = nodetype;
1969 + regarglen[ OP( convert ) ];
1971 /* XXX We really should free up the resource in trie now,
1972 as we won't use them - (which resources?) dmq */
1974 /* needed for dumping*/
1975 DEBUG_r(if (optimize) {
1976 regnode *opt = convert;
1977 while ( ++opt < optimize) {
1978 Set_Node_Offset_Length(opt,0,0);
1981 Try to clean up some of the debris left after the
1984 while( optimize < jumper ) {
1985 mjd_nodelen += Node_Length((optimize));
1986 OP( optimize ) = OPTIMIZED;
1987 Set_Node_Offset_Length(optimize,0,0);
1990 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
1992 } /* end node insert */
1994 SvREFCNT_dec(TRIE_REVCHARMAP(trie));
1998 : trie->startstate>1
2004 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2006 /* The Trie is constructed and compressed now so we can build a fail array now if its needed
2008 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2009 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2012 We find the fail state for each state in the trie, this state is the longest proper
2013 suffix of the current states 'word' that is also a proper prefix of another word in our
2014 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2015 the DFA not to have to restart after its tried and failed a word at a given point, it
2016 simply continues as though it had been matching the other word in the first place.
2018 'abcdgu'=~/abcdefg|cdgu/
2019 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2020 fail, which would bring use to the state representing 'd' in the second word where we would
2021 try 'g' and succeed, prodceding to match 'cdgu'.
2023 /* add a fail transition */
2024 reg_trie_data *trie=(reg_trie_data *)RExC_rx->data->data[ARG(source)];
2026 const U32 ucharcount = trie->uniquecharcount;
2027 const U32 numstates = trie->statecount;
2028 const U32 ubound = trie->lasttrans + ucharcount;
2032 U32 base = trie->states[ 1 ].trans.base;
2035 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2036 GET_RE_DEBUG_FLAGS_DECL;
2038 PERL_UNUSED_ARG(depth);
2042 ARG_SET( stclass, data_slot );
2043 Newxz( aho, 1, reg_ac_data );
2044 RExC_rx->data->data[ data_slot ] = (void*)aho;
2046 aho->states=(reg_trie_state *)savepvn((const char*)trie->states,
2047 numstates * sizeof(reg_trie_state));
2048 Newxz( q, numstates, U32);
2049 Newxz( aho->fail, numstates, U32 );
2052 /* initialize fail[0..1] to be 1 so that we always have
2053 a valid final fail state */
2054 fail[ 0 ] = fail[ 1 ] = 1;
2056 for ( charid = 0; charid < ucharcount ; charid++ ) {
2057 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2059 q[ q_write ] = newstate;
2060 /* set to point at the root */
2061 fail[ q[ q_write++ ] ]=1;
2064 while ( q_read < q_write) {
2065 const U32 cur = q[ q_read++ % numstates ];
2066 base = trie->states[ cur ].trans.base;
2068 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2069 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2071 U32 fail_state = cur;
2074 fail_state = fail[ fail_state ];
2075 fail_base = aho->states[ fail_state ].trans.base;
2076 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2078 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2079 fail[ ch_state ] = fail_state;
2080 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2082 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2084 q[ q_write++ % numstates] = ch_state;
2088 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2089 when we fail in state 1, this allows us to use the
2090 charclass scan to find a valid start char. This is based on the principle
2091 that theres a good chance the string being searched contains lots of stuff
2092 that cant be a start char.
2094 fail[ 0 ] = fail[ 1 ] = 0;
2095 DEBUG_TRIE_COMPILE_r({
2096 PerlIO_printf(Perl_debug_log,
2097 "%*sStclass Failtable (%"UVuf" states): 0",
2098 (int)(depth * 2), "", (UV)numstates
2100 for( q_read=1; q_read<numstates; q_read++ ) {
2101 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2103 PerlIO_printf(Perl_debug_log, "\n");
2106 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2111 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2112 * These need to be revisited when a newer toolchain becomes available.
2114 #if defined(__sparc64__) && defined(__GNUC__)
2115 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2116 # undef SPARC64_GCC_WORKAROUND
2117 # define SPARC64_GCC_WORKAROUND 1
2121 #define DEBUG_PEEP(str,scan,depth) \
2122 DEBUG_OPTIMISE_r({if (scan){ \
2123 SV * const mysv=sv_newmortal(); \
2124 regnode *Next = regnext(scan); \
2125 regprop(RExC_rx, mysv, scan); \
2126 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2127 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2128 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2135 #define JOIN_EXACT(scan,min,flags) \
2136 if (PL_regkind[OP(scan)] == EXACT) \
2137 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2140 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2141 /* Merge several consecutive EXACTish nodes into one. */
2142 regnode *n = regnext(scan);
2144 regnode *next = scan + NODE_SZ_STR(scan);
2148 regnode *stop = scan;
2149 GET_RE_DEBUG_FLAGS_DECL;
2151 PERL_UNUSED_ARG(depth);
2153 #ifndef EXPERIMENTAL_INPLACESCAN
2154 PERL_UNUSED_ARG(flags);
2155 PERL_UNUSED_ARG(val);
2157 DEBUG_PEEP("join",scan,depth);
2159 /* Skip NOTHING, merge EXACT*. */
2161 ( PL_regkind[OP(n)] == NOTHING ||
2162 (stringok && (OP(n) == OP(scan))))
2164 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2166 if (OP(n) == TAIL || n > next)
2168 if (PL_regkind[OP(n)] == NOTHING) {
2169 DEBUG_PEEP("skip:",n,depth);
2170 NEXT_OFF(scan) += NEXT_OFF(n);
2171 next = n + NODE_STEP_REGNODE;
2178 else if (stringok) {
2179 const unsigned int oldl = STR_LEN(scan);
2180 regnode * const nnext = regnext(n);
2182 DEBUG_PEEP("merg",n,depth);
2185 if (oldl + STR_LEN(n) > U8_MAX)
2187 NEXT_OFF(scan) += NEXT_OFF(n);
2188 STR_LEN(scan) += STR_LEN(n);
2189 next = n + NODE_SZ_STR(n);
2190 /* Now we can overwrite *n : */
2191 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2199 #ifdef EXPERIMENTAL_INPLACESCAN
2200 if (flags && !NEXT_OFF(n)) {
2201 DEBUG_PEEP("atch", val, depth);
2202 if (reg_off_by_arg[OP(n)]) {
2203 ARG_SET(n, val - n);
2206 NEXT_OFF(n) = val - n;
2213 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2215 Two problematic code points in Unicode casefolding of EXACT nodes:
2217 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2218 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2224 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2225 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2227 This means that in case-insensitive matching (or "loose matching",
2228 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2229 length of the above casefolded versions) can match a target string
2230 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2231 This would rather mess up the minimum length computation.
2233 What we'll do is to look for the tail four bytes, and then peek
2234 at the preceding two bytes to see whether we need to decrease
2235 the minimum length by four (six minus two).
2237 Thanks to the design of UTF-8, there cannot be false matches:
2238 A sequence of valid UTF-8 bytes cannot be a subsequence of
2239 another valid sequence of UTF-8 bytes.
2242 char * const s0 = STRING(scan), *s, *t;
2243 char * const s1 = s0 + STR_LEN(scan) - 1;
2244 char * const s2 = s1 - 4;
2245 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2246 const char t0[] = "\xaf\x49\xaf\x42";
2248 const char t0[] = "\xcc\x88\xcc\x81";
2250 const char * const t1 = t0 + 3;
2253 s < s2 && (t = ninstr(s, s1, t0, t1));
2256 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2257 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2259 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2260 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2268 n = scan + NODE_SZ_STR(scan);
2270 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2277 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2281 /* REx optimizer. Converts nodes into quickier variants "in place".
2282 Finds fixed substrings. */
2284 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2285 to the position after last scanned or to NULL. */
2287 #define INIT_AND_WITHP \
2288 assert(!and_withp); \
2289 Newx(and_withp,1,struct regnode_charclass_class); \
2290 SAVEFREEPV(and_withp)
2292 /* this is a chain of data about sub patterns we are processing that
2293 need to be handled seperately/specially in study_chunk. Its so
2294 we can simulate recursion without losing state. */
2296 typedef struct scan_frame {
2297 regnode *last; /* last node to process in this frame */
2298 regnode *next; /* next node to process when last is reached */
2299 struct scan_frame *prev; /*previous frame*/
2300 I32 stop; /* what stopparen do we use */
2304 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2305 I32 *minlenp, I32 *deltap,
2310 struct regnode_charclass_class *and_withp,
2311 U32 flags, U32 depth)
2312 /* scanp: Start here (read-write). */
2313 /* deltap: Write maxlen-minlen here. */
2314 /* last: Stop before this one. */
2315 /* data: string data about the pattern */
2316 /* stopparen: treat close N as END */
2317 /* recursed: which subroutines have we recursed into */
2318 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2321 I32 min = 0, pars = 0, code;
2322 regnode *scan = *scanp, *next;
2324 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2325 int is_inf_internal = 0; /* The studied chunk is infinite */
2326 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2327 scan_data_t data_fake;
2328 SV *re_trie_maxbuff = NULL;
2329 regnode *first_non_open = scan;
2330 I32 stopmin = I32_MAX;
2331 scan_frame *frame = NULL;
2333 GET_RE_DEBUG_FLAGS_DECL;
2336 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2340 while (first_non_open && OP(first_non_open) == OPEN)
2341 first_non_open=regnext(first_non_open);
2346 while ( scan && OP(scan) != END && scan < last ){
2347 /* Peephole optimizer: */
2348 DEBUG_STUDYDATA(data,depth);
2349 DEBUG_PEEP("Peep",scan,depth);
2350 JOIN_EXACT(scan,&min,0);
2352 /* Follow the next-chain of the current node and optimize
2353 away all the NOTHINGs from it. */
2354 if (OP(scan) != CURLYX) {
2355 const int max = (reg_off_by_arg[OP(scan)]
2357 /* I32 may be smaller than U16 on CRAYs! */
2358 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2359 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2363 /* Skip NOTHING and LONGJMP. */
2364 while ((n = regnext(n))
2365 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2366 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2367 && off + noff < max)
2369 if (reg_off_by_arg[OP(scan)])
2372 NEXT_OFF(scan) = off;
2377 /* The principal pseudo-switch. Cannot be a switch, since we
2378 look into several different things. */
2379 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2380 || OP(scan) == IFTHEN) {
2381 next = regnext(scan);
2383 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2385 if (OP(next) == code || code == IFTHEN) {
2386 /* NOTE - There is similar code to this block below for handling
2387 TRIE nodes on a re-study. If you change stuff here check there
2389 I32 max1 = 0, min1 = I32_MAX, num = 0;
2390 struct regnode_charclass_class accum;
2391 regnode * const startbranch=scan;
2393 if (flags & SCF_DO_SUBSTR)
2394 scan_commit(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2395 if (flags & SCF_DO_STCLASS)
2396 cl_init_zero(pRExC_state, &accum);
2398 while (OP(scan) == code) {
2399 I32 deltanext, minnext, f = 0, fake;
2400 struct regnode_charclass_class this_class;
2403 data_fake.flags = 0;
2405 data_fake.whilem_c = data->whilem_c;
2406 data_fake.last_closep = data->last_closep;
2409 data_fake.last_closep = &fake;
2410 next = regnext(scan);
2411 scan = NEXTOPER(scan);
2413 scan = NEXTOPER(scan);
2414 if (flags & SCF_DO_STCLASS) {
2415 cl_init(pRExC_state, &this_class);
2416 data_fake.start_class = &this_class;
2417 f = SCF_DO_STCLASS_AND;
2419 if (flags & SCF_WHILEM_VISITED_POS)
2420 f |= SCF_WHILEM_VISITED_POS;
2422 /* we suppose the run is continuous, last=next...*/
2423 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2425 stopparen, recursed, NULL, f,depth+1);
2428 if (max1 < minnext + deltanext)
2429 max1 = minnext + deltanext;
2430 if (deltanext == I32_MAX)
2431 is_inf = is_inf_internal = 1;
2433 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2435 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2436 if ( stopmin > minnext)
2437 stopmin = min + min1;
2438 flags &= ~SCF_DO_SUBSTR;
2440 data->flags |= SCF_SEEN_ACCEPT;
2443 if (data_fake.flags & SF_HAS_EVAL)
2444 data->flags |= SF_HAS_EVAL;
2445 data->whilem_c = data_fake.whilem_c;
2447 if (flags & SCF_DO_STCLASS)
2448 cl_or(pRExC_state, &accum, &this_class);
2450 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2452 if (flags & SCF_DO_SUBSTR) {
2453 data->pos_min += min1;
2454 data->pos_delta += max1 - min1;
2455 if (max1 != min1 || is_inf)
2456 data->longest = &(data->longest_float);
2459 delta += max1 - min1;
2460 if (flags & SCF_DO_STCLASS_OR) {
2461 cl_or(pRExC_state, data->start_class, &accum);
2463 cl_and(data->start_class, and_withp);
2464 flags &= ~SCF_DO_STCLASS;
2467 else if (flags & SCF_DO_STCLASS_AND) {
2469 cl_and(data->start_class, &accum);
2470 flags &= ~SCF_DO_STCLASS;
2473 /* Switch to OR mode: cache the old value of
2474 * data->start_class */
2476 StructCopy(data->start_class, and_withp,
2477 struct regnode_charclass_class);
2478 flags &= ~SCF_DO_STCLASS_AND;
2479 StructCopy(&accum, data->start_class,
2480 struct regnode_charclass_class);
2481 flags |= SCF_DO_STCLASS_OR;
2482 data->start_class->flags |= ANYOF_EOS;
2486 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2489 Assuming this was/is a branch we are dealing with: 'scan' now
2490 points at the item that follows the branch sequence, whatever
2491 it is. We now start at the beginning of the sequence and look
2498 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2500 If we can find such a subseqence we need to turn the first
2501 element into a trie and then add the subsequent branch exact
2502 strings to the trie.
2506 1. patterns where the whole set of branch can be converted.
2508 2. patterns where only a subset can be converted.
2510 In case 1 we can replace the whole set with a single regop
2511 for the trie. In case 2 we need to keep the start and end
2514 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2515 becomes BRANCH TRIE; BRANCH X;
2517 There is an additional case, that being where there is a
2518 common prefix, which gets split out into an EXACT like node
2519 preceding the TRIE node.
2521 If x(1..n)==tail then we can do a simple trie, if not we make
2522 a "jump" trie, such that when we match the appropriate word
2523 we "jump" to the appopriate tail node. Essentailly we turn
2524 a nested if into a case structure of sorts.
2529 if (!re_trie_maxbuff) {
2530 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2531 if (!SvIOK(re_trie_maxbuff))
2532 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2534 if ( SvIV(re_trie_maxbuff)>=0 ) {
2536 regnode *first = (regnode *)NULL;
2537 regnode *last = (regnode *)NULL;
2538 regnode *tail = scan;
2543 SV * const mysv = sv_newmortal(); /* for dumping */
2545 /* var tail is used because there may be a TAIL
2546 regop in the way. Ie, the exacts will point to the
2547 thing following the TAIL, but the last branch will
2548 point at the TAIL. So we advance tail. If we
2549 have nested (?:) we may have to move through several
2553 while ( OP( tail ) == TAIL ) {
2554 /* this is the TAIL generated by (?:) */
2555 tail = regnext( tail );
2560 regprop(RExC_rx, mysv, tail );
2561 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2562 (int)depth * 2 + 2, "",
2563 "Looking for TRIE'able sequences. Tail node is: ",
2564 SvPV_nolen_const( mysv )
2570 step through the branches, cur represents each
2571 branch, noper is the first thing to be matched
2572 as part of that branch and noper_next is the
2573 regnext() of that node. if noper is an EXACT
2574 and noper_next is the same as scan (our current
2575 position in the regex) then the EXACT branch is
2576 a possible optimization target. Once we have
2577 two or more consequetive such branches we can
2578 create a trie of the EXACT's contents and stich
2579 it in place. If the sequence represents all of
2580 the branches we eliminate the whole thing and
2581 replace it with a single TRIE. If it is a
2582 subsequence then we need to stitch it in. This
2583 means the first branch has to remain, and needs
2584 to be repointed at the item on the branch chain
2585 following the last branch optimized. This could
2586 be either a BRANCH, in which case the
2587 subsequence is internal, or it could be the
2588 item following the branch sequence in which
2589 case the subsequence is at the end.
2593 /* dont use tail as the end marker for this traverse */
2594 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2595 regnode * const noper = NEXTOPER( cur );
2596 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
2597 regnode * const noper_next = regnext( noper );
2601 regprop(RExC_rx, mysv, cur);
2602 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2603 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2605 regprop(RExC_rx, mysv, noper);
2606 PerlIO_printf( Perl_debug_log, " -> %s",
2607 SvPV_nolen_const(mysv));
2610 regprop(RExC_rx, mysv, noper_next );
2611 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2612 SvPV_nolen_const(mysv));
2614 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2615 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2617 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2618 : PL_regkind[ OP( noper ) ] == EXACT )
2619 || OP(noper) == NOTHING )
2621 && noper_next == tail
2626 if ( !first || optype == NOTHING ) {
2627 if (!first) first = cur;
2628 optype = OP( noper );
2634 make_trie( pRExC_state,
2635 startbranch, first, cur, tail, count,
2638 if ( PL_regkind[ OP( noper ) ] == EXACT
2640 && noper_next == tail
2645 optype = OP( noper );
2655 regprop(RExC_rx, mysv, cur);
2656 PerlIO_printf( Perl_debug_log,
2657 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2658 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2662 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
2663 #ifdef TRIE_STUDY_OPT
2664 if ( ((made == MADE_EXACT_TRIE &&
2665 startbranch == first)
2666 || ( first_non_open == first )) &&
2668 flags |= SCF_TRIE_RESTUDY;
2669 if ( startbranch == first
2672 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2682 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2683 scan = NEXTOPER(NEXTOPER(scan));
2684 } else /* single branch is optimized. */
2685 scan = NEXTOPER(scan);
2687 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2688 scan_frame *newframe = NULL;
2693 if (OP(scan) != SUSPEND) {
2694 /* set the pointer */
2695 if (OP(scan) == GOSUB) {
2697 RExC_recurse[ARG2L(scan)] = scan;
2698 start = RExC_open_parens[paren-1];
2699 end = RExC_close_parens[paren-1];
2702 start = RExC_rx->program + 1;
2706 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2707 SAVEFREEPV(recursed);
2709 if (!PAREN_TEST(recursed,paren+1)) {
2710 PAREN_SET(recursed,paren+1);
2711 Newx(newframe,1,scan_frame);
2713 if (flags & SCF_DO_SUBSTR) {
2714 scan_commit(pRExC_state,data,minlenp);
2715 data->longest = &(data->longest_float);
2717 is_inf = is_inf_internal = 1;
2718 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2719 cl_anything(pRExC_state, data->start_class);
2720 flags &= ~SCF_DO_STCLASS;
2723 Newx(newframe,1,scan_frame);
2726 end = regnext(scan);
2731 SAVEFREEPV(newframe);
2732 newframe->next = regnext(scan);
2733 newframe->last = last;
2734 newframe->stop = stopparen;
2735 newframe->prev = frame;
2745 else if (OP(scan) == EXACT) {
2746 I32 l = STR_LEN(scan);
2749 const U8 * const s = (U8*)STRING(scan);
2750 l = utf8_length(s, s + l);
2751 uc = utf8_to_uvchr(s, NULL);
2753 uc = *((U8*)STRING(scan));
2756 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2757 /* The code below prefers earlier match for fixed
2758 offset, later match for variable offset. */
2759 if (data->last_end == -1) { /* Update the start info. */
2760 data->last_start_min = data->pos_min;
2761 data->last_start_max = is_inf
2762 ? I32_MAX : data->pos_min + data->pos_delta;
2764 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2766 SvUTF8_on(data->last_found);
2768 SV * const sv = data->last_found;
2769 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2770 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2771 if (mg && mg->mg_len >= 0)
2772 mg->mg_len += utf8_length((U8*)STRING(scan),
2773 (U8*)STRING(scan)+STR_LEN(scan));
2775 data->last_end = data->pos_min + l;
2776 data->pos_min += l; /* As in the first entry. */
2777 data->flags &= ~SF_BEFORE_EOL;
2779 if (flags & SCF_DO_STCLASS_AND) {
2780 /* Check whether it is compatible with what we know already! */
2784 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2785 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2786 && (!(data->start_class->flags & ANYOF_FOLD)
2787 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2790 ANYOF_CLASS_ZERO(data->start_class);
2791 ANYOF_BITMAP_ZERO(data->start_class);
2793 ANYOF_BITMAP_SET(data->start_class, uc);
2794 data->start_class->flags &= ~ANYOF_EOS;
2796 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2798 else if (flags & SCF_DO_STCLASS_OR) {
2799 /* false positive possible if the class is case-folded */
2801 ANYOF_BITMAP_SET(data->start_class, uc);
2803 data->start_class->flags |= ANYOF_UNICODE_ALL;
2804 data->start_class->flags &= ~ANYOF_EOS;
2805 cl_and(data->start_class, and_withp);
2807 flags &= ~SCF_DO_STCLASS;
2809 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
2810 I32 l = STR_LEN(scan);
2811 UV uc = *((U8*)STRING(scan));
2813 /* Search for fixed substrings supports EXACT only. */
2814 if (flags & SCF_DO_SUBSTR) {
2816 scan_commit(pRExC_state, data, minlenp);
2819 const U8 * const s = (U8 *)STRING(scan);
2820 l = utf8_length(s, s + l);
2821 uc = utf8_to_uvchr(s, NULL);
2824 if (flags & SCF_DO_SUBSTR)
2826 if (flags & SCF_DO_STCLASS_AND) {
2827 /* Check whether it is compatible with what we know already! */
2831 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2832 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2833 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2835 ANYOF_CLASS_ZERO(data->start_class);
2836 ANYOF_BITMAP_ZERO(data->start_class);
2838 ANYOF_BITMAP_SET(data->start_class, uc);
2839 data->start_class->flags &= ~ANYOF_EOS;
2840 data->start_class->flags |= ANYOF_FOLD;
2841 if (OP(scan) == EXACTFL)
2842 data->start_class->flags |= ANYOF_LOCALE;
2845 else if (flags & SCF_DO_STCLASS_OR) {
2846 if (data->start_class->flags & ANYOF_FOLD) {
2847 /* false positive possible if the class is case-folded.
2848 Assume that the locale settings are the same... */
2850 ANYOF_BITMAP_SET(data->start_class, uc);
2851 data->start_class->flags &= ~ANYOF_EOS;
2853 cl_and(data->start_class, and_withp);
2855 flags &= ~SCF_DO_STCLASS;
2857 else if (strchr((const char*)PL_varies,OP(scan))) {
2858 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2859 I32 f = flags, pos_before = 0;
2860 regnode * const oscan = scan;
2861 struct regnode_charclass_class this_class;
2862 struct regnode_charclass_class *oclass = NULL;
2863 I32 next_is_eval = 0;
2865 switch (PL_regkind[OP(scan)]) {
2866 case WHILEM: /* End of (?:...)* . */
2867 scan = NEXTOPER(scan);
2870 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2871 next = NEXTOPER(scan);
2872 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2874 maxcount = REG_INFTY;
2875 next = regnext(scan);
2876 scan = NEXTOPER(scan);
2880 if (flags & SCF_DO_SUBSTR)
2885 if (flags & SCF_DO_STCLASS) {
2887 maxcount = REG_INFTY;
2888 next = regnext(scan);
2889 scan = NEXTOPER(scan);
2892 is_inf = is_inf_internal = 1;
2893 scan = regnext(scan);
2894 if (flags & SCF_DO_SUBSTR) {
2895 scan_commit(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
2896 data->longest = &(data->longest_float);
2898 goto optimize_curly_tail;
2900 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
2901 && (scan->flags == stopparen))
2906 mincount = ARG1(scan);
2907 maxcount = ARG2(scan);
2909 next = regnext(scan);
2910 if (OP(scan) == CURLYX) {
2911 I32 lp = (data ? *(data->last_closep) : 0);
2912 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
2914 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2915 next_is_eval = (OP(scan) == EVAL);
2917 if (flags & SCF_DO_SUBSTR) {
2918 if (mincount == 0) scan_commit(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
2919 pos_before = data->pos_min;
2923 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2925 data->flags |= SF_IS_INF;
2927 if (flags & SCF_DO_STCLASS) {
2928 cl_init(pRExC_state, &this_class);
2929 oclass = data->start_class;
2930 data->start_class = &this_class;
2931 f |= SCF_DO_STCLASS_AND;
2932 f &= ~SCF_DO_STCLASS_OR;
2934 /* These are the cases when once a subexpression
2935 fails at a particular position, it cannot succeed
2936 even after backtracking at the enclosing scope.
2938 XXXX what if minimal match and we are at the
2939 initial run of {n,m}? */
2940 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2941 f &= ~SCF_WHILEM_VISITED_POS;
2943 /* This will finish on WHILEM, setting scan, or on NULL: */
2944 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2945 last, data, stopparen, recursed, NULL,
2947 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
2949 if (flags & SCF_DO_STCLASS)
2950 data->start_class = oclass;
2951 if (mincount == 0 || minnext == 0) {
2952 if (flags & SCF_DO_STCLASS_OR) {
2953 cl_or(pRExC_state, data->start_class, &this_class);
2955 else if (flags & SCF_DO_STCLASS_AND) {
2956 /* Switch to OR mode: cache the old value of
2957 * data->start_class */
2959 StructCopy(data->start_class, and_withp,
2960 struct regnode_charclass_class);
2961 flags &= ~SCF_DO_STCLASS_AND;
2962 StructCopy(&this_class, data->start_class,
2963 struct regnode_charclass_class);
2964 flags |= SCF_DO_STCLASS_OR;
2965 data->start_class->flags |= ANYOF_EOS;
2967 } else { /* Non-zero len */
2968 if (flags & SCF_DO_STCLASS_OR) {
2969 cl_or(pRExC_state, data->start_class, &this_class);
2970 cl_and(data->start_class, and_withp);
2972 else if (flags & SCF_DO_STCLASS_AND)
2973 cl_and(data->start_class, &this_class);
2974 flags &= ~SCF_DO_STCLASS;
2976 if (!scan) /* It was not CURLYX, but CURLY. */
2978 if ( /* ? quantifier ok, except for (?{ ... }) */
2979 (next_is_eval || !(mincount == 0 && maxcount == 1))
2980 && (minnext == 0) && (deltanext == 0)
2981 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
2982 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2983 && ckWARN(WARN_REGEXP))
2986 "Quantifier unexpected on zero-length expression");
2989 min += minnext * mincount;
2990 is_inf_internal |= ((maxcount == REG_INFTY
2991 && (minnext + deltanext) > 0)
2992 || deltanext == I32_MAX);
2993 is_inf |= is_inf_internal;
2994 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2996 /* Try powerful optimization CURLYX => CURLYN. */
2997 if ( OP(oscan) == CURLYX && data
2998 && data->flags & SF_IN_PAR
2999 && !(data->flags & SF_HAS_EVAL)
3000 && !deltanext && minnext == 1 ) {
3001 /* Try to optimize to CURLYN. */
3002 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3003 regnode * const nxt1 = nxt;
3010 if (!strchr((const char*)PL_simple,OP(nxt))
3011 && !(PL_regkind[OP(nxt)] == EXACT
3012 && STR_LEN(nxt) == 1))
3018 if (OP(nxt) != CLOSE)
3020 if (RExC_open_parens) {
3021 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3022 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3024 /* Now we know that nxt2 is the only contents: */
3025 oscan->flags = (U8)ARG(nxt);
3027 OP(nxt1) = NOTHING; /* was OPEN. */
3030 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3031 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
3032 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
3033 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3034 OP(nxt + 1) = OPTIMIZED; /* was count. */
3035 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
3040 /* Try optimization CURLYX => CURLYM. */
3041 if ( OP(oscan) == CURLYX && data
3042 && !(data->flags & SF_HAS_PAR)
3043 && !(data->flags & SF_HAS_EVAL)
3044 && !deltanext /* atom is fixed width */
3045 && minnext != 0 /* CURLYM can't handle zero width */
3047 /* XXXX How to optimize if data == 0? */
3048 /* Optimize to a simpler form. */
3049 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3053 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3054 && (OP(nxt2) != WHILEM))
3056 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3057 /* Need to optimize away parenths. */
3058 if (data->flags & SF_IN_PAR) {
3059 /* Set the parenth number. */
3060 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3062 if (OP(nxt) != CLOSE)
3063 FAIL("Panic opt close");
3064 oscan->flags = (U8)ARG(nxt);
3065 if (RExC_open_parens) {
3066 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3067 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3069 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3070 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3073 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3074 OP(nxt + 1) = OPTIMIZED; /* was count. */
3075 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3076 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
3079 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3080 regnode *nnxt = regnext(nxt1);
3083 if (reg_off_by_arg[OP(nxt1)])
3084 ARG_SET(nxt1, nxt2 - nxt1);
3085 else if (nxt2 - nxt1 < U16_MAX)
3086 NEXT_OFF(nxt1) = nxt2 - nxt1;
3088 OP(nxt) = NOTHING; /* Cannot beautify */
3093 /* Optimize again: */
3094 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3095 NULL, stopparen, recursed, NULL, 0,depth+1);
3100 else if ((OP(oscan) == CURLYX)
3101 && (flags & SCF_WHILEM_VISITED_POS)
3102 /* See the comment on a similar expression above.
3103 However, this time it not a subexpression
3104 we care about, but the expression itself. */
3105 && (maxcount == REG_INFTY)
3106 && data && ++data->whilem_c < 16) {
3107 /* This stays as CURLYX, we can put the count/of pair. */
3108 /* Find WHILEM (as in regexec.c) */
3109 regnode *nxt = oscan + NEXT_OFF(oscan);
3111 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3113 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3114 | (RExC_whilem_seen << 4)); /* On WHILEM */
3116 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3118 if (flags & SCF_DO_SUBSTR) {
3119 SV *last_str = NULL;
3120 int counted = mincount != 0;
3122 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3123 #if defined(SPARC64_GCC_WORKAROUND)
3126 const char *s = NULL;
3129 if (pos_before >= data->last_start_min)
3132 b = data->last_start_min;
3135 s = SvPV_const(data->last_found, l);
3136 old = b - data->last_start_min;
3139 I32 b = pos_before >= data->last_start_min
3140 ? pos_before : data->last_start_min;
3142 const char * const s = SvPV_const(data->last_found, l);
3143 I32 old = b - data->last_start_min;
3147 old = utf8_hop((U8*)s, old) - (U8*)s;
3150 /* Get the added string: */
3151 last_str = newSVpvn(s + old, l);
3153 SvUTF8_on(last_str);
3154 if (deltanext == 0 && pos_before == b) {
3155 /* What was added is a constant string */
3157 SvGROW(last_str, (mincount * l) + 1);
3158 repeatcpy(SvPVX(last_str) + l,
3159 SvPVX_const(last_str), l, mincount - 1);
3160 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3161 /* Add additional parts. */
3162 SvCUR_set(data->last_found,
3163 SvCUR(data->last_found) - l);
3164 sv_catsv(data->last_found, last_str);
3166 SV * sv = data->last_found;
3168 SvUTF8(sv) && SvMAGICAL(sv) ?
3169 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3170 if (mg && mg->mg_len >= 0)
3171 mg->mg_len += CHR_SVLEN(last_str);
3173 data->last_end += l * (mincount - 1);
3176 /* start offset must point into the last copy */
3177 data->last_start_min += minnext * (mincount - 1);
3178 data->last_start_max += is_inf ? I32_MAX
3179 : (maxcount - 1) * (minnext + data->pos_delta);
3182 /* It is counted once already... */
3183 data->pos_min += minnext * (mincount - counted);
3184 data->pos_delta += - counted * deltanext +
3185 (minnext + deltanext) * maxcount - minnext * mincount;
3186 if (mincount != maxcount) {
3187 /* Cannot extend fixed substrings found inside
3189 scan_commit(pRExC_state,data,minlenp);
3190 if (mincount && last_str) {
3191 SV * const sv = data->last_found;
3192 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3193 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3197 sv_setsv(sv, last_str);
3198 data->last_end = data->pos_min;
3199 data->last_start_min =
3200 data->pos_min - CHR_SVLEN(last_str);
3201 data->last_start_max = is_inf
3203 : data->pos_min + data->pos_delta
3204 - CHR_SVLEN(last_str);
3206 data->longest = &(data->longest_float);
3208 SvREFCNT_dec(last_str);
3210 if (data && (fl & SF_HAS_EVAL))
3211 data->flags |= SF_HAS_EVAL;
3212 optimize_curly_tail:
3213 if (OP(oscan) != CURLYX) {
3214 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3216 NEXT_OFF(oscan) += NEXT_OFF(next);
3219 default: /* REF and CLUMP only? */
3220 if (flags & SCF_DO_SUBSTR) {
3221 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
3222 data->longest = &(data->longest_float);
3224 is_inf = is_inf_internal = 1;
3225 if (flags & SCF_DO_STCLASS_OR)
3226 cl_anything(pRExC_state, data->start_class);
3227 flags &= ~SCF_DO_STCLASS;
3231 else if (strchr((const char*)PL_simple,OP(scan))) {
3234 if (flags & SCF_DO_SUBSTR) {
3235 scan_commit(pRExC_state,data,minlenp);
3239 if (flags & SCF_DO_STCLASS) {
3240 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3242 /* Some of the logic below assumes that switching
3243 locale on will only add false positives. */
3244 switch (PL_regkind[OP(scan)]) {
3248 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3249 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3250 cl_anything(pRExC_state, data->start_class);
3253 if (OP(scan) == SANY)
3255 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3256 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3257 || (data->start_class->flags & ANYOF_CLASS));
3258 cl_anything(pRExC_state, data->start_class);
3260 if (flags & SCF_DO_STCLASS_AND || !value)
3261 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3264 if (flags & SCF_DO_STCLASS_AND)
3265 cl_and(data->start_class,
3266 (struct regnode_charclass_class*)scan);
3268 cl_or(pRExC_state, data->start_class,
3269 (struct regnode_charclass_class*)scan);
3272 if (flags & SCF_DO_STCLASS_AND) {
3273 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3274 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3275 for (value = 0; value < 256; value++)
3276 if (!isALNUM(value))
3277 ANYOF_BITMAP_CLEAR(data->start_class, value);
3281 if (data->start_class->flags & ANYOF_LOCALE)
3282 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3284 for (value = 0; value < 256; value++)
3286 ANYOF_BITMAP_SET(data->start_class, value);
3291 if (flags & SCF_DO_STCLASS_AND) {
3292 if (data->start_class->flags & ANYOF_LOCALE)
3293 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3296 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3297 data->start_class->flags |= ANYOF_LOCALE;
3301 if (flags & SCF_DO_STCLASS_AND) {
3302 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3303 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3304 for (value = 0; value < 256; value++)
3306 ANYOF_BITMAP_CLEAR(data->start_class, value);
3310 if (data->start_class->flags & ANYOF_LOCALE)
3311 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3313 for (value = 0; value < 256; value++)
3314 if (!isALNUM(value))
3315 ANYOF_BITMAP_SET(data->start_class, value);
3320 if (flags & SCF_DO_STCLASS_AND) {
3321 if (data->start_class->flags & ANYOF_LOCALE)
3322 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3325 data->start_class->flags |= ANYOF_LOCALE;
3326 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3330 if (flags & SCF_DO_STCLASS_AND) {
3331 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3332 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3333 for (value = 0; value < 256; value++)
3334 if (!isSPACE(value))
3335 ANYOF_BITMAP_CLEAR(data->start_class, value);
3339 if (data->start_class->flags & ANYOF_LOCALE)
3340 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3342 for (value = 0; value < 256; value++)
3344 ANYOF_BITMAP_SET(data->start_class, value);
3349 if (flags & SCF_DO_STCLASS_AND) {
3350 if (data->start_class->flags & ANYOF_LOCALE)
3351 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3354 data->start_class->flags |= ANYOF_LOCALE;
3355 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3359 if (flags & SCF_DO_STCLASS_AND) {
3360 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3361 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3362 for (value = 0; value < 256; value++)
3364 ANYOF_BITMAP_CLEAR(data->start_class, value);
3368 if (data->start_class->flags & ANYOF_LOCALE)
3369 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3371 for (value = 0; value < 256; value++)
3372 if (!isSPACE(value))
3373 ANYOF_BITMAP_SET(data->start_class, value);
3378 if (flags & SCF_DO_STCLASS_AND) {
3379 if (data->start_class->flags & ANYOF_LOCALE) {
3380 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3381 for (value = 0; value < 256; value++)
3382 if (!isSPACE(value))
3383 ANYOF_BITMAP_CLEAR(data->start_class, value);
3387 data->start_class->flags |= ANYOF_LOCALE;
3388 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3392 if (flags & SCF_DO_STCLASS_AND) {
3393 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3394 for (value = 0; value < 256; value++)
3395 if (!isDIGIT(value))
3396 ANYOF_BITMAP_CLEAR(data->start_class, value);
3399 if (data->start_class->flags & ANYOF_LOCALE)
3400 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3402 for (value = 0; value < 256; value++)
3404 ANYOF_BITMAP_SET(data->start_class, value);
3409 if (flags & SCF_DO_STCLASS_AND) {
3410 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3411 for (value = 0; value < 256; value++)
3413 ANYOF_BITMAP_CLEAR(data->start_class, value);
3416 if (data->start_class->flags & ANYOF_LOCALE)
3417 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3419 for (value = 0; value < 256; value++)
3420 if (!isDIGIT(value))
3421 ANYOF_BITMAP_SET(data->start_class, value);
3426 if (flags & SCF_DO_STCLASS_OR)
3427 cl_and(data->start_class, and_withp);
3428 flags &= ~SCF_DO_STCLASS;
3431 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3432 data->flags |= (OP(scan) == MEOL
3436 else if ( PL_regkind[OP(scan)] == BRANCHJ
3437 /* Lookbehind, or need to calculate parens/evals/stclass: */
3438 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3439 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3440 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3441 || OP(scan) == UNLESSM )
3443 /* Negative Lookahead/lookbehind
3444 In this case we can't do fixed string optimisation.
3447 I32 deltanext, minnext, fake = 0;
3449 struct regnode_charclass_class intrnl;
3452 data_fake.flags = 0;
3454 data_fake.whilem_c = data->whilem_c;
3455 data_fake.last_closep = data->last_closep;
3458 data_fake.last_closep = &fake;
3459 if ( flags & SCF_DO_STCLASS && !scan->flags
3460 && OP(scan) == IFMATCH ) { /* Lookahead */
3461 cl_init(pRExC_state, &intrnl);
3462 data_fake.start_class = &intrnl;
3463 f |= SCF_DO_STCLASS_AND;
3465 if (flags & SCF_WHILEM_VISITED_POS)
3466 f |= SCF_WHILEM_VISITED_POS;
3467 next = regnext(scan);
3468 nscan = NEXTOPER(NEXTOPER(scan));
3469 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3470 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3473 vFAIL("Variable length lookbehind not implemented");
3475 else if (minnext > (I32)U8_MAX) {
3476 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3478 scan->flags = (U8)minnext;
3481 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3483 if (data_fake.flags & SF_HAS_EVAL)
3484 data->flags |= SF_HAS_EVAL;
3485 data->whilem_c = data_fake.whilem_c;
3487 if (f & SCF_DO_STCLASS_AND) {
3488 const int was = (data->start_class->flags & ANYOF_EOS);
3490 cl_and(data->start_class, &intrnl);
3492 data->start_class->flags |= ANYOF_EOS;
3495 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3497 /* Positive Lookahead/lookbehind
3498 In this case we can do fixed string optimisation,
3499 but we must be careful about it. Note in the case of
3500 lookbehind the positions will be offset by the minimum
3501 length of the pattern, something we won't know about
3502 until after the recurse.
3504 I32 deltanext, fake = 0;
3506 struct regnode_charclass_class intrnl;
3508 /* We use SAVEFREEPV so that when the full compile
3509 is finished perl will clean up the allocated
3510 minlens when its all done. This was we don't
3511 have to worry about freeing them when we know
3512 they wont be used, which would be a pain.
3515 Newx( minnextp, 1, I32 );
3516 SAVEFREEPV(minnextp);
3519 StructCopy(data, &data_fake, scan_data_t);
3520 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3523 scan_commit(pRExC_state, &data_fake,minlenp);
3524 data_fake.last_found=newSVsv(data->last_found);
3528 data_fake.last_closep = &fake;
3529 data_fake.flags = 0;
3531 data_fake.flags |= SF_IS_INF;
3532 if ( flags & SCF_DO_STCLASS && !scan->flags
3533 && OP(scan) == IFMATCH ) { /* Lookahead */
3534 cl_init(pRExC_state, &intrnl);
3535 data_fake.start_class = &intrnl;
3536 f |= SCF_DO_STCLASS_AND;
3538 if (flags & SCF_WHILEM_VISITED_POS)
3539 f |= SCF_WHILEM_VISITED_POS;
3540 next = regnext(scan);
3541 nscan = NEXTOPER(NEXTOPER(scan));
3543 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3544 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3547 vFAIL("Variable length lookbehind not implemented");
3549 else if (*minnextp > (I32)U8_MAX) {
3550 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3552 scan->flags = (U8)*minnextp;
3557 if (f & SCF_DO_STCLASS_AND) {
3558 const int was = (data->start_class->flags & ANYOF_EOS);
3560 cl_and(data->start_class, &intrnl);
3562 data->start_class->flags |= ANYOF_EOS;
3565 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3567 if (data_fake.flags & SF_HAS_EVAL)
3568 data->flags |= SF_HAS_EVAL;
3569 data->whilem_c = data_fake.whilem_c;
3570 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3571 if (RExC_rx->minlen<*minnextp)
3572 RExC_rx->minlen=*minnextp;
3573 scan_commit(pRExC_state, &data_fake, minnextp);
3574 SvREFCNT_dec(data_fake.last_found);
3576 if ( data_fake.minlen_fixed != minlenp )
3578 data->offset_fixed= data_fake.offset_fixed;
3579 data->minlen_fixed= data_fake.minlen_fixed;
3580 data->lookbehind_fixed+= scan->flags;
3582 if ( data_fake.minlen_float != minlenp )
3584 data->minlen_float= data_fake.minlen_float;
3585 data->offset_float_min=data_fake.offset_float_min;
3586 data->offset_float_max=data_fake.offset_float_max;
3587 data->lookbehind_float+= scan->flags;
3596 else if (OP(scan) == OPEN) {
3597 if (stopparen != (I32)ARG(scan))
3600 else if (OP(scan) == CLOSE) {
3601 if (stopparen == (I32)ARG(scan)) {
3604 if ((I32)ARG(scan) == is_par) {
3605 next = regnext(scan);
3607 if ( next && (OP(next) != WHILEM) && next < last)
3608 is_par = 0; /* Disable optimization */
3611 *(data->last_closep) = ARG(scan);
3613 else if (OP(scan) == EVAL) {
3615 data->flags |= SF_HAS_EVAL;
3617 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3618 if (flags & SCF_DO_SUBSTR) {
3619 scan_commit(pRExC_state,data,minlenp);
3620 flags &= ~SCF_DO_SUBSTR;
3622 if (data && OP(scan)==ACCEPT) {
3623 data->flags |= SCF_SEEN_ACCEPT;
3628 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3630 if (flags & SCF_DO_SUBSTR) {
3631 scan_commit(pRExC_state,data,minlenp);
3632 data->longest = &(data->longest_float);
3634 is_inf = is_inf_internal = 1;
3635 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3636 cl_anything(pRExC_state, data->start_class);
3637 flags &= ~SCF_DO_STCLASS;
3639 #ifdef TRIE_STUDY_OPT
3640 #ifdef FULL_TRIE_STUDY
3641 else if (PL_regkind[OP(scan)] == TRIE) {
3642 /* NOTE - There is similar code to this block above for handling
3643 BRANCH nodes on the initial study. If you change stuff here
3645 regnode *trie_node= scan;
3646 regnode *tail= regnext(scan);
3647 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3648 I32 max1 = 0, min1 = I32_MAX;
3649 struct regnode_charclass_class accum;
3651 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
3652 scan_commit(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
3653 if (flags & SCF_DO_STCLASS)
3654 cl_init_zero(pRExC_state, &accum);
3660 const regnode *nextbranch= NULL;
3663 for ( word=1 ; word <= trie->wordcount ; word++)
3665 I32 deltanext=0, minnext=0, f = 0, fake;
3666 struct regnode_charclass_class this_class;
3668 data_fake.flags = 0;
3670 data_fake.whilem_c = data->whilem_c;
3671 data_fake.last_closep = data->last_closep;
3674 data_fake.last_closep = &fake;
3676 if (flags & SCF_DO_STCLASS) {
3677 cl_init(pRExC_state, &this_class);
3678 data_fake.start_class = &this_class;
3679 f = SCF_DO_STCLASS_AND;
3681 if (flags & SCF_WHILEM_VISITED_POS)
3682 f |= SCF_WHILEM_VISITED_POS;
3684 if (trie->jump[word]) {
3686 nextbranch = trie_node + trie->jump[0];
3687 scan= trie_node + trie->jump[word];
3688 /* We go from the jump point to the branch that follows
3689 it. Note this means we need the vestigal unused branches
3690 even though they arent otherwise used.
3692 minnext = study_chunk(pRExC_state, &scan, minlenp,
3693 &deltanext, (regnode *)nextbranch, &data_fake,
3694 stopparen, recursed, NULL, f,depth+1);
3696 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3697 nextbranch= regnext((regnode*)nextbranch);
3699 if (min1 > (I32)(minnext + trie->minlen))
3700 min1 = minnext + trie->minlen;
3701 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3702 max1 = minnext + deltanext + trie->maxlen;
3703 if (deltanext == I32_MAX)
3704 is_inf = is_inf_internal = 1;
3706 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3708 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3709 if ( stopmin > min + min1)
3710 stopmin = min + min1;
3711 flags &= ~SCF_DO_SUBSTR;
3713 data->flags |= SCF_SEEN_ACCEPT;
3716 if (data_fake.flags & SF_HAS_EVAL)
3717 data->flags |= SF_HAS_EVAL;
3718 data->whilem_c = data_fake.whilem_c;
3720 if (flags & SCF_DO_STCLASS)
3721 cl_or(pRExC_state, &accum, &this_class);
3724 if (flags & SCF_DO_SUBSTR) {
3725 data->pos_min += min1;
3726 data->pos_delta += max1 - min1;
3727 if (max1 != min1 || is_inf)
3728 data->longest = &(data->longest_float);
3731 delta += max1 - min1;
3732 if (flags & SCF_DO_STCLASS_OR) {
3733 cl_or(pRExC_state, data->start_class, &accum);
3735 cl_and(data->start_class, and_withp);
3736 flags &= ~SCF_DO_STCLASS;
3739 else if (flags & SCF_DO_STCLASS_AND) {
3741 cl_and(data->start_class, &accum);
3742 flags &= ~SCF_DO_STCLASS;
3745 /* Switch to OR mode: cache the old value of
3746 * data->start_class */
3748 StructCopy(data->start_class, and_withp,
3749 struct regnode_charclass_class);
3750 flags &= ~SCF_DO_STCLASS_AND;
3751 StructCopy(&accum, data->start_class,
3752 struct regnode_charclass_class);
3753 flags |= SCF_DO_STCLASS_OR;
3754 data->start_class->flags |= ANYOF_EOS;
3761 else if (PL_regkind[OP(scan)] == TRIE) {
3762 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3765 min += trie->minlen;
3766 delta += (trie->maxlen - trie->minlen);
3767 flags &= ~SCF_DO_STCLASS; /* xxx */
3768 if (flags & SCF_DO_SUBSTR) {
3769 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
3770 data->pos_min += trie->minlen;
3771 data->pos_delta += (trie->maxlen - trie->minlen);
3772 if (trie->maxlen != trie->minlen)
3773 data->longest = &(data->longest_float);
3775 if (trie->jump) /* no more substrings -- for now /grr*/
3776 flags &= ~SCF_DO_SUBSTR;
3778 #endif /* old or new */
3779 #endif /* TRIE_STUDY_OPT */
3780 /* Else: zero-length, ignore. */
3781 scan = regnext(scan);
3786 stopparen = frame->stop;
3787 frame = frame->prev;
3788 goto fake_study_recurse;
3795 *deltap = is_inf_internal ? I32_MAX : delta;
3796 if (flags & SCF_DO_SUBSTR && is_inf)
3797 data->pos_delta = I32_MAX - data->pos_min;
3798 if (is_par > (I32)U8_MAX)
3800 if (is_par && pars==1 && data) {
3801 data->flags |= SF_IN_PAR;
3802 data->flags &= ~SF_HAS_PAR;
3804 else if (pars && data) {
3805 data->flags |= SF_HAS_PAR;
3806 data->flags &= ~SF_IN_PAR;
3808 if (flags & SCF_DO_STCLASS_OR)
3809 cl_and(data->start_class, and_withp);
3810 if (flags & SCF_TRIE_RESTUDY)
3811 data->flags |= SCF_TRIE_RESTUDY;
3813 DEBUG_STUDYDATA(data,depth);
3815 return min < stopmin ? min : stopmin;
3819 S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
3821 if (RExC_rx->data) {
3822 const U32 count = RExC_rx->data->count;
3823 Renewc(RExC_rx->data,
3824 sizeof(*RExC_rx->data) + sizeof(void*) * (count + n - 1),
3825 char, struct reg_data);
3826 Renew(RExC_rx->data->what, count + n, U8);
3827 RExC_rx->data->count += n;
3830 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
3831 char, struct reg_data);
3832 Newx(RExC_rx->data->what, n, U8);
3833 RExC_rx->data->count = n;
3835 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
3836 return RExC_rx->data->count - n;
3839 #ifndef PERL_IN_XSUB_RE
3841 Perl_reginitcolors(pTHX)
3844 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
3846 char *t = savepv(s);
3850 t = strchr(t, '\t');
3856 PL_colors[i] = t = (char *)"";
3861 PL_colors[i++] = (char *)"";
3868 #ifdef TRIE_STUDY_OPT
3869 #define CHECK_RESTUDY_GOTO \
3871 (data.flags & SCF_TRIE_RESTUDY) \
3875 #define CHECK_RESTUDY_GOTO
3879 - pregcomp - compile a regular expression into internal code
3881 * We can't allocate space until we know how big the compiled form will be,
3882 * but we can't compile it (and thus know how big it is) until we've got a
3883 * place to put the code. So we cheat: we compile it twice, once with code
3884 * generation turned off and size counting turned on, and once "for real".
3885 * This also means that we don't allocate space until we are sure that the
3886 * thing really will compile successfully, and we never have to move the
3887 * code and thus invalidate pointers into it. (Note that it has to be in
3888 * one piece because free() must be able to free it all.) [NB: not true in perl]
3890 * Beware that the optimization-preparation code in here knows about some
3891 * of the structure of the compiled regexp. [I'll say.]
3896 #ifndef PERL_IN_XSUB_RE
3897 #define RE_ENGINE_PTR &PL_core_reg_engine
3899 extern const struct regexp_engine my_reg_engine;
3900 #define RE_ENGINE_PTR &my_reg_engine
3902 /* these make a few things look better, to avoid indentation */
3903 #define BEGIN_BLOCK {
3907 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
3910 GET_RE_DEBUG_FLAGS_DECL;
3911 DEBUG_r(if (!PL_colorset) reginitcolors());
3912 #ifndef PERL_IN_XSUB_RE
3914 /* Dispatch a request to compile a regexp to correct
3916 HV * const table = GvHV(PL_hintgv);
3918 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
3919 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
3920 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
3922 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
3925 return CALLREGCOMP_ENG(eng, exp, xend, pm);
3939 RExC_state_t RExC_state;
3940 RExC_state_t * const pRExC_state = &RExC_state;
3941 #ifdef TRIE_STUDY_OPT
3943 RExC_state_t copyRExC_state;
3946 FAIL("NULL regexp argument");
3948 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
3952 SV *dsv= sv_newmortal();
3953 RE_PV_QUOTED_DECL(s, RExC_utf8,
3954 dsv, RExC_precomp, (xend - exp), 60);
3955 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
3956 PL_colors[4],PL_colors[5],s);
3958 RExC_flags = pm->op_pmflags;
3962 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
3963 RExC_seen_evals = 0;
3966 /* First pass: determine size, legality. */
3975 RExC_emit = &PL_regdummy;
3976 RExC_whilem_seen = 0;
3977 RExC_charnames = NULL;
3978 RExC_open_parens = NULL;
3979 RExC_close_parens = NULL;
3981 RExC_paren_names = NULL;
3982 RExC_recurse = NULL;
3983 RExC_recurse_count = 0;
3985 #if 0 /* REGC() is (currently) a NOP at the first pass.
3986 * Clever compilers notice this and complain. --jhi */
3987 REGC((U8)REG_MAGIC, (char*)RExC_emit);
3989 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
3990 if (reg(pRExC_state, 0, &flags,1) == NULL) {
3991 RExC_precomp = NULL;
3995 PerlIO_printf(Perl_debug_log,
3996 "Required size %"IVdf" nodes\n"
3997 "Starting second pass (creation)\n",
4000 RExC_lastparse=NULL;
4002 /* Small enough for pointer-storage convention?
4003 If extralen==0, this means that we will not need long jumps. */
4004 if (RExC_size >= 0x10000L && RExC_extralen)
4005 RExC_size += RExC_extralen;
4008 if (RExC_whilem_seen > 15)
4009 RExC_whilem_seen = 15;
4012 /* Make room for a sentinel value at the end of the program */
4016 /* Allocate space and zero-initialize. Note, the two step process
4017 of zeroing when in debug mode, thus anything assigned has to
4018 happen after that */
4019 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
4022 FAIL("Regexp out of space");
4024 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
4025 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
4027 /* initialization begins here */
4028 r->engine= RE_ENGINE_PTR;
4030 r->prelen = xend - exp;
4031 r->precomp = savepvn(RExC_precomp, r->prelen);
4033 #ifdef PERL_OLD_COPY_ON_WRITE
4034 r->saved_copy = NULL;
4036 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
4037 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4038 r->lastparen = 0; /* mg.c reads this. */
4040 r->substrs = 0; /* Useful during FAIL. */
4041 r->startp = 0; /* Useful during FAIL. */
4046 if (RExC_seen & REG_SEEN_RECURSE) {
4047 Newxz(RExC_open_parens, RExC_npar,regnode *);
4048 SAVEFREEPV(RExC_open_parens);
4049 Newxz(RExC_close_parens,RExC_npar,regnode *);
4050 SAVEFREEPV(RExC_close_parens);
4053 /* Useful during FAIL. */
4054 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4056 r->offsets[0] = RExC_size;
4058 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
4059 "%s %"UVuf" bytes for offset annotations.\n",
4060 r->offsets ? "Got" : "Couldn't get",
4061 (UV)((2*RExC_size+1) * sizeof(U32))));
4065 /* Second pass: emit code. */
4066 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
4072 RExC_emit_start = r->program;
4073 RExC_emit = r->program;
4075 /* put a sentinal on the end of the program so we can check for
4077 r->program[RExC_size].type = 255;
4079 /* Store the count of eval-groups for security checks: */
4080 RExC_emit->next_off = (RExC_seen_evals > (I32)U16_MAX) ? U16_MAX : (U16)RExC_seen_evals;
4081 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
4083 if (reg(pRExC_state, 0, &flags,1) == NULL)
4086 /* XXXX To minimize changes to RE engine we always allocate
4087 3-units-long substrs field. */
4088 Newx(r->substrs, 1, struct reg_substr_data);
4089 if (RExC_recurse_count) {
4090 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4091 SAVEFREEPV(RExC_recurse);
4095 r->minlen = minlen = sawplus = sawopen = 0;
4096 Zero(r->substrs, 1, struct reg_substr_data);
4098 #ifdef TRIE_STUDY_OPT
4101 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
4103 RExC_state = copyRExC_state;
4104 if (seen & REG_TOP_LEVEL_BRANCHES)
4105 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4107 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
4108 if (data.last_found) {
4109 SvREFCNT_dec(data.longest_fixed);
4110 SvREFCNT_dec(data.longest_float);
4111 SvREFCNT_dec(data.last_found);
4113 StructCopy(&zero_scan_data, &data, scan_data_t);
4115 StructCopy(&zero_scan_data, &data, scan_data_t);
4116 copyRExC_state = RExC_state;
4119 StructCopy(&zero_scan_data, &data, scan_data_t);
4122 /* Dig out information for optimizations. */
4123 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
4124 pm->op_pmflags = RExC_flags;
4126 r->reganch |= ROPT_UTF8; /* Unicode in it? */
4127 r->regstclass = NULL;
4128 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
4129 r->reganch |= ROPT_NAUGHTY;
4130 scan = r->program + 1; /* First BRANCH. */
4132 /* testing for BRANCH here tells us whether there is "must appear"
4133 data in the pattern. If there is then we can use it for optimisations */
4134 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
4136 STRLEN longest_float_length, longest_fixed_length;
4137 struct regnode_charclass_class ch_class; /* pointed to by data */
4139 I32 last_close = 0; /* pointed to by data */
4142 /* Skip introductions and multiplicators >= 1. */
4143 while ((OP(first) == OPEN && (sawopen = 1)) ||
4144 /* An OR of *one* alternative - should not happen now. */
4145 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
4146 /* for now we can't handle lookbehind IFMATCH*/
4147 (OP(first) == IFMATCH && !first->flags) ||
4148 (OP(first) == PLUS) ||
4149 (OP(first) == MINMOD) ||
4150 /* An {n,m} with n>0 */
4151 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) )
4154 if (OP(first) == PLUS)
4157 first += regarglen[OP(first)];
4158 if (OP(first) == IFMATCH) {
4159 first = NEXTOPER(first);
4160 first += EXTRA_STEP_2ARGS;
4161 } else /* XXX possible optimisation for /(?=)/ */
4162 first = NEXTOPER(first);
4165 /* Starting-point info. */
4167 DEBUG_PEEP("first:",first,0);
4168 /* Ignore EXACT as we deal with it later. */
4169 if (PL_regkind[OP(first)] == EXACT) {
4170 if (OP(first) == EXACT)
4171 NOOP; /* Empty, get anchored substr later. */
4172 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
4173 r->regstclass = first;
4176 else if (PL_regkind[OP(first)] == TRIE &&
4177 ((reg_trie_data *)r->data->data[ ARG(first) ])->minlen>0)
4180 /* this can happen only on restudy */
4181 if ( OP(first) == TRIE ) {
4182 struct regnode_1 *trieop;
4183 Newxz(trieop,1,struct regnode_1);
4184 StructCopy(first,trieop,struct regnode_1);
4185 trie_op=(regnode *)trieop;
4187 struct regnode_charclass *trieop;
4188 Newxz(trieop,1,struct regnode_charclass);
4189 StructCopy(first,trieop,struct regnode_charclass);
4190 trie_op=(regnode *)trieop;
4193 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4194 r->regstclass = trie_op;
4197 else if (strchr((const char*)PL_simple,OP(first)))
4198 r->regstclass = first;
4199 else if (PL_regkind[OP(first)] == BOUND ||
4200 PL_regkind[OP(first)] == NBOUND)
4201 r->regstclass = first;
4202 else if (PL_regkind[OP(first)] == BOL) {
4203 r->reganch |= (OP(first) == MBOL
4205 : (OP(first) == SBOL
4208 first = NEXTOPER(first);
4211 else if (OP(first) == GPOS) {
4212 r->reganch |= ROPT_ANCH_GPOS;
4213 first = NEXTOPER(first);
4216 else if ((!sawopen || !RExC_sawback) &&
4217 (OP(first) == STAR &&
4218 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
4219 !(r->reganch & ROPT_ANCH) )
4221 /* turn .* into ^.* with an implied $*=1 */
4223 (OP(NEXTOPER(first)) == REG_ANY)
4226 r->reganch |= type | ROPT_IMPLICIT;
4227 first = NEXTOPER(first);
4230 if (sawplus && (!sawopen || !RExC_sawback)
4231 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
4232 /* x+ must match at the 1st pos of run of x's */
4233 r->reganch |= ROPT_SKIP;
4235 /* Scan is after the zeroth branch, first is atomic matcher. */
4236 #ifdef TRIE_STUDY_OPT
4239 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4240 (IV)(first - scan + 1))
4244 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4245 (IV)(first - scan + 1))
4251 * If there's something expensive in the r.e., find the
4252 * longest literal string that must appear and make it the
4253 * regmust. Resolve ties in favor of later strings, since
4254 * the regstart check works with the beginning of the r.e.
4255 * and avoiding duplication strengthens checking. Not a
4256 * strong reason, but sufficient in the absence of others.
4257 * [Now we resolve ties in favor of the earlier string if
4258 * it happens that c_offset_min has been invalidated, since the
4259 * earlier string may buy us something the later one won't.]
4262 data.longest_fixed = newSVpvs("");
4263 data.longest_float = newSVpvs("");
4264 data.last_found = newSVpvs("");
4265 data.longest = &(data.longest_fixed);
4267 if (!r->regstclass) {
4268 cl_init(pRExC_state, &ch_class);
4269 data.start_class = &ch_class;
4270 stclass_flag = SCF_DO_STCLASS_AND;
4271 } else /* XXXX Check for BOUND? */
4273 data.last_closep = &last_close;
4275 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
4276 &data, -1, NULL, NULL,
4277 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
4283 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
4284 && data.last_start_min == 0 && data.last_end > 0
4285 && !RExC_seen_zerolen
4286 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
4287 r->reganch |= ROPT_CHECK_ALL;
4288 scan_commit(pRExC_state, &data,&minlen);
4289 SvREFCNT_dec(data.last_found);
4291 /* Note that code very similar to this but for anchored string
4292 follows immediately below, changes may need to be made to both.
4295 longest_float_length = CHR_SVLEN(data.longest_float);
4296 if (longest_float_length
4297 || (data.flags & SF_FL_BEFORE_EOL
4298 && (!(data.flags & SF_FL_BEFORE_MEOL)
4299 || (RExC_flags & PMf_MULTILINE))))
4303 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
4304 && data.offset_fixed == data.offset_float_min
4305 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4306 goto remove_float; /* As in (a)+. */
4308 /* copy the information about the longest float from the reg_scan_data
4309 over to the program. */
4310 if (SvUTF8(data.longest_float)) {
4311 r->float_utf8 = data.longest_float;
4312 r->float_substr = NULL;
4314 r->float_substr = data.longest_float;
4315 r->float_utf8 = NULL;
4317 /* float_end_shift is how many chars that must be matched that
4318 follow this item. We calculate it ahead of time as once the
4319 lookbehind offset is added in we lose the ability to correctly
4321 ml = data.minlen_float ? *(data.minlen_float)
4322 : (I32)longest_float_length;
4323 r->float_end_shift = ml - data.offset_float_min
4324 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4325 + data.lookbehind_float;
4326 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
4327 r->float_max_offset = data.offset_float_max;
4328 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
4329 r->float_max_offset -= data.lookbehind_float;
4331 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4332 && (!(data.flags & SF_FL_BEFORE_MEOL)
4333 || (RExC_flags & PMf_MULTILINE)));
4334 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
4338 r->float_substr = r->float_utf8 = NULL;
4339 SvREFCNT_dec(data.longest_float);
4340 longest_float_length = 0;
4343 /* Note that code very similar to this but for floating string
4344 is immediately above, changes may need to be made to both.
4347 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
4348 if (longest_fixed_length
4349 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4350 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4351 || (RExC_flags & PMf_MULTILINE))))
4355 /* copy the information about the longest fixed
4356 from the reg_scan_data over to the program. */
4357 if (SvUTF8(data.longest_fixed)) {
4358 r->anchored_utf8 = data.longest_fixed;
4359 r->anchored_substr = NULL;
4361 r->anchored_substr = data.longest_fixed;
4362 r->anchored_utf8 = NULL;
4364 /* fixed_end_shift is how many chars that must be matched that
4365 follow this item. We calculate it ahead of time as once the
4366 lookbehind offset is added in we lose the ability to correctly
4368 ml = data.minlen_fixed ? *(data.minlen_fixed)
4369 : (I32)longest_fixed_length;
4370 r->anchored_end_shift = ml - data.offset_fixed
4371 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4372 + data.lookbehind_fixed;
4373 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4375 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4376 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4377 || (RExC_flags & PMf_MULTILINE)));
4378 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
4381 r->anchored_substr = r->anchored_utf8 = NULL;
4382 SvREFCNT_dec(data.longest_fixed);
4383 longest_fixed_length = 0;
4386 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
4387 r->regstclass = NULL;
4388 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4390 && !(data.start_class->flags & ANYOF_EOS)
4391 && !cl_is_anything(data.start_class))
4393 const I32 n = add_data(pRExC_state, 1, "f");
4395 Newx(RExC_rx->data->data[n], 1,
4396 struct regnode_charclass_class);
4397 StructCopy(data.start_class,
4398 (struct regnode_charclass_class*)RExC_rx->data->data[n],
4399 struct regnode_charclass_class);
4400 r->regstclass = (regnode*)RExC_rx->data->data[n];
4401 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
4402 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
4403 regprop(r, sv, (regnode*)data.start_class);
4404 PerlIO_printf(Perl_debug_log,
4405 "synthetic stclass \"%s\".\n",
4406 SvPVX_const(sv));});
4409 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
4410 if (longest_fixed_length > longest_float_length) {
4411 r->check_end_shift = r->anchored_end_shift;
4412 r->check_substr = r->anchored_substr;
4413 r->check_utf8 = r->anchored_utf8;
4414 r->check_offset_min = r->check_offset_max = r->anchored_offset;
4415 if (r->reganch & ROPT_ANCH_SINGLE)
4416 r->reganch |= ROPT_NOSCAN;
4419 r->check_end_shift = r->float_end_shift;
4420 r->check_substr = r->float_substr;
4421 r->check_utf8 = r->float_utf8;
4422 r->check_offset_min = r->float_min_offset;
4423 r->check_offset_max = r->float_max_offset;
4425 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4426 This should be changed ASAP! */
4427 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
4428 r->reganch |= RE_USE_INTUIT;
4429 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
4430 r->reganch |= RE_INTUIT_TAIL;
4432 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4433 if ( (STRLEN)minlen < longest_float_length )
4434 minlen= longest_float_length;
4435 if ( (STRLEN)minlen < longest_fixed_length )
4436 minlen= longest_fixed_length;
4440 /* Several toplevels. Best we can is to set minlen. */
4442 struct regnode_charclass_class ch_class;
4445 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
4447 scan = r->program + 1;
4448 cl_init(pRExC_state, &ch_class);
4449 data.start_class = &ch_class;
4450 data.last_closep = &last_close;
4453 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
4454 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
4458 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
4459 = r->float_substr = r->float_utf8 = NULL;
4460 if (!(data.start_class->flags & ANYOF_EOS)
4461 && !cl_is_anything(data.start_class))
4463 const I32 n = add_data(pRExC_state, 1, "f");
4465 Newx(RExC_rx->data->data[n], 1,
4466 struct regnode_charclass_class);
4467 StructCopy(data.start_class,
4468 (struct regnode_charclass_class*)RExC_rx->data->data[n],
4469 struct regnode_charclass_class);
4470 r->regstclass = (regnode*)RExC_rx->data->data[n];
4471 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
4472 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
4473 regprop(r, sv, (regnode*)data.start_class);
4474 PerlIO_printf(Perl_debug_log,
4475 "synthetic stclass \"%s\".\n",
4476 SvPVX_const(sv));});
4480 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4481 the "real" pattern. */
4483 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
4486 r->minlenret = minlen;
4487 if (r->minlen < minlen)
4490 if (RExC_seen & REG_SEEN_GPOS)
4491 r->reganch |= ROPT_GPOS_SEEN;
4492 if (RExC_seen & REG_SEEN_LOOKBEHIND)
4493 r->reganch |= ROPT_LOOKBEHIND_SEEN;
4494 if (RExC_seen & REG_SEEN_EVAL)
4495 r->reganch |= ROPT_EVAL_SEEN;
4496 if (RExC_seen & REG_SEEN_CANY)
4497 r->reganch |= ROPT_CANY_SEEN;
4498 if (RExC_seen & REG_SEEN_VERBARG)
4499 r->reganch |= ROPT_VERBARG_SEEN;
4500 if (RExC_seen & REG_SEEN_CUTGROUP)
4501 r->reganch |= ROPT_CUTGROUP_SEEN;
4502 if (RExC_paren_names)
4503 r->paren_names = (HV*)SvREFCNT_inc(RExC_paren_names);
4505 r->paren_names = NULL;
4507 if (RExC_recurse_count) {
4508 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
4509 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
4510 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
4513 Newxz(r->startp, RExC_npar, I32);
4514 Newxz(r->endp, RExC_npar, I32);
4515 /* assume we don't need to swap parens around before we match */
4518 PerlIO_printf(Perl_debug_log,"Final program:\n");
4521 DEBUG_OFFSETS_r(if (r->offsets) {
4522 const U32 len = r->offsets[0];
4524 GET_RE_DEBUG_FLAGS_DECL;
4525 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
4526 for (i = 1; i <= len; i++) {
4527 if (r->offsets[i*2-1] || r->offsets[i*2])
4528 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
4529 (UV)i, (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
4531 PerlIO_printf(Perl_debug_log, "\n");
4537 #undef CORE_ONLY_BLOCK
4539 #undef RE_ENGINE_PTR
4541 #ifndef PERL_IN_XSUB_RE
4543 Perl_reg_named_buff_sv(pTHX_ SV* namesv)
4545 I32 parno = 0; /* no match */
4547 const REGEXP * const rx = PM_GETRE(PL_curpm);
4548 if (rx && rx->paren_names) {
4549 HE *he_str = hv_fetch_ent( rx->paren_names, namesv, 0, 0 );
4552 SV* sv_dat=HeVAL(he_str);
4553 I32 *nums=(I32*)SvPVX(sv_dat);
4554 for ( i=0; i<SvIVX(sv_dat); i++ ) {
4555 if ((I32)(rx->lastparen) >= nums[i] &&
4556 rx->endp[nums[i]] != -1)
4569 SV *sv= sv_newmortal();
4570 Perl_sv_setpvf(aTHX_ sv, "%"IVdf,(IV)parno);
4571 gv_paren= Perl_gv_fetchsv(aTHX_ sv, GV_ADD, SVt_PVGV);
4572 return GvSVn(gv_paren);
4577 /* Scans the name of a named buffer from the pattern.
4578 * If flags is REG_RSN_RETURN_NULL returns null.
4579 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
4580 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
4581 * to the parsed name as looked up in the RExC_paren_names hash.
4582 * If there is an error throws a vFAIL().. type exception.
4585 #define REG_RSN_RETURN_NULL 0
4586 #define REG_RSN_RETURN_NAME 1
4587 #define REG_RSN_RETURN_DATA 2
4590 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) {
4591 char *name_start = RExC_parse;
4594 while( isIDFIRST_uni(utf8n_to_uvchr((U8*)RExC_parse,
4595 RExC_end - RExC_parse, &numlen, UTF8_ALLOW_DEFAULT)))
4597 RExC_parse += numlen;
4600 while( isIDFIRST(*RExC_parse) )
4604 SV* sv_name = sv_2mortal(Perl_newSVpvn(aTHX_ name_start,
4605 (int)(RExC_parse - name_start)));
4608 if ( flags == REG_RSN_RETURN_NAME)
4610 else if (flags==REG_RSN_RETURN_DATA) {
4613 if ( ! sv_name ) /* should not happen*/
4614 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
4615 if (RExC_paren_names)
4616 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
4618 sv_dat = HeVAL(he_str);
4620 vFAIL("Reference to nonexistent named group");
4624 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
4631 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
4632 int rem=(int)(RExC_end - RExC_parse); \
4641 if (RExC_lastparse!=RExC_parse) \
4642 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
4645 iscut ? "..." : "<" \
4648 PerlIO_printf(Perl_debug_log,"%16s",""); \
4653 num=REG_NODE_NUM(RExC_emit); \
4654 if (RExC_lastnum!=num) \
4655 PerlIO_printf(Perl_debug_log,"|%4d",num); \
4657 PerlIO_printf(Perl_debug_log,"|%4s",""); \
4658 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
4659 (int)((depth*2)), "", \
4663 RExC_lastparse=RExC_parse; \
4668 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
4669 DEBUG_PARSE_MSG((funcname)); \
4670 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
4672 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
4673 DEBUG_PARSE_MSG((funcname)); \
4674 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
4677 - reg - regular expression, i.e. main body or parenthesized thing
4679 * Caller must absorb opening parenthesis.
4681 * Combining parenthesis handling with the base level of regular expression
4682 * is a trifle forced, but the need to tie the tails of the branches to what
4683 * follows makes it hard to avoid.
4685 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
4687 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
4689 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
4692 /* this idea is borrowed from STR_WITH_LEN in handy.h */
4693 #define CHECK_WORD(s,v,l) \
4694 (((sizeof(s)-1)==(l)) && (strnEQ(start_verb, (s ""), (sizeof(s)-1))))
4697 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
4698 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
4701 register regnode *ret; /* Will be the head of the group. */
4702 register regnode *br;
4703 register regnode *lastbr;
4704 register regnode *ender = NULL;
4705 register I32 parno = 0;
4707 const I32 oregflags = RExC_flags;
4708 bool have_branch = 0;
4711 /* for (?g), (?gc), and (?o) warnings; warning
4712 about (?c) will warn about (?g) -- japhy */
4714 #define WASTED_O 0x01
4715 #define WASTED_G 0x02
4716 #define WASTED_C 0x04
4717 #define WASTED_GC (0x02|0x04)
4718 I32 wastedflags = 0x00;
4720 char * parse_start = RExC_parse; /* MJD */
4721 char * const oregcomp_parse = RExC_parse;
4723 GET_RE_DEBUG_FLAGS_DECL;
4724 DEBUG_PARSE("reg ");
4727 *flagp = 0; /* Tentatively. */
4730 /* Make an OPEN node, if parenthesized. */
4732 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
4733 char *start_verb = RExC_parse;
4734 STRLEN verb_len = 0;
4735 char *start_arg = NULL;
4736 unsigned char op = 0;
4738 int internal_argval = 0; /* internal_argval is only useful if !argok */
4739 while ( *RExC_parse && *RExC_parse != ')' ) {
4740 if ( *RExC_parse == ':' ) {
4741 start_arg = RExC_parse + 1;
4747 verb_len = RExC_parse - start_verb;
4750 while ( *RExC_parse && *RExC_parse != ')' )
4752 if ( *RExC_parse != ')' )
4753 vFAIL("Unterminated verb pattern argument");
4754 if ( RExC_parse == start_arg )
4757 if ( *RExC_parse != ')' )
4758 vFAIL("Unterminated verb pattern");
4761 switch ( *start_verb ) {
4762 case 'A': /* (*ACCEPT) */
4763 if ( CHECK_WORD("ACCEPT",start_verb,verb_len) ) {
4765 internal_argval = RExC_nestroot;
4768 case 'C': /* (*COMMIT) */
4769 if ( CHECK_WORD("COMMIT",start_verb,verb_len) )
4772 case 'F': /* (*FAIL) */
4773 if ( verb_len==1 || CHECK_WORD("FAIL",start_verb,verb_len) ) {
4778 case ':': /* (*:NAME) */
4779 case 'M': /* (*MARK:NAME) */
4780 if ( verb_len==0 || CHECK_WORD("MARK",start_verb,verb_len) ) {
4785 case 'P': /* (*PRUNE) */
4786 if ( CHECK_WORD("PRUNE",start_verb,verb_len) )
4789 case 'S': /* (*SKIP) */
4790 if ( CHECK_WORD("SKIP",start_verb,verb_len) )
4793 case 'T': /* (*THEN) */
4794 /* [19:06] <TimToady> :: is then */
4795 if ( CHECK_WORD("THEN",start_verb,verb_len) ) {
4797 RExC_seen |= REG_SEEN_CUTGROUP;
4803 vFAIL3("Unknown verb pattern '%.*s'",
4804 verb_len, start_verb);
4807 if ( start_arg && internal_argval ) {
4808 vFAIL3("Verb pattern '%.*s' may not have an argument",
4809 verb_len, start_verb);
4810 } else if ( argok < 0 && !start_arg ) {
4811 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
4812 verb_len, start_verb);
4814 ret = reganode(pRExC_state, op, internal_argval);
4815 if ( ! internal_argval && ! SIZE_ONLY ) {
4817 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
4818 ARG(ret) = add_data( pRExC_state, 1, "S" );
4819 RExC_rx->data->data[ARG(ret)]=(void*)sv;
4826 if (!internal_argval)
4827 RExC_seen |= REG_SEEN_VERBARG;
4828 } else if ( start_arg ) {
4829 vFAIL3("Verb pattern '%.*s' may not have an argument",
4830 verb_len, start_verb);
4832 ret = reg_node(pRExC_state, op);
4834 nextchar(pRExC_state);
4837 if (*RExC_parse == '?') { /* (?...) */
4838 U32 posflags = 0, negflags = 0;
4839 U32 *flagsp = &posflags;
4840 bool is_logical = 0;
4841 const char * const seqstart = RExC_parse;
4844 paren = *RExC_parse++;
4845 ret = NULL; /* For look-ahead/behind. */
4848 case '<': /* (?<...) */
4849 if (*RExC_parse == '!')
4851 else if (*RExC_parse != '=')
4856 case '\'': /* (?'...') */
4857 name_start= RExC_parse;
4858 svname = reg_scan_name(pRExC_state,
4859 SIZE_ONLY ? /* reverse test from the others */
4860 REG_RSN_RETURN_NAME :
4861 REG_RSN_RETURN_NULL);
4862 if (RExC_parse == name_start)
4864 if (*RExC_parse != paren)
4865 vFAIL2("Sequence (?%c... not terminated",
4866 paren=='>' ? '<' : paren);
4870 if (!svname) /* shouldnt happen */
4872 "panic: reg_scan_name returned NULL");
4873 if (!RExC_paren_names) {
4874 RExC_paren_names= newHV();
4875 sv_2mortal((SV*)RExC_paren_names);
4877 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
4879 sv_dat = HeVAL(he_str);
4881 /* croak baby croak */
4883 "panic: paren_name hash element allocation failed");
4884 } else if ( SvPOK(sv_dat) ) {
4885 IV count=SvIV(sv_dat);
4886 I32 *pv=(I32*)SvGROW(sv_dat,SvCUR(sv_dat)+sizeof(I32)+1);
4887 SvCUR_set(sv_dat,SvCUR(sv_dat)+sizeof(I32));
4888 pv[count]=RExC_npar;
4891 (void)SvUPGRADE(sv_dat,SVt_PVNV);
4892 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
4897 /*sv_dump(sv_dat);*/
4899 nextchar(pRExC_state);
4901 goto capturing_parens;
4903 RExC_seen |= REG_SEEN_LOOKBEHIND;
4905 case '=': /* (?=...) */
4906 case '!': /* (?!...) */
4907 RExC_seen_zerolen++;
4908 if (*RExC_parse == ')') {
4909 ret=reg_node(pRExC_state, OPFAIL);
4910 nextchar(pRExC_state);
4913 case ':': /* (?:...) */
4914 case '>': /* (?>...) */
4916 case '$': /* (?$...) */
4917 case '@': /* (?@...) */
4918 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
4920 case '#': /* (?#...) */
4921 while (*RExC_parse && *RExC_parse != ')')
4923 if (*RExC_parse != ')')
4924 FAIL("Sequence (?#... not terminated");
4925 nextchar(pRExC_state);
4928 case '0' : /* (?0) */
4929 case 'R' : /* (?R) */
4930 if (*RExC_parse != ')')
4931 FAIL("Sequence (?R) not terminated");
4932 ret = reg_node(pRExC_state, GOSTART);
4933 nextchar(pRExC_state);
4936 { /* named and numeric backreferences */
4939 case '&': /* (?&NAME) */
4940 parse_start = RExC_parse - 1;
4942 SV *sv_dat = reg_scan_name(pRExC_state,
4943 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
4944 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
4946 goto gen_recurse_regop;
4949 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
4951 vFAIL("Illegal pattern");
4953 goto parse_recursion;
4955 case '-': /* (?-1) */
4956 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
4957 RExC_parse--; /* rewind to let it be handled later */
4961 case '1': case '2': case '3': case '4': /* (?1) */
4962 case '5': case '6': case '7': case '8': case '9':
4965 num = atoi(RExC_parse);
4966 parse_start = RExC_parse - 1; /* MJD */
4967 if (*RExC_parse == '-')
4969 while (isDIGIT(*RExC_parse))
4971 if (*RExC_parse!=')')
4972 vFAIL("Expecting close bracket");
4975 if ( paren == '-' ) {
4977 Diagram of capture buffer numbering.
4978 Top line is the normal capture buffer numbers
4979 Botton line is the negative indexing as from
4983 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
4987 num = RExC_npar + num;
4990 vFAIL("Reference to nonexistent group");
4992 } else if ( paren == '+' ) {
4993 num = RExC_npar + num - 1;
4996 ret = reganode(pRExC_state, GOSUB, num);
4998 if (num > (I32)RExC_rx->nparens) {
5000 vFAIL("Reference to nonexistent group");
5002 ARG2L_SET( ret, RExC_recurse_count++);
5004 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5005 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
5009 RExC_seen |= REG_SEEN_RECURSE;
5010 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
5011 Set_Node_Offset(ret, parse_start); /* MJD */
5013 nextchar(pRExC_state);
5015 } /* named and numeric backreferences */
5018 case 'p': /* (?p...) */
5019 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
5020 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
5022 case '?': /* (??...) */
5024 if (*RExC_parse != '{')
5026 paren = *RExC_parse++;
5028 case '{': /* (?{...}) */
5030 I32 count = 1, n = 0;
5032 char *s = RExC_parse;
5034 RExC_seen_zerolen++;
5035 RExC_seen |= REG_SEEN_EVAL;
5036 while (count && (c = *RExC_parse)) {
5047 if (*RExC_parse != ')') {
5049 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
5053 OP_4tree *sop, *rop;
5054 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
5057 Perl_save_re_context(aTHX);
5058 rop = sv_compile_2op(sv, &sop, "re", &pad);
5059 sop->op_private |= OPpREFCOUNTED;
5060 /* re_dup will OpREFCNT_inc */
5061 OpREFCNT_set(sop, 1);
5064 n = add_data(pRExC_state, 3, "nop");
5065 RExC_rx->data->data[n] = (void*)rop;
5066 RExC_rx->data->data[n+1] = (void*)sop;
5067 RExC_rx->data->data[n+2] = (void*)pad;
5070 else { /* First pass */
5071 if (PL_reginterp_cnt < ++RExC_seen_evals
5073 /* No compiled RE interpolated, has runtime
5074 components ===> unsafe. */
5075 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5076 if (PL_tainting && PL_tainted)
5077 FAIL("Eval-group in insecure regular expression");
5078 #if PERL_VERSION > 8
5079 if (IN_PERL_COMPILETIME)
5084 nextchar(pRExC_state);
5086 ret = reg_node(pRExC_state, LOGICAL);
5089 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
5090 /* deal with the length of this later - MJD */
5093 ret = reganode(pRExC_state, EVAL, n);
5094 Set_Node_Length(ret, RExC_parse - parse_start + 1);
5095 Set_Node_Offset(ret, parse_start);
5098 case '(': /* (?(?{...})...) and (?(?=...)...) */
5101 if (RExC_parse[0] == '?') { /* (?(?...)) */
5102 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
5103 || RExC_parse[1] == '<'
5104 || RExC_parse[1] == '{') { /* Lookahead or eval. */
5107 ret = reg_node(pRExC_state, LOGICAL);
5110 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
5114 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
5115 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
5117 char ch = RExC_parse[0] == '<' ? '>' : '\'';
5118 char *name_start= RExC_parse++;
5120 SV *sv_dat=reg_scan_name(pRExC_state,
5121 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5122 if (RExC_parse == name_start || *RExC_parse != ch)
5123 vFAIL2("Sequence (?(%c... not terminated",
5124 (ch == '>' ? '<' : ch));
5127 num = add_data( pRExC_state, 1, "S" );
5128 RExC_rx->data->data[num]=(void*)sv_dat;
5129 SvREFCNT_inc(sv_dat);
5131 ret = reganode(pRExC_state,NGROUPP,num);
5132 goto insert_if_check_paren;
5134 else if (RExC_parse[0] == 'D' &&
5135 RExC_parse[1] == 'E' &&
5136 RExC_parse[2] == 'F' &&
5137 RExC_parse[3] == 'I' &&
5138 RExC_parse[4] == 'N' &&
5139 RExC_parse[5] == 'E')
5141 ret = reganode(pRExC_state,DEFINEP,0);
5144 goto insert_if_check_paren;
5146 else if (RExC_parse[0] == 'R') {
5149 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5150 parno = atoi(RExC_parse++);
5151 while (isDIGIT(*RExC_parse))
5153 } else if (RExC_parse[0] == '&') {
5156 sv_dat = reg_scan_name(pRExC_state,
5157 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5158 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5160 ret = reganode(pRExC_state,INSUBP,parno);
5161 goto insert_if_check_paren;
5163 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5166 parno = atoi(RExC_parse++);
5168 while (isDIGIT(*RExC_parse))
5170 ret = reganode(pRExC_state, GROUPP, parno);
5172 insert_if_check_paren:
5173 if ((c = *nextchar(pRExC_state)) != ')')
5174 vFAIL("Switch condition not recognized");
5176 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
5177 br = regbranch(pRExC_state, &flags, 1,depth+1);
5179 br = reganode(pRExC_state, LONGJMP, 0);
5181 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
5182 c = *nextchar(pRExC_state);
5187 vFAIL("(?(DEFINE)....) does not allow branches");
5188 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
5189 regbranch(pRExC_state, &flags, 1,depth+1);
5190 REGTAIL(pRExC_state, ret, lastbr);
5193 c = *nextchar(pRExC_state);
5198 vFAIL("Switch (?(condition)... contains too many branches");
5199 ender = reg_node(pRExC_state, TAIL);
5200 REGTAIL(pRExC_state, br, ender);
5202 REGTAIL(pRExC_state, lastbr, ender);
5203 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
5206 REGTAIL(pRExC_state, ret, ender);
5210 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
5214 RExC_parse--; /* for vFAIL to print correctly */
5215 vFAIL("Sequence (? incomplete");
5219 parse_flags: /* (?i) */
5220 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
5221 /* (?g), (?gc) and (?o) are useless here
5222 and must be globally applied -- japhy */
5224 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
5225 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5226 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
5227 if (! (wastedflags & wflagbit) ) {
5228 wastedflags |= wflagbit;
5231 "Useless (%s%c) - %suse /%c modifier",
5232 flagsp == &negflags ? "?-" : "?",
5234 flagsp == &negflags ? "don't " : "",
5240 else if (*RExC_parse == 'c') {
5241 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5242 if (! (wastedflags & WASTED_C) ) {
5243 wastedflags |= WASTED_GC;
5246 "Useless (%sc) - %suse /gc modifier",
5247 flagsp == &negflags ? "?-" : "?",
5248 flagsp == &negflags ? "don't " : ""
5253 else { pmflag(flagsp, *RExC_parse); }
5257 if (*RExC_parse == '-') {
5259 wastedflags = 0; /* reset so (?g-c) warns twice */
5263 RExC_flags |= posflags;
5264 RExC_flags &= ~negflags;
5265 if (*RExC_parse == ':') {
5271 if (*RExC_parse != ')') {
5273 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5275 nextchar(pRExC_state);
5285 ret = reganode(pRExC_state, OPEN, parno);
5288 RExC_nestroot = parno;
5289 if (RExC_seen & REG_SEEN_RECURSE) {
5290 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5291 "Setting open paren #%"IVdf" to %d\n",
5292 (IV)parno, REG_NODE_NUM(ret)));
5293 RExC_open_parens[parno-1]= ret;
5296 Set_Node_Length(ret, 1); /* MJD */
5297 Set_Node_Offset(ret, RExC_parse); /* MJD */
5304 /* Pick up the branches, linking them together. */
5305 parse_start = RExC_parse; /* MJD */
5306 br = regbranch(pRExC_state, &flags, 1,depth+1);
5307 /* branch_len = (paren != 0); */
5311 if (*RExC_parse == '|') {
5312 if (!SIZE_ONLY && RExC_extralen) {
5313 reginsert(pRExC_state, BRANCHJ, br, depth+1);
5316 reginsert(pRExC_state, BRANCH, br, depth+1);
5317 Set_Node_Length(br, paren != 0);
5318 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
5322 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
5324 else if (paren == ':') {
5325 *flagp |= flags&SIMPLE;
5327 if (is_open) { /* Starts with OPEN. */
5328 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
5330 else if (paren != '?') /* Not Conditional */
5332 *flagp |= flags & (SPSTART | HASWIDTH);
5334 while (*RExC_parse == '|') {
5335 if (!SIZE_ONLY && RExC_extralen) {
5336 ender = reganode(pRExC_state, LONGJMP,0);
5337 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
5340 RExC_extralen += 2; /* Account for LONGJMP. */
5341 nextchar(pRExC_state);
5342 br = regbranch(pRExC_state, &flags, 0, depth+1);
5346 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
5350 *flagp |= flags&SPSTART;
5353 if (have_branch || paren != ':') {
5354 /* Make a closing node, and hook it on the end. */
5357 ender = reg_node(pRExC_state, TAIL);
5361 ender = reganode(pRExC_state, CLOSE, parno);
5362 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
5363 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5364 "Setting close paren #%"IVdf" to %d\n",
5365 (IV)parno, REG_NODE_NUM(ender)));
5366 RExC_close_parens[parno-1]= ender;
5367 if (RExC_nestroot == parno)
5370 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
5371 Set_Node_Length(ender,1); /* MJD */
5377 *flagp &= ~HASWIDTH;
5380 ender = reg_node(pRExC_state, SUCCEED);
5383 ender = reg_node(pRExC_state, END);
5385 assert(!RExC_opend); /* there can only be one! */
5390 REGTAIL(pRExC_state, lastbr, ender);
5392 if (have_branch && !SIZE_ONLY) {
5394 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
5396 /* Hook the tails of the branches to the closing node. */
5397 for (br = ret; br; br = regnext(br)) {
5398 const U8 op = PL_regkind[OP(br)];
5400 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
5402 else if (op == BRANCHJ) {
5403 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
5411 static const char parens[] = "=!<,>";
5413 if (paren && (p = strchr(parens, paren))) {
5414 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
5415 int flag = (p - parens) > 1;
5418 node = SUSPEND, flag = 0;
5419 reginsert(pRExC_state, node,ret, depth+1);
5420 Set_Node_Cur_Length(ret);
5421 Set_Node_Offset(ret, parse_start + 1);
5423 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
5427 /* Check for proper termination. */
5429 RExC_flags = oregflags;
5430 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
5431 RExC_parse = oregcomp_parse;
5432 vFAIL("Unmatched (");
5435 else if (!paren && RExC_parse < RExC_end) {
5436 if (*RExC_parse == ')') {
5438 vFAIL("Unmatched )");
5441 FAIL("Junk on end of regexp"); /* "Can't happen". */
5449 - regbranch - one alternative of an | operator
5451 * Implements the concatenation operator.
5454 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
5457 register regnode *ret;
5458 register regnode *chain = NULL;
5459 register regnode *latest;
5460 I32 flags = 0, c = 0;
5461 GET_RE_DEBUG_FLAGS_DECL;
5462 DEBUG_PARSE("brnc");
5466 if (!SIZE_ONLY && RExC_extralen)
5467 ret = reganode(pRExC_state, BRANCHJ,0);
5469 ret = reg_node(pRExC_state, BRANCH);
5470 Set_Node_Length(ret, 1);
5474 if (!first && SIZE_ONLY)
5475 RExC_extralen += 1; /* BRANCHJ */
5477 *flagp = WORST; /* Tentatively. */
5480 nextchar(pRExC_state);
5481 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
5483 latest = regpiece(pRExC_state, &flags,depth+1);
5484 if (latest == NULL) {
5485 if (flags & TRYAGAIN)
5489 else if (ret == NULL)
5491 *flagp |= flags&HASWIDTH;
5492 if (chain == NULL) /* First piece. */
5493 *flagp |= flags&SPSTART;
5496 REGTAIL(pRExC_state, chain, latest);
5501 if (chain == NULL) { /* Loop ran zero times. */
5502 chain = reg_node(pRExC_state, NOTHING);
5507 *flagp |= flags&SIMPLE;
5514 - regpiece - something followed by possible [*+?]
5516 * Note that the branching code sequences used for ? and the general cases
5517 * of * and + are somewhat optimized: they use the same NOTHING node as
5518 * both the endmarker for their branch list and the body of the last branch.
5519 * It might seem that this node could be dispensed with entirely, but the
5520 * endmarker role is not redundant.
5523 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
5526 register regnode *ret;
5528 register char *next;
5530 const char * const origparse = RExC_parse;
5532 I32 max = REG_INFTY;
5534 const char *maxpos = NULL;
5535 GET_RE_DEBUG_FLAGS_DECL;
5536 DEBUG_PARSE("piec");
5538 ret = regatom(pRExC_state, &flags,depth+1);
5540 if (flags & TRYAGAIN)
5547 if (op == '{' && regcurly(RExC_parse)) {
5549 parse_start = RExC_parse; /* MJD */
5550 next = RExC_parse + 1;
5551 while (isDIGIT(*next) || *next == ',') {
5560 if (*next == '}') { /* got one */
5564 min = atoi(RExC_parse);
5568 maxpos = RExC_parse;
5570 if (!max && *maxpos != '0')
5571 max = REG_INFTY; /* meaning "infinity" */
5572 else if (max >= REG_INFTY)
5573 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
5575 nextchar(pRExC_state);
5578 if ((flags&SIMPLE)) {
5579 RExC_naughty += 2 + RExC_naughty / 2;
5580 reginsert(pRExC_state, CURLY, ret, depth+1);
5581 Set_Node_Offset(ret, parse_start+1); /* MJD */
5582 Set_Node_Cur_Length(ret);
5585 regnode * const w = reg_node(pRExC_state, WHILEM);
5588 REGTAIL(pRExC_state, ret, w);
5589 if (!SIZE_ONLY && RExC_extralen) {
5590 reginsert(pRExC_state, LONGJMP,ret, depth+1);
5591 reginsert(pRExC_state, NOTHING,ret, depth+1);
5592 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
5594 reginsert(pRExC_state, CURLYX,ret, depth+1);
5596 Set_Node_Offset(ret, parse_start+1);
5597 Set_Node_Length(ret,
5598 op == '{' ? (RExC_parse - parse_start) : 1);
5600 if (!SIZE_ONLY && RExC_extralen)
5601 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
5602 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
5604 RExC_whilem_seen++, RExC_extralen += 3;
5605 RExC_naughty += 4 + RExC_naughty; /* compound interest */
5613 if (max && max < min)
5614 vFAIL("Can't do {n,m} with n > m");
5616 ARG1_SET(ret, (U16)min);
5617 ARG2_SET(ret, (U16)max);
5629 #if 0 /* Now runtime fix should be reliable. */
5631 /* if this is reinstated, don't forget to put this back into perldiag:
5633 =item Regexp *+ operand could be empty at {#} in regex m/%s/
5635 (F) The part of the regexp subject to either the * or + quantifier
5636 could match an empty string. The {#} shows in the regular
5637 expression about where the problem was discovered.
5641 if (!(flags&HASWIDTH) && op != '?')
5642 vFAIL("Regexp *+ operand could be empty");
5645 parse_start = RExC_parse;
5646 nextchar(pRExC_state);
5648 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
5650 if (op == '*' && (flags&SIMPLE)) {
5651 reginsert(pRExC_state, STAR, ret, depth+1);
5655 else if (op == '*') {
5659 else if (op == '+' && (flags&SIMPLE)) {
5660 reginsert(pRExC_state, PLUS, ret, depth+1);
5664 else if (op == '+') {
5668 else if (op == '?') {
5673 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
5675 "%.*s matches null string many times",
5676 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
5680 if (RExC_parse < RExC_end && *RExC_parse == '?') {
5681 nextchar(pRExC_state);
5682 reginsert(pRExC_state, MINMOD, ret, depth+1);
5683 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
5685 #ifndef REG_ALLOW_MINMOD_SUSPEND
5688 if (RExC_parse < RExC_end && *RExC_parse == '+') {
5690 nextchar(pRExC_state);
5691 ender = reg_node(pRExC_state, SUCCEED);
5692 REGTAIL(pRExC_state, ret, ender);
5693 reginsert(pRExC_state, SUSPEND, ret, depth+1);
5695 ender = reg_node(pRExC_state, TAIL);
5696 REGTAIL(pRExC_state, ret, ender);
5700 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
5702 vFAIL("Nested quantifiers");
5709 /* reg_namedseq(pRExC_state,UVp)
5711 This is expected to be called by a parser routine that has
5712 recognized'\N' and needs to handle the rest. RExC_parse is
5713 expected to point at the first char following the N at the time
5716 If valuep is non-null then it is assumed that we are parsing inside
5717 of a charclass definition and the first codepoint in the resolved
5718 string is returned via *valuep and the routine will return NULL.
5719 In this mode if a multichar string is returned from the charnames
5720 handler a warning will be issued, and only the first char in the
5721 sequence will be examined. If the string returned is zero length
5722 then the value of *valuep is undefined and NON-NULL will
5723 be returned to indicate failure. (This will NOT be a valid pointer
5726 If value is null then it is assumed that we are parsing normal text
5727 and inserts a new EXACT node into the program containing the resolved
5728 string and returns a pointer to the new node. If the string is
5729 zerolength a NOTHING node is emitted.
5731 On success RExC_parse is set to the char following the endbrace.
5732 Parsing failures will generate a fatal errorvia vFAIL(...)
5734 NOTE: We cache all results from the charnames handler locally in
5735 the RExC_charnames hash (created on first use) to prevent a charnames
5736 handler from playing silly-buggers and returning a short string and
5737 then a long string for a given pattern. Since the regexp program
5738 size is calculated during an initial parse this would result
5739 in a buffer overrun so we cache to prevent the charname result from
5740 changing during the course of the parse.
5744 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep)
5746 char * name; /* start of the content of the name */
5747 char * endbrace; /* endbrace following the name */
5750 STRLEN len; /* this has various purposes throughout the code */
5751 bool cached = 0; /* if this is true then we shouldn't refcount dev sv_str */
5752 regnode *ret = NULL;
5754 if (*RExC_parse != '{') {
5755 vFAIL("Missing braces on \\N{}");
5757 name = RExC_parse+1;
5758 endbrace = strchr(RExC_parse, '}');
5761 vFAIL("Missing right brace on \\N{}");
5763 RExC_parse = endbrace + 1;
5766 /* RExC_parse points at the beginning brace,
5767 endbrace points at the last */
5768 if ( name[0]=='U' && name[1]=='+' ) {
5769 /* its a "unicode hex" notation {U+89AB} */
5770 I32 fl = PERL_SCAN_ALLOW_UNDERSCORES
5771 | PERL_SCAN_DISALLOW_PREFIX
5772 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
5774 len = (STRLEN)(endbrace - name - 2);
5775 cp = grok_hex(name + 2, &len, &fl, NULL);
5776 if ( len != (STRLEN)(endbrace - name - 2) ) {
5785 sv_str= Perl_newSVpvf_nocontext("%c",(int)cp);
5787 /* fetch the charnames handler for this scope */
5788 HV * const table = GvHV(PL_hintgv);
5790 hv_fetchs(table, "charnames", FALSE) :
5792 SV *cv= cvp ? *cvp : NULL;
5795 /* create an SV with the name as argument */
5796 sv_name = newSVpvn(name, endbrace - name);
5798 if (!table || !(PL_hints & HINT_LOCALIZE_HH)) {
5799 vFAIL2("Constant(\\N{%s}) unknown: "
5800 "(possibly a missing \"use charnames ...\")",
5803 if (!cvp || !SvOK(*cvp)) { /* when $^H{charnames} = undef; */
5804 vFAIL2("Constant(\\N{%s}): "
5805 "$^H{charnames} is not defined",SvPVX(sv_name));
5810 if (!RExC_charnames) {
5811 /* make sure our cache is allocated */
5812 RExC_charnames = newHV();
5813 sv_2mortal((SV*)RExC_charnames);
5815 /* see if we have looked this one up before */
5816 he_str = hv_fetch_ent( RExC_charnames, sv_name, 0, 0 );
5818 sv_str = HeVAL(he_str);
5831 count= call_sv(cv, G_SCALAR);
5833 if (count == 1) { /* XXXX is this right? dmq */
5835 SvREFCNT_inc_simple_void(sv_str);
5843 if ( !sv_str || !SvOK(sv_str) ) {
5844 vFAIL2("Constant(\\N{%s}): Call to &{$^H{charnames}} "
5845 "did not return a defined value",SvPVX(sv_name));
5847 if (hv_store_ent( RExC_charnames, sv_name, sv_str, 0))
5852 char *p = SvPV(sv_str, len);
5855 if ( SvUTF8(sv_str) ) {
5856 *valuep = utf8_to_uvchr((U8*)p, &numlen);
5860 We have to turn on utf8 for high bit chars otherwise
5861 we get failures with
5863 "ss" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
5864 "SS" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
5866 This is different from what \x{} would do with the same
5867 codepoint, where the condition is > 0xFF.
5874 /* warn if we havent used the whole string? */
5876 if (numlen<len && SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5878 "Ignoring excess chars from \\N{%s} in character class",
5882 } else if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5884 "Ignoring zero length \\N{%s} in character class",
5889 SvREFCNT_dec(sv_name);
5891 SvREFCNT_dec(sv_str);
5892 return len ? NULL : (regnode *)&len;
5893 } else if(SvCUR(sv_str)) {
5898 char * parse_start = name-3; /* needed for the offsets */
5899 GET_RE_DEBUG_FLAGS_DECL; /* needed for the offsets */
5901 ret = reg_node(pRExC_state,
5902 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
5905 if ( RExC_utf8 && !SvUTF8(sv_str) ) {
5906 sv_utf8_upgrade(sv_str);
5907 } else if ( !RExC_utf8 && SvUTF8(sv_str) ) {
5911 p = SvPV(sv_str, len);
5913 /* len is the length written, charlen is the size the char read */
5914 for ( len = 0; p < pend; p += charlen ) {
5916 UV uvc = utf8_to_uvchr((U8*)p, &charlen);
5918 STRLEN foldlen,numlen;
5919 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
5920 uvc = toFOLD_uni(uvc, tmpbuf, &foldlen);
5921 /* Emit all the Unicode characters. */
5923 for (foldbuf = tmpbuf;
5927 uvc = utf8_to_uvchr(foldbuf, &numlen);
5929 const STRLEN unilen = reguni(pRExC_state, uvc, s);
5932 /* In EBCDIC the numlen
5933 * and unilen can differ. */
5935 if (numlen >= foldlen)
5939 break; /* "Can't happen." */
5942 const STRLEN unilen = reguni(pRExC_state, uvc, s);
5954 RExC_size += STR_SZ(len);
5957 RExC_emit += STR_SZ(len);
5959 Set_Node_Cur_Length(ret); /* MJD */
5961 nextchar(pRExC_state);
5963 ret = reg_node(pRExC_state,NOTHING);
5966 SvREFCNT_dec(sv_str);
5969 SvREFCNT_dec(sv_name);
5979 * It returns the code point in utf8 for the value in *encp.
5980 * value: a code value in the source encoding
5981 * encp: a pointer to an Encode object
5983 * If the result from Encode is not a single character,
5984 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
5987 S_reg_recode(pTHX_ const char value, SV **encp)
5990 SV * const sv = sv_2mortal(newSVpvn(&value, numlen));
5991 const char * const s = encp && *encp ? sv_recode_to_utf8(sv, *encp)
5993 const STRLEN newlen = SvCUR(sv);
5994 UV uv = UNICODE_REPLACEMENT;
5998 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
6001 if (!newlen || numlen != newlen) {
6002 uv = UNICODE_REPLACEMENT;
6011 - regatom - the lowest level
6013 * Optimization: gobbles an entire sequence of ordinary characters so that
6014 * it can turn them into a single node, which is smaller to store and
6015 * faster to run. Backslashed characters are exceptions, each becoming a
6016 * separate node; the code is simpler that way and it's not worth fixing.
6018 * [Yes, it is worth fixing, some scripts can run twice the speed.]
6019 * [It looks like its ok, as in S_study_chunk we merge adjacent EXACT nodes]
6022 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6025 register regnode *ret = NULL;
6027 char *parse_start = RExC_parse;
6028 GET_RE_DEBUG_FLAGS_DECL;
6029 DEBUG_PARSE("atom");
6030 *flagp = WORST; /* Tentatively. */
6033 switch (*RExC_parse) {
6035 RExC_seen_zerolen++;
6036 nextchar(pRExC_state);
6037 if (RExC_flags & PMf_MULTILINE)
6038 ret = reg_node(pRExC_state, MBOL);
6039 else if (RExC_flags & PMf_SINGLELINE)
6040 ret = reg_node(pRExC_state, SBOL);
6042 ret = reg_node(pRExC_state, BOL);
6043 Set_Node_Length(ret, 1); /* MJD */
6046 nextchar(pRExC_state);
6048 RExC_seen_zerolen++;
6049 if (RExC_flags & PMf_MULTILINE)
6050 ret = reg_node(pRExC_state, MEOL);
6051 else if (RExC_flags & PMf_SINGLELINE)
6052 ret = reg_node(pRExC_state, SEOL);
6054 ret = reg_node(pRExC_state, EOL);
6055 Set_Node_Length(ret, 1); /* MJD */
6058 nextchar(pRExC_state);
6059 if (RExC_flags & PMf_SINGLELINE)
6060 ret = reg_node(pRExC_state, SANY);
6062 ret = reg_node(pRExC_state, REG_ANY);
6063 *flagp |= HASWIDTH|SIMPLE;
6065 Set_Node_Length(ret, 1); /* MJD */
6069 char * const oregcomp_parse = ++RExC_parse;
6070 ret = regclass(pRExC_state,depth+1);
6071 if (*RExC_parse != ']') {
6072 RExC_parse = oregcomp_parse;
6073 vFAIL("Unmatched [");
6075 nextchar(pRExC_state);
6076 *flagp |= HASWIDTH|SIMPLE;
6077 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
6081 nextchar(pRExC_state);
6082 ret = reg(pRExC_state, 1, &flags,depth+1);
6084 if (flags & TRYAGAIN) {
6085 if (RExC_parse == RExC_end) {
6086 /* Make parent create an empty node if needed. */
6094 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
6098 if (flags & TRYAGAIN) {
6102 vFAIL("Internal urp");
6103 /* Supposed to be caught earlier. */
6106 if (!regcurly(RExC_parse)) {
6115 vFAIL("Quantifier follows nothing");
6118 switch (*++RExC_parse) {
6120 RExC_seen_zerolen++;
6121 ret = reg_node(pRExC_state, SBOL);
6123 nextchar(pRExC_state);
6124 Set_Node_Length(ret, 2); /* MJD */
6127 ret = reg_node(pRExC_state, GPOS);
6128 RExC_seen |= REG_SEEN_GPOS;
6130 nextchar(pRExC_state);
6131 Set_Node_Length(ret, 2); /* MJD */
6134 ret = reg_node(pRExC_state, SEOL);
6136 RExC_seen_zerolen++; /* Do not optimize RE away */
6137 nextchar(pRExC_state);
6140 ret = reg_node(pRExC_state, EOS);
6142 RExC_seen_zerolen++; /* Do not optimize RE away */
6143 nextchar(pRExC_state);
6144 Set_Node_Length(ret, 2); /* MJD */
6147 ret = reg_node(pRExC_state, CANY);
6148 RExC_seen |= REG_SEEN_CANY;
6149 *flagp |= HASWIDTH|SIMPLE;
6150 nextchar(pRExC_state);
6151 Set_Node_Length(ret, 2); /* MJD */
6154 ret = reg_node(pRExC_state, CLUMP);
6156 nextchar(pRExC_state);
6157 Set_Node_Length(ret, 2); /* MJD */
6160 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
6161 *flagp |= HASWIDTH|SIMPLE;
6162 nextchar(pRExC_state);
6163 Set_Node_Length(ret, 2); /* MJD */
6166 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
6167 *flagp |= HASWIDTH|SIMPLE;
6168 nextchar(pRExC_state);
6169 Set_Node_Length(ret, 2); /* MJD */
6172 RExC_seen_zerolen++;
6173 RExC_seen |= REG_SEEN_LOOKBEHIND;
6174 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
6176 nextchar(pRExC_state);
6177 Set_Node_Length(ret, 2); /* MJD */
6180 RExC_seen_zerolen++;
6181 RExC_seen |= REG_SEEN_LOOKBEHIND;
6182 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
6184 nextchar(pRExC_state);
6185 Set_Node_Length(ret, 2); /* MJD */
6188 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
6189 *flagp |= HASWIDTH|SIMPLE;
6190 nextchar(pRExC_state);
6191 Set_Node_Length(ret, 2); /* MJD */
6194 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
6195 *flagp |= HASWIDTH|SIMPLE;
6196 nextchar(pRExC_state);
6197 Set_Node_Length(ret, 2); /* MJD */
6200 ret = reg_node(pRExC_state, DIGIT);
6201 *flagp |= HASWIDTH|SIMPLE;
6202 nextchar(pRExC_state);
6203 Set_Node_Length(ret, 2); /* MJD */
6206 ret = reg_node(pRExC_state, NDIGIT);
6207 *flagp |= HASWIDTH|SIMPLE;
6208 nextchar(pRExC_state);
6209 Set_Node_Length(ret, 2); /* MJD */
6214 char* const oldregxend = RExC_end;
6215 char* parse_start = RExC_parse - 2;
6217 if (RExC_parse[1] == '{') {
6218 /* a lovely hack--pretend we saw [\pX] instead */
6219 RExC_end = strchr(RExC_parse, '}');
6221 const U8 c = (U8)*RExC_parse;
6223 RExC_end = oldregxend;
6224 vFAIL2("Missing right brace on \\%c{}", c);
6229 RExC_end = RExC_parse + 2;
6230 if (RExC_end > oldregxend)
6231 RExC_end = oldregxend;
6235 ret = regclass(pRExC_state,depth+1);
6237 RExC_end = oldregxend;
6240 Set_Node_Offset(ret, parse_start + 2);
6241 Set_Node_Cur_Length(ret);
6242 nextchar(pRExC_state);
6243 *flagp |= HASWIDTH|SIMPLE;
6247 /* Handle \N{NAME} here and not below because it can be
6248 multicharacter. join_exact() will join them up later on.
6249 Also this makes sure that things like /\N{BLAH}+/ and
6250 \N{BLAH} being multi char Just Happen. dmq*/
6252 ret= reg_namedseq(pRExC_state, NULL);
6254 case 'k': /* Handle \k<NAME> and \k'NAME' */
6256 char ch= RExC_parse[1];
6257 if (ch != '<' && ch != '\'') {
6259 vWARN( RExC_parse + 1,
6260 "Possible broken named back reference treated as literal k");
6264 char* name_start = (RExC_parse += 2);
6266 SV *sv_dat = reg_scan_name(pRExC_state,
6267 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6268 ch= (ch == '<') ? '>' : '\'';
6270 if (RExC_parse == name_start || *RExC_parse != ch)
6271 vFAIL2("Sequence \\k%c... not terminated",
6272 (ch == '>' ? '<' : ch));
6275 ret = reganode(pRExC_state,
6276 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
6282 num = add_data( pRExC_state, 1, "S" );
6284 RExC_rx->data->data[num]=(void*)sv_dat;
6285 SvREFCNT_inc(sv_dat);
6287 /* override incorrect value set in reganode MJD */
6288 Set_Node_Offset(ret, parse_start+1);
6289 Set_Node_Cur_Length(ret); /* MJD */
6290 nextchar(pRExC_state);
6306 case '1': case '2': case '3': case '4':
6307 case '5': case '6': case '7': case '8': case '9':
6310 bool isrel=(*RExC_parse=='R');
6313 num = atoi(RExC_parse);
6315 num = RExC_cpar - num;
6317 vFAIL("Reference to nonexistent or unclosed group");
6319 if (num > 9 && num >= RExC_npar)
6322 char * const parse_start = RExC_parse - 1; /* MJD */
6323 while (isDIGIT(*RExC_parse))
6327 if (num > (I32)RExC_rx->nparens)
6328 vFAIL("Reference to nonexistent group");
6329 /* People make this error all the time apparently.
6330 So we cant fail on it, even though we should
6332 else if (num >= RExC_cpar)
6333 vFAIL("Reference to unclosed group will always match");
6337 ret = reganode(pRExC_state,
6338 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
6342 /* override incorrect value set in reganode MJD */
6343 Set_Node_Offset(ret, parse_start+1);
6344 Set_Node_Cur_Length(ret); /* MJD */
6346 nextchar(pRExC_state);
6351 if (RExC_parse >= RExC_end)
6352 FAIL("Trailing \\");
6355 /* Do not generate "unrecognized" warnings here, we fall
6356 back into the quick-grab loop below */
6363 if (RExC_flags & PMf_EXTENDED) {
6364 while (RExC_parse < RExC_end && *RExC_parse != '\n')
6366 if (RExC_parse < RExC_end)
6372 register STRLEN len;
6377 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
6379 parse_start = RExC_parse - 1;
6385 ret = reg_node(pRExC_state,
6386 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
6388 for (len = 0, p = RExC_parse - 1;
6389 len < 127 && p < RExC_end;
6392 char * const oldp = p;
6394 if (RExC_flags & PMf_EXTENDED)
6395 p = regwhite(p, RExC_end);
6444 ender = ASCII_TO_NATIVE('\033');
6448 ender = ASCII_TO_NATIVE('\007');
6453 char* const e = strchr(p, '}');
6457 vFAIL("Missing right brace on \\x{}");
6460 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6461 | PERL_SCAN_DISALLOW_PREFIX;
6462 STRLEN numlen = e - p - 1;
6463 ender = grok_hex(p + 1, &numlen, &flags, NULL);
6470 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
6472 ender = grok_hex(p, &numlen, &flags, NULL);
6475 if (PL_encoding && ender < 0x100)
6476 goto recode_encoding;
6480 ender = UCHARAT(p++);
6481 ender = toCTRL(ender);
6483 case '0': case '1': case '2': case '3':case '4':
6484 case '5': case '6': case '7': case '8':case '9':
6486 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
6489 ender = grok_oct(p, &numlen, &flags, NULL);
6496 if (PL_encoding && ender < 0x100)
6497 goto recode_encoding;
6501 SV* enc = PL_encoding;
6502 ender = reg_recode((const char)(U8)ender, &enc);
6503 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
6504 vWARN(p, "Invalid escape in the specified encoding");
6510 FAIL("Trailing \\");
6513 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
6514 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
6515 goto normal_default;
6520 if (UTF8_IS_START(*p) && UTF) {
6522 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
6523 &numlen, UTF8_ALLOW_DEFAULT);
6530 if (RExC_flags & PMf_EXTENDED)
6531 p = regwhite(p, RExC_end);
6533 /* Prime the casefolded buffer. */
6534 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
6536 if (ISMULT2(p)) { /* Back off on ?+*. */
6541 /* Emit all the Unicode characters. */
6543 for (foldbuf = tmpbuf;
6545 foldlen -= numlen) {
6546 ender = utf8_to_uvchr(foldbuf, &numlen);
6548 const STRLEN unilen = reguni(pRExC_state, ender, s);
6551 /* In EBCDIC the numlen
6552 * and unilen can differ. */
6554 if (numlen >= foldlen)
6558 break; /* "Can't happen." */
6562 const STRLEN unilen = reguni(pRExC_state, ender, s);
6571 REGC((char)ender, s++);
6577 /* Emit all the Unicode characters. */
6579 for (foldbuf = tmpbuf;
6581 foldlen -= numlen) {
6582 ender = utf8_to_uvchr(foldbuf, &numlen);
6584 const STRLEN unilen = reguni(pRExC_state, ender, s);
6587 /* In EBCDIC the numlen
6588 * and unilen can differ. */
6590 if (numlen >= foldlen)
6598 const STRLEN unilen = reguni(pRExC_state, ender, s);
6607 REGC((char)ender, s++);
6611 Set_Node_Cur_Length(ret); /* MJD */
6612 nextchar(pRExC_state);
6614 /* len is STRLEN which is unsigned, need to copy to signed */
6617 vFAIL("Internal disaster");
6621 if (len == 1 && UNI_IS_INVARIANT(ender))
6625 RExC_size += STR_SZ(len);
6628 RExC_emit += STR_SZ(len);
6638 S_regwhite(char *p, const char *e)
6643 else if (*p == '#') {
6646 } while (p < e && *p != '\n');
6654 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
6655 Character classes ([:foo:]) can also be negated ([:^foo:]).
6656 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
6657 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
6658 but trigger failures because they are currently unimplemented. */
6660 #define POSIXCC_DONE(c) ((c) == ':')
6661 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
6662 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
6665 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
6668 I32 namedclass = OOB_NAMEDCLASS;
6670 if (value == '[' && RExC_parse + 1 < RExC_end &&
6671 /* I smell either [: or [= or [. -- POSIX has been here, right? */
6672 POSIXCC(UCHARAT(RExC_parse))) {
6673 const char c = UCHARAT(RExC_parse);
6674 char* const s = RExC_parse++;
6676 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
6678 if (RExC_parse == RExC_end)
6679 /* Grandfather lone [:, [=, [. */
6682 const char* const t = RExC_parse++; /* skip over the c */
6685 if (UCHARAT(RExC_parse) == ']') {
6686 const char *posixcc = s + 1;
6687 RExC_parse++; /* skip over the ending ] */
6690 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
6691 const I32 skip = t - posixcc;
6693 /* Initially switch on the length of the name. */
6696 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
6697 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
6700 /* Names all of length 5. */
6701 /* alnum alpha ascii blank cntrl digit graph lower
6702 print punct space upper */
6703 /* Offset 4 gives the best switch position. */
6704 switch (posixcc[4]) {
6706 if (memEQ(posixcc, "alph", 4)) /* alpha */
6707 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
6710 if (memEQ(posixcc, "spac", 4)) /* space */
6711 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
6714 if (memEQ(posixcc, "grap", 4)) /* graph */
6715 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
6718 if (memEQ(posixcc, "asci", 4)) /* ascii */
6719 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
6722 if (memEQ(posixcc, "blan", 4)) /* blank */
6723 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
6726 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
6727 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
6730 if (memEQ(posixcc, "alnu", 4)) /* alnum */
6731 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
6734 if (memEQ(posixcc, "lowe", 4)) /* lower */
6735 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
6736 else if (memEQ(posixcc, "uppe", 4)) /* upper */
6737 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
6740 if (memEQ(posixcc, "digi", 4)) /* digit */
6741 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
6742 else if (memEQ(posixcc, "prin", 4)) /* print */
6743 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
6744 else if (memEQ(posixcc, "punc", 4)) /* punct */
6745 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
6750 if (memEQ(posixcc, "xdigit", 6))
6751 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
6755 if (namedclass == OOB_NAMEDCLASS)
6756 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
6758 assert (posixcc[skip] == ':');
6759 assert (posixcc[skip+1] == ']');
6760 } else if (!SIZE_ONLY) {
6761 /* [[=foo=]] and [[.foo.]] are still future. */
6763 /* adjust RExC_parse so the warning shows after
6765 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
6767 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
6770 /* Maternal grandfather:
6771 * "[:" ending in ":" but not in ":]" */
6781 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
6784 if (POSIXCC(UCHARAT(RExC_parse))) {
6785 const char *s = RExC_parse;
6786 const char c = *s++;
6790 if (*s && c == *s && s[1] == ']') {
6791 if (ckWARN(WARN_REGEXP))
6793 "POSIX syntax [%c %c] belongs inside character classes",
6796 /* [[=foo=]] and [[.foo.]] are still future. */
6797 if (POSIXCC_NOTYET(c)) {
6798 /* adjust RExC_parse so the error shows after
6800 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
6802 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
6810 parse a class specification and produce either an ANYOF node that
6811 matches the pattern. If the pattern matches a single char only and
6812 that char is < 256 then we produce an EXACT node instead.
6815 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
6818 register UV value = 0;
6819 register UV nextvalue;
6820 register IV prevvalue = OOB_UNICODE;
6821 register IV range = 0;
6822 register regnode *ret;
6825 char *rangebegin = NULL;
6826 bool need_class = 0;
6829 bool optimize_invert = TRUE;
6830 AV* unicode_alternate = NULL;
6832 UV literal_endpoint = 0;
6834 UV stored = 0; /* number of chars stored in the class */
6836 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
6837 case we need to change the emitted regop to an EXACT. */
6838 const char * orig_parse = RExC_parse;
6839 GET_RE_DEBUG_FLAGS_DECL;
6841 PERL_UNUSED_ARG(depth);
6844 DEBUG_PARSE("clas");
6846 /* Assume we are going to generate an ANYOF node. */
6847 ret = reganode(pRExC_state, ANYOF, 0);
6850 ANYOF_FLAGS(ret) = 0;
6852 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
6856 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
6860 RExC_size += ANYOF_SKIP;
6861 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
6864 RExC_emit += ANYOF_SKIP;
6866 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
6868 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
6869 ANYOF_BITMAP_ZERO(ret);
6870 listsv = newSVpvs("# comment\n");
6873 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
6875 if (!SIZE_ONLY && POSIXCC(nextvalue))
6876 checkposixcc(pRExC_state);
6878 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
6879 if (UCHARAT(RExC_parse) == ']')
6883 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
6887 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
6890 rangebegin = RExC_parse;
6892 value = utf8n_to_uvchr((U8*)RExC_parse,
6893 RExC_end - RExC_parse,
6894 &numlen, UTF8_ALLOW_DEFAULT);
6895 RExC_parse += numlen;
6898 value = UCHARAT(RExC_parse++);
6900 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
6901 if (value == '[' && POSIXCC(nextvalue))
6902 namedclass = regpposixcc(pRExC_state, value);
6903 else if (value == '\\') {
6905 value = utf8n_to_uvchr((U8*)RExC_parse,
6906 RExC_end - RExC_parse,
6907 &numlen, UTF8_ALLOW_DEFAULT);
6908 RExC_parse += numlen;
6911 value = UCHARAT(RExC_parse++);
6912 /* Some compilers cannot handle switching on 64-bit integer
6913 * values, therefore value cannot be an UV. Yes, this will
6914 * be a problem later if we want switch on Unicode.
6915 * A similar issue a little bit later when switching on
6916 * namedclass. --jhi */
6917 switch ((I32)value) {
6918 case 'w': namedclass = ANYOF_ALNUM; break;
6919 case 'W': namedclass = ANYOF_NALNUM; break;
6920 case 's': namedclass = ANYOF_SPACE; break;
6921 case 'S': namedclass = ANYOF_NSPACE; break;
6922 case 'd': namedclass = ANYOF_DIGIT; break;
6923 case 'D': namedclass = ANYOF_NDIGIT; break;
6924 case 'N': /* Handle \N{NAME} in class */
6926 /* We only pay attention to the first char of
6927 multichar strings being returned. I kinda wonder
6928 if this makes sense as it does change the behaviour
6929 from earlier versions, OTOH that behaviour was broken
6931 UV v; /* value is register so we cant & it /grrr */
6932 if (reg_namedseq(pRExC_state, &v)) {
6942 if (RExC_parse >= RExC_end)
6943 vFAIL2("Empty \\%c{}", (U8)value);
6944 if (*RExC_parse == '{') {
6945 const U8 c = (U8)value;
6946 e = strchr(RExC_parse++, '}');
6948 vFAIL2("Missing right brace on \\%c{}", c);
6949 while (isSPACE(UCHARAT(RExC_parse)))
6951 if (e == RExC_parse)
6952 vFAIL2("Empty \\%c{}", c);
6954 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
6962 if (UCHARAT(RExC_parse) == '^') {
6965 value = value == 'p' ? 'P' : 'p'; /* toggle */
6966 while (isSPACE(UCHARAT(RExC_parse))) {
6971 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
6972 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
6975 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
6976 namedclass = ANYOF_MAX; /* no official name, but it's named */
6979 case 'n': value = '\n'; break;
6980 case 'r': value = '\r'; break;
6981 case 't': value = '\t'; break;
6982 case 'f': value = '\f'; break;
6983 case 'b': value = '\b'; break;
6984 case 'e': value = ASCII_TO_NATIVE('\033');break;
6985 case 'a': value = ASCII_TO_NATIVE('\007');break;
6987 if (*RExC_parse == '{') {
6988 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6989 | PERL_SCAN_DISALLOW_PREFIX;
6990 char * const e = strchr(RExC_parse++, '}');
6992 vFAIL("Missing right brace on \\x{}");
6994 numlen = e - RExC_parse;
6995 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
6999 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
7001 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
7002 RExC_parse += numlen;
7004 if (PL_encoding && value < 0x100)
7005 goto recode_encoding;
7008 value = UCHARAT(RExC_parse++);
7009 value = toCTRL(value);
7011 case '0': case '1': case '2': case '3': case '4':
7012 case '5': case '6': case '7': case '8': case '9':
7016 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
7017 RExC_parse += numlen;
7018 if (PL_encoding && value < 0x100)
7019 goto recode_encoding;
7024 SV* enc = PL_encoding;
7025 value = reg_recode((const char)(U8)value, &enc);
7026 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
7028 "Invalid escape in the specified encoding");
7032 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
7034 "Unrecognized escape \\%c in character class passed through",
7038 } /* end of \blah */
7044 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
7046 if (!SIZE_ONLY && !need_class)
7047 ANYOF_CLASS_ZERO(ret);
7051 /* a bad range like a-\d, a-[:digit:] ? */
7054 if (ckWARN(WARN_REGEXP)) {
7056 RExC_parse >= rangebegin ?
7057 RExC_parse - rangebegin : 0;
7059 "False [] range \"%*.*s\"",
7062 if (prevvalue < 256) {
7063 ANYOF_BITMAP_SET(ret, prevvalue);
7064 ANYOF_BITMAP_SET(ret, '-');
7067 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
7068 Perl_sv_catpvf(aTHX_ listsv,
7069 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
7073 range = 0; /* this was not a true range */
7077 const char *what = NULL;
7080 if (namedclass > OOB_NAMEDCLASS)
7081 optimize_invert = FALSE;
7082 /* Possible truncation here but in some 64-bit environments
7083 * the compiler gets heartburn about switch on 64-bit values.
7084 * A similar issue a little earlier when switching on value.
7086 switch ((I32)namedclass) {
7089 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
7091 for (value = 0; value < 256; value++)
7093 ANYOF_BITMAP_SET(ret, value);
7100 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
7102 for (value = 0; value < 256; value++)
7103 if (!isALNUM(value))
7104 ANYOF_BITMAP_SET(ret, value);
7111 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
7113 for (value = 0; value < 256; value++)
7114 if (isALNUMC(value))
7115 ANYOF_BITMAP_SET(ret, value);
7122 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
7124 for (value = 0; value < 256; value++)
7125 if (!isALNUMC(value))
7126 ANYOF_BITMAP_SET(ret, value);
7133 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
7135 for (value = 0; value < 256; value++)
7137 ANYOF_BITMAP_SET(ret, value);
7144 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
7146 for (value = 0; value < 256; value++)
7147 if (!isALPHA(value))
7148 ANYOF_BITMAP_SET(ret, value);
7155 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
7158 for (value = 0; value < 128; value++)
7159 ANYOF_BITMAP_SET(ret, value);
7161 for (value = 0; value < 256; value++) {
7163 ANYOF_BITMAP_SET(ret, value);
7172 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
7175 for (value = 128; value < 256; value++)
7176 ANYOF_BITMAP_SET(ret, value);
7178 for (value = 0; value < 256; value++) {
7179 if (!isASCII(value))
7180 ANYOF_BITMAP_SET(ret, value);
7189 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
7191 for (value = 0; value < 256; value++)
7193 ANYOF_BITMAP_SET(ret, value);
7200 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
7202 for (value = 0; value < 256; value++)
7203 if (!isBLANK(value))
7204 ANYOF_BITMAP_SET(ret, value);
7211 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
7213 for (value = 0; value < 256; value++)
7215 ANYOF_BITMAP_SET(ret, value);
7222 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
7224 for (value = 0; value < 256; value++)
7225 if (!isCNTRL(value))
7226 ANYOF_BITMAP_SET(ret, value);
7233 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
7235 /* consecutive digits assumed */
7236 for (value = '0'; value <= '9'; value++)
7237 ANYOF_BITMAP_SET(ret, value);
7244 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
7246 /* consecutive digits assumed */
7247 for (value = 0; value < '0'; value++)
7248 ANYOF_BITMAP_SET(ret, value);
7249 for (value = '9' + 1; value < 256; value++)
7250 ANYOF_BITMAP_SET(ret, value);
7257 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
7259 for (value = 0; value < 256; value++)
7261 ANYOF_BITMAP_SET(ret, value);
7268 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
7270 for (value = 0; value < 256; value++)
7271 if (!isGRAPH(value))
7272 ANYOF_BITMAP_SET(ret, value);
7279 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
7281 for (value = 0; value < 256; value++)
7283 ANYOF_BITMAP_SET(ret, value);
7290 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
7292 for (value = 0; value < 256; value++)
7293 if (!isLOWER(value))
7294 ANYOF_BITMAP_SET(ret, value);
7301 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
7303 for (value = 0; value < 256; value++)
7305 ANYOF_BITMAP_SET(ret, value);
7312 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
7314 for (value = 0; value < 256; value++)
7315 if (!isPRINT(value))
7316 ANYOF_BITMAP_SET(ret, value);
7323 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
7325 for (value = 0; value < 256; value++)
7326 if (isPSXSPC(value))
7327 ANYOF_BITMAP_SET(ret, value);
7334 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
7336 for (value = 0; value < 256; value++)
7337 if (!isPSXSPC(value))
7338 ANYOF_BITMAP_SET(ret, value);
7345 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
7347 for (value = 0; value < 256; value++)
7349 ANYOF_BITMAP_SET(ret, value);
7356 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
7358 for (value = 0; value < 256; value++)
7359 if (!isPUNCT(value))
7360 ANYOF_BITMAP_SET(ret, value);
7367 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
7369 for (value = 0; value < 256; value++)
7371 ANYOF_BITMAP_SET(ret, value);
7378 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
7380 for (value = 0; value < 256; value++)
7381 if (!isSPACE(value))
7382 ANYOF_BITMAP_SET(ret, value);
7389 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
7391 for (value = 0; value < 256; value++)
7393 ANYOF_BITMAP_SET(ret, value);
7400 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
7402 for (value = 0; value < 256; value++)
7403 if (!isUPPER(value))
7404 ANYOF_BITMAP_SET(ret, value);
7411 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
7413 for (value = 0; value < 256; value++)
7414 if (isXDIGIT(value))
7415 ANYOF_BITMAP_SET(ret, value);
7422 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
7424 for (value = 0; value < 256; value++)
7425 if (!isXDIGIT(value))
7426 ANYOF_BITMAP_SET(ret, value);
7432 /* this is to handle \p and \P */
7435 vFAIL("Invalid [::] class");
7439 /* Strings such as "+utf8::isWord\n" */
7440 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
7443 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
7446 } /* end of namedclass \blah */
7449 if (prevvalue > (IV)value) /* b-a */ {
7450 const int w = RExC_parse - rangebegin;
7451 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
7452 range = 0; /* not a valid range */
7456 prevvalue = value; /* save the beginning of the range */
7457 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
7458 RExC_parse[1] != ']') {
7461 /* a bad range like \w-, [:word:]- ? */
7462 if (namedclass > OOB_NAMEDCLASS) {
7463 if (ckWARN(WARN_REGEXP)) {
7465 RExC_parse >= rangebegin ?
7466 RExC_parse - rangebegin : 0;
7468 "False [] range \"%*.*s\"",
7472 ANYOF_BITMAP_SET(ret, '-');
7474 range = 1; /* yeah, it's a range! */
7475 continue; /* but do it the next time */
7479 /* now is the next time */
7480 /*stored += (value - prevvalue + 1);*/
7482 if (prevvalue < 256) {
7483 const IV ceilvalue = value < 256 ? value : 255;
7486 /* In EBCDIC [\x89-\x91] should include
7487 * the \x8e but [i-j] should not. */
7488 if (literal_endpoint == 2 &&
7489 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
7490 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
7492 if (isLOWER(prevvalue)) {
7493 for (i = prevvalue; i <= ceilvalue; i++)
7495 ANYOF_BITMAP_SET(ret, i);
7497 for (i = prevvalue; i <= ceilvalue; i++)
7499 ANYOF_BITMAP_SET(ret, i);
7504 for (i = prevvalue; i <= ceilvalue; i++) {
7505 if (!ANYOF_BITMAP_TEST(ret,i)) {
7507 ANYOF_BITMAP_SET(ret, i);
7511 if (value > 255 || UTF) {
7512 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
7513 const UV natvalue = NATIVE_TO_UNI(value);
7514 stored+=2; /* can't optimize this class */
7515 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
7516 if (prevnatvalue < natvalue) { /* what about > ? */
7517 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
7518 prevnatvalue, natvalue);
7520 else if (prevnatvalue == natvalue) {
7521 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
7523 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
7525 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
7527 #ifdef EBCDIC /* RD t/uni/fold ff and 6b */
7528 if (RExC_precomp[0] == ':' &&
7529 RExC_precomp[1] == '[' &&
7530 (f == 0xDF || f == 0x92)) {
7531 f = NATIVE_TO_UNI(f);
7534 /* If folding and foldable and a single
7535 * character, insert also the folded version
7536 * to the charclass. */
7538 #ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
7539 if ((RExC_precomp[0] == ':' &&
7540 RExC_precomp[1] == '[' &&
7542 (value == 0xFB05 || value == 0xFB06))) ?
7543 foldlen == ((STRLEN)UNISKIP(f) - 1) :
7544 foldlen == (STRLEN)UNISKIP(f) )
7546 if (foldlen == (STRLEN)UNISKIP(f))
7548 Perl_sv_catpvf(aTHX_ listsv,
7551 /* Any multicharacter foldings
7552 * require the following transform:
7553 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
7554 * where E folds into "pq" and F folds
7555 * into "rst", all other characters
7556 * fold to single characters. We save
7557 * away these multicharacter foldings,
7558 * to be later saved as part of the
7559 * additional "s" data. */
7562 if (!unicode_alternate)
7563 unicode_alternate = newAV();
7564 sv = newSVpvn((char*)foldbuf, foldlen);
7566 av_push(unicode_alternate, sv);
7570 /* If folding and the value is one of the Greek
7571 * sigmas insert a few more sigmas to make the
7572 * folding rules of the sigmas to work right.
7573 * Note that not all the possible combinations
7574 * are handled here: some of them are handled
7575 * by the standard folding rules, and some of
7576 * them (literal or EXACTF cases) are handled
7577 * during runtime in regexec.c:S_find_byclass(). */
7578 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
7579 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
7580 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
7581 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
7582 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
7584 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
7585 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
7586 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
7591 literal_endpoint = 0;
7595 range = 0; /* this range (if it was one) is done now */
7599 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
7601 RExC_size += ANYOF_CLASS_ADD_SKIP;
7603 RExC_emit += ANYOF_CLASS_ADD_SKIP;
7609 /****** !SIZE_ONLY AFTER HERE *********/
7611 if( stored == 1 && value < 256
7612 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
7614 /* optimize single char class to an EXACT node
7615 but *only* when its not a UTF/high char */
7616 const char * cur_parse= RExC_parse;
7617 RExC_emit = (regnode *)orig_emit;
7618 RExC_parse = (char *)orig_parse;
7619 ret = reg_node(pRExC_state,
7620 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
7621 RExC_parse = (char *)cur_parse;
7622 *STRING(ret)= (char)value;
7624 RExC_emit += STR_SZ(1);
7627 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
7628 if ( /* If the only flag is folding (plus possibly inversion). */
7629 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
7631 for (value = 0; value < 256; ++value) {
7632 if (ANYOF_BITMAP_TEST(ret, value)) {
7633 UV fold = PL_fold[value];
7636 ANYOF_BITMAP_SET(ret, fold);
7639 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
7642 /* optimize inverted simple patterns (e.g. [^a-z]) */
7643 if (optimize_invert &&
7644 /* If the only flag is inversion. */
7645 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
7646 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
7647 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
7648 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
7651 AV * const av = newAV();
7653 /* The 0th element stores the character class description
7654 * in its textual form: used later (regexec.c:Perl_regclass_swash())
7655 * to initialize the appropriate swash (which gets stored in
7656 * the 1st element), and also useful for dumping the regnode.
7657 * The 2nd element stores the multicharacter foldings,
7658 * used later (regexec.c:S_reginclass()). */
7659 av_store(av, 0, listsv);
7660 av_store(av, 1, NULL);
7661 av_store(av, 2, (SV*)unicode_alternate);
7662 rv = newRV_noinc((SV*)av);
7663 n = add_data(pRExC_state, 1, "s");
7664 RExC_rx->data->data[n] = (void*)rv;
7671 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
7673 char* const retval = RExC_parse++;
7676 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
7677 RExC_parse[2] == '#') {
7678 while (*RExC_parse != ')') {
7679 if (RExC_parse == RExC_end)
7680 FAIL("Sequence (?#... not terminated");
7686 if (RExC_flags & PMf_EXTENDED) {
7687 if (isSPACE(*RExC_parse)) {
7691 else if (*RExC_parse == '#') {
7692 while (RExC_parse < RExC_end)
7693 if (*RExC_parse++ == '\n') break;
7702 - reg_node - emit a node
7704 STATIC regnode * /* Location. */
7705 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
7708 register regnode *ptr;
7709 regnode * const ret = RExC_emit;
7710 GET_RE_DEBUG_FLAGS_DECL;
7713 SIZE_ALIGN(RExC_size);
7718 if (OP(RExC_emit) == 255)
7719 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %s: %d ",
7720 reg_name[op], OP(RExC_emit));
7722 NODE_ALIGN_FILL(ret);
7724 FILL_ADVANCE_NODE(ptr, op);
7725 if (RExC_offsets) { /* MJD */
7726 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
7727 "reg_node", __LINE__,
7729 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
7730 ? "Overwriting end of array!\n" : "OK",
7731 (UV)(RExC_emit - RExC_emit_start),
7732 (UV)(RExC_parse - RExC_start),
7733 (UV)RExC_offsets[0]));
7734 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
7742 - reganode - emit a node with an argument
7744 STATIC regnode * /* Location. */
7745 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
7748 register regnode *ptr;
7749 regnode * const ret = RExC_emit;
7750 GET_RE_DEBUG_FLAGS_DECL;
7753 SIZE_ALIGN(RExC_size);
7758 assert(2==regarglen[op]+1);
7760 Anything larger than this has to allocate the extra amount.
7761 If we changed this to be:
7763 RExC_size += (1 + regarglen[op]);
7765 then it wouldn't matter. Its not clear what side effect
7766 might come from that so its not done so far.
7772 if (OP(RExC_emit) == 255)
7773 Perl_croak(aTHX_ "panic: reganode overwriting end of allocated program space");
7775 NODE_ALIGN_FILL(ret);
7777 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
7778 if (RExC_offsets) { /* MJD */
7779 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
7783 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
7784 "Overwriting end of array!\n" : "OK",
7785 (UV)(RExC_emit - RExC_emit_start),
7786 (UV)(RExC_parse - RExC_start),
7787 (UV)RExC_offsets[0]));
7788 Set_Cur_Node_Offset;
7796 - reguni - emit (if appropriate) a Unicode character
7799 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
7802 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
7806 - reginsert - insert an operator in front of already-emitted operand
7808 * Means relocating the operand.
7811 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
7814 register regnode *src;
7815 register regnode *dst;
7816 register regnode *place;
7817 const int offset = regarglen[(U8)op];
7818 const int size = NODE_STEP_REGNODE + offset;
7819 GET_RE_DEBUG_FLAGS_DECL;
7820 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
7821 DEBUG_PARSE_FMT("inst"," - %s",reg_name[op]);
7830 if (RExC_open_parens) {
7832 DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);
7833 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
7834 if ( RExC_open_parens[paren] >= opnd ) {
7835 DEBUG_PARSE_FMT("open"," - %d",size);
7836 RExC_open_parens[paren] += size;
7838 DEBUG_PARSE_FMT("open"," - %s","ok");
7840 if ( RExC_close_parens[paren] >= opnd ) {
7841 DEBUG_PARSE_FMT("close"," - %d",size);
7842 RExC_close_parens[paren] += size;
7844 DEBUG_PARSE_FMT("close"," - %s","ok");
7849 while (src > opnd) {
7850 StructCopy(--src, --dst, regnode);
7851 if (RExC_offsets) { /* MJD 20010112 */
7852 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
7856 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
7857 ? "Overwriting end of array!\n" : "OK",
7858 (UV)(src - RExC_emit_start),
7859 (UV)(dst - RExC_emit_start),
7860 (UV)RExC_offsets[0]));
7861 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
7862 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
7867 place = opnd; /* Op node, where operand used to be. */
7868 if (RExC_offsets) { /* MJD */
7869 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
7873 (UV)(place - RExC_emit_start) > RExC_offsets[0]
7874 ? "Overwriting end of array!\n" : "OK",
7875 (UV)(place - RExC_emit_start),
7876 (UV)(RExC_parse - RExC_start),
7877 (UV)RExC_offsets[0]));
7878 Set_Node_Offset(place, RExC_parse);
7879 Set_Node_Length(place, 1);
7881 src = NEXTOPER(place);
7882 FILL_ADVANCE_NODE(place, op);
7883 Zero(src, offset, regnode);
7887 - regtail - set the next-pointer at the end of a node chain of p to val.
7888 - SEE ALSO: regtail_study
7890 /* TODO: All three parms should be const */
7892 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
7895 register regnode *scan;
7896 GET_RE_DEBUG_FLAGS_DECL;
7898 PERL_UNUSED_ARG(depth);
7904 /* Find last node. */
7907 regnode * const temp = regnext(scan);
7909 SV * const mysv=sv_newmortal();
7910 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
7911 regprop(RExC_rx, mysv, scan);
7912 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
7913 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
7914 (temp == NULL ? "->" : ""),
7915 (temp == NULL ? reg_name[OP(val)] : "")
7923 if (reg_off_by_arg[OP(scan)]) {
7924 ARG_SET(scan, val - scan);
7927 NEXT_OFF(scan) = val - scan;
7933 - regtail_study - set the next-pointer at the end of a node chain of p to val.
7934 - Look for optimizable sequences at the same time.
7935 - currently only looks for EXACT chains.
7937 This is expermental code. The idea is to use this routine to perform
7938 in place optimizations on branches and groups as they are constructed,
7939 with the long term intention of removing optimization from study_chunk so
7940 that it is purely analytical.
7942 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
7943 to control which is which.
7946 /* TODO: All four parms should be const */
7949 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
7952 register regnode *scan;
7954 #ifdef EXPERIMENTAL_INPLACESCAN
7958 GET_RE_DEBUG_FLAGS_DECL;
7964 /* Find last node. */
7968 regnode * const temp = regnext(scan);
7969 #ifdef EXPERIMENTAL_INPLACESCAN
7970 if (PL_regkind[OP(scan)] == EXACT)
7971 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
7979 if( exact == PSEUDO )
7981 else if ( exact != OP(scan) )
7990 SV * const mysv=sv_newmortal();
7991 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
7992 regprop(RExC_rx, mysv, scan);
7993 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
7994 SvPV_nolen_const(mysv),
8003 SV * const mysv_val=sv_newmortal();
8004 DEBUG_PARSE_MSG("");
8005 regprop(RExC_rx, mysv_val, val);
8006 PerlIO_printf(Perl_debug_log, "~ attach to %s (%d) offset to %d\n",
8007 SvPV_nolen_const(mysv_val),
8012 if (reg_off_by_arg[OP(scan)]) {
8013 ARG_SET(scan, val - scan);
8016 NEXT_OFF(scan) = val - scan;
8024 - regcurly - a little FSA that accepts {\d+,?\d*}
8027 S_regcurly(register const char *s)
8046 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
8049 Perl_regdump(pTHX_ const regexp *r)
8053 SV * const sv = sv_newmortal();
8054 SV *dsv= sv_newmortal();
8056 (void)dumpuntil(r, r->program, r->program + 1, NULL, NULL, sv, 0, 0);
8058 /* Header fields of interest. */
8059 if (r->anchored_substr) {
8060 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
8061 RE_SV_DUMPLEN(r->anchored_substr), 30);
8062 PerlIO_printf(Perl_debug_log,
8063 "anchored %s%s at %"IVdf" ",
8064 s, RE_SV_TAIL(r->anchored_substr),
8065 (IV)r->anchored_offset);
8066 } else if (r->anchored_utf8) {
8067 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
8068 RE_SV_DUMPLEN(r->anchored_utf8), 30);
8069 PerlIO_printf(Perl_debug_log,
8070 "anchored utf8 %s%s at %"IVdf" ",
8071 s, RE_SV_TAIL(r->anchored_utf8),
8072 (IV)r->anchored_offset);
8074 if (r->float_substr) {
8075 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
8076 RE_SV_DUMPLEN(r->float_substr), 30);
8077 PerlIO_printf(Perl_debug_log,
8078 "floating %s%s at %"IVdf"..%"UVuf" ",
8079 s, RE_SV_TAIL(r->float_substr),
8080 (IV)r->float_min_offset, (UV)r->float_max_offset);
8081 } else if (r->float_utf8) {
8082 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
8083 RE_SV_DUMPLEN(r->float_utf8), 30);
8084 PerlIO_printf(Perl_debug_log,
8085 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
8086 s, RE_SV_TAIL(r->float_utf8),
8087 (IV)r->float_min_offset, (UV)r->float_max_offset);
8089 if (r->check_substr || r->check_utf8)
8090 PerlIO_printf(Perl_debug_log,
8092 (r->check_substr == r->float_substr
8093 && r->check_utf8 == r->float_utf8
8094 ? "(checking floating" : "(checking anchored"));
8095 if (r->reganch & ROPT_NOSCAN)
8096 PerlIO_printf(Perl_debug_log, " noscan");
8097 if (r->reganch & ROPT_CHECK_ALL)
8098 PerlIO_printf(Perl_debug_log, " isall");
8099 if (r->check_substr || r->check_utf8)
8100 PerlIO_printf(Perl_debug_log, ") ");
8102 if (r->regstclass) {
8103 regprop(r, sv, r->regstclass);
8104 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
8106 if (r->reganch & ROPT_ANCH) {
8107 PerlIO_printf(Perl_debug_log, "anchored");
8108 if (r->reganch & ROPT_ANCH_BOL)
8109 PerlIO_printf(Perl_debug_log, "(BOL)");
8110 if (r->reganch & ROPT_ANCH_MBOL)
8111 PerlIO_printf(Perl_debug_log, "(MBOL)");
8112 if (r->reganch & ROPT_ANCH_SBOL)
8113 PerlIO_printf(Perl_debug_log, "(SBOL)");
8114 if (r->reganch & ROPT_ANCH_GPOS)
8115 PerlIO_printf(Perl_debug_log, "(GPOS)");
8116 PerlIO_putc(Perl_debug_log, ' ');
8118 if (r->reganch & ROPT_GPOS_SEEN)
8119 PerlIO_printf(Perl_debug_log, "GPOS ");
8120 if (r->reganch & ROPT_SKIP)
8121 PerlIO_printf(Perl_debug_log, "plus ");
8122 if (r->reganch & ROPT_IMPLICIT)
8123 PerlIO_printf(Perl_debug_log, "implicit ");
8124 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
8125 if (r->reganch & ROPT_EVAL_SEEN)
8126 PerlIO_printf(Perl_debug_log, "with eval ");
8127 PerlIO_printf(Perl_debug_log, "\n");
8129 PERL_UNUSED_CONTEXT;
8131 #endif /* DEBUGGING */
8135 - regprop - printable representation of opcode
8138 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
8143 GET_RE_DEBUG_FLAGS_DECL;
8145 sv_setpvn(sv, "", 0);
8147 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
8148 /* It would be nice to FAIL() here, but this may be called from
8149 regexec.c, and it would be hard to supply pRExC_state. */
8150 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
8151 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
8153 k = PL_regkind[OP(o)];
8156 SV * const dsv = sv_2mortal(newSVpvs(""));
8157 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
8158 * is a crude hack but it may be the best for now since
8159 * we have no flag "this EXACTish node was UTF-8"
8161 const char * const s =
8162 pv_pretty(dsv, STRING(o), STR_LEN(o), 60,
8163 PL_colors[0], PL_colors[1],
8164 PERL_PV_ESCAPE_UNI_DETECT |
8165 PERL_PV_PRETTY_ELIPSES |
8168 Perl_sv_catpvf(aTHX_ sv, " %s", s );
8169 } else if (k == TRIE) {
8170 /* print the details of the trie in dumpuntil instead, as
8171 * prog->data isn't available here */
8172 const char op = OP(o);
8173 const I32 n = ARG(o);
8174 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
8175 (reg_ac_data *)prog->data->data[n] :
8177 const reg_trie_data * const trie = !IS_TRIE_AC(op) ?
8178 (reg_trie_data*)prog->data->data[n] :
8181 Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]);
8182 DEBUG_TRIE_COMPILE_r(
8183 Perl_sv_catpvf(aTHX_ sv,
8184 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
8185 (UV)trie->startstate,
8186 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
8187 (UV)trie->wordcount,
8190 (UV)TRIE_CHARCOUNT(trie),
8191 (UV)trie->uniquecharcount
8194 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
8196 int rangestart = -1;
8197 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
8198 Perl_sv_catpvf(aTHX_ sv, "[");
8199 for (i = 0; i <= 256; i++) {
8200 if (i < 256 && BITMAP_TEST(bitmap,i)) {
8201 if (rangestart == -1)
8203 } else if (rangestart != -1) {
8204 if (i <= rangestart + 3)
8205 for (; rangestart < i; rangestart++)
8206 put_byte(sv, rangestart);
8208 put_byte(sv, rangestart);
8210 put_byte(sv, i - 1);
8215 Perl_sv_catpvf(aTHX_ sv, "]");
8218 } else if (k == CURLY) {
8219 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
8220 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
8221 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
8223 else if (k == WHILEM && o->flags) /* Ordinal/of */
8224 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
8225 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT)
8226 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
8227 else if (k == GOSUB)
8228 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
8229 else if (k == VERB) {
8231 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
8232 (SV*)prog->data->data[ ARG( o ) ]);
8233 } else if (k == LOGICAL)
8234 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
8235 else if (k == ANYOF) {
8236 int i, rangestart = -1;
8237 const U8 flags = ANYOF_FLAGS(o);
8239 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
8240 static const char * const anyofs[] = {
8273 if (flags & ANYOF_LOCALE)
8274 sv_catpvs(sv, "{loc}");
8275 if (flags & ANYOF_FOLD)
8276 sv_catpvs(sv, "{i}");
8277 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
8278 if (flags & ANYOF_INVERT)
8280 for (i = 0; i <= 256; i++) {
8281 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
8282 if (rangestart == -1)
8284 } else if (rangestart != -1) {
8285 if (i <= rangestart + 3)
8286 for (; rangestart < i; rangestart++)
8287 put_byte(sv, rangestart);
8289 put_byte(sv, rangestart);
8291 put_byte(sv, i - 1);
8297 if (o->flags & ANYOF_CLASS)
8298 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
8299 if (ANYOF_CLASS_TEST(o,i))
8300 sv_catpv(sv, anyofs[i]);
8302 if (flags & ANYOF_UNICODE)
8303 sv_catpvs(sv, "{unicode}");
8304 else if (flags & ANYOF_UNICODE_ALL)
8305 sv_catpvs(sv, "{unicode_all}");
8309 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
8313 U8 s[UTF8_MAXBYTES_CASE+1];
8315 for (i = 0; i <= 256; i++) { /* just the first 256 */
8316 uvchr_to_utf8(s, i);
8318 if (i < 256 && swash_fetch(sw, s, TRUE)) {
8319 if (rangestart == -1)
8321 } else if (rangestart != -1) {
8322 if (i <= rangestart + 3)
8323 for (; rangestart < i; rangestart++) {
8324 const U8 * const e = uvchr_to_utf8(s,rangestart);
8326 for(p = s; p < e; p++)
8330 const U8 *e = uvchr_to_utf8(s,rangestart);
8332 for (p = s; p < e; p++)
8335 e = uvchr_to_utf8(s, i-1);
8336 for (p = s; p < e; p++)
8343 sv_catpvs(sv, "..."); /* et cetera */
8347 char *s = savesvpv(lv);
8348 char * const origs = s;
8350 while (*s && *s != '\n')
8354 const char * const t = ++s;
8372 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
8374 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
8375 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
8377 PERL_UNUSED_CONTEXT;
8378 PERL_UNUSED_ARG(sv);
8380 PERL_UNUSED_ARG(prog);
8381 #endif /* DEBUGGING */
8385 Perl_re_intuit_string(pTHX_ regexp *prog)
8386 { /* Assume that RE_INTUIT is set */
8388 GET_RE_DEBUG_FLAGS_DECL;
8389 PERL_UNUSED_CONTEXT;
8393 const char * const s = SvPV_nolen_const(prog->check_substr
8394 ? prog->check_substr : prog->check_utf8);
8396 if (!PL_colorset) reginitcolors();
8397 PerlIO_printf(Perl_debug_log,
8398 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
8400 prog->check_substr ? "" : "utf8 ",
8401 PL_colors[5],PL_colors[0],
8404 (strlen(s) > 60 ? "..." : ""));
8407 return prog->check_substr ? prog->check_substr : prog->check_utf8;
8411 pregfree - free a regexp
8413 See regdupe below if you change anything here.
8417 Perl_pregfree(pTHX_ struct regexp *r)
8421 GET_RE_DEBUG_FLAGS_DECL;
8423 if (!r || (--r->refcnt > 0))
8429 SV *dsv= sv_newmortal();
8430 RE_PV_QUOTED_DECL(s, (r->reganch & ROPT_UTF8),
8431 dsv, r->precomp, r->prelen, 60);
8432 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
8433 PL_colors[4],PL_colors[5],s);
8437 /* gcov results gave these as non-null 100% of the time, so there's no
8438 optimisation in checking them before calling Safefree */
8439 Safefree(r->precomp);
8440 Safefree(r->offsets); /* 20010421 MJD */
8441 RX_MATCH_COPY_FREE(r);
8442 #ifdef PERL_OLD_COPY_ON_WRITE
8444 SvREFCNT_dec(r->saved_copy);
8447 if (r->anchored_substr)
8448 SvREFCNT_dec(r->anchored_substr);
8449 if (r->anchored_utf8)
8450 SvREFCNT_dec(r->anchored_utf8);
8451 if (r->float_substr)
8452 SvREFCNT_dec(r->float_substr);
8454 SvREFCNT_dec(r->float_utf8);
8455 Safefree(r->substrs);
8458 SvREFCNT_dec(r->paren_names);
8460 int n = r->data->count;
8461 PAD* new_comppad = NULL;
8466 /* If you add a ->what type here, update the comment in regcomp.h */
8467 switch (r->data->what[n]) {
8470 SvREFCNT_dec((SV*)r->data->data[n]);
8473 Safefree(r->data->data[n]);
8476 new_comppad = (AV*)r->data->data[n];
8479 if (new_comppad == NULL)
8480 Perl_croak(aTHX_ "panic: pregfree comppad");
8481 PAD_SAVE_LOCAL(old_comppad,
8482 /* Watch out for global destruction's random ordering. */
8483 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
8486 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
8489 op_free((OP_4tree*)r->data->data[n]);
8491 PAD_RESTORE_LOCAL(old_comppad);
8492 SvREFCNT_dec((SV*)new_comppad);
8498 { /* Aho Corasick add-on structure for a trie node.
8499 Used in stclass optimization only */
8501 reg_ac_data *aho=(reg_ac_data*)r->data->data[n];
8503 refcount = --aho->refcount;
8506 Safefree(aho->states);
8507 Safefree(aho->fail);
8508 aho->trie=NULL; /* not necessary to free this as it is
8509 handled by the 't' case */
8510 Safefree(r->data->data[n]); /* do this last!!!! */
8511 Safefree(r->regstclass);
8517 /* trie structure. */
8519 reg_trie_data *trie=(reg_trie_data*)r->data->data[n];
8521 refcount = --trie->refcount;
8524 Safefree(trie->charmap);
8525 if (trie->widecharmap)
8526 SvREFCNT_dec((SV*)trie->widecharmap);
8527 Safefree(trie->states);
8528 Safefree(trie->trans);
8530 Safefree(trie->bitmap);
8532 Safefree(trie->wordlen);
8534 Safefree(trie->jump);
8536 Safefree(trie->nextword);
8539 SvREFCNT_dec((SV*)trie->words);
8540 if (trie->revcharmap)
8541 SvREFCNT_dec((SV*)trie->revcharmap);
8543 Safefree(r->data->data[n]); /* do this last!!!! */
8548 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
8551 Safefree(r->data->what);
8554 Safefree(r->startp);
8557 Safefree(r->swap->startp);
8558 Safefree(r->swap->endp);
8564 #define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t))
8565 #define av_dup_inc(s,t) (AV*)SvREFCNT_inc(sv_dup((SV*)s,t))
8566 #define hv_dup_inc(s,t) (HV*)SvREFCNT_inc(sv_dup((SV*)s,t))
8567 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
8570 regdupe - duplicate a regexp.
8572 This routine is called by sv.c's re_dup and is expected to clone a
8573 given regexp structure. It is a no-op when not under USE_ITHREADS.
8574 (Originally this *was* re_dup() for change history see sv.c)
8576 See pregfree() above if you change anything here.
8578 #if defined(USE_ITHREADS)
8580 Perl_regdupe(pTHX_ const regexp *r, CLONE_PARAMS *param)
8585 struct reg_substr_datum *s;
8588 return (REGEXP *)NULL;
8590 if ((ret = (REGEXP *)ptr_table_fetch(PL_ptr_table, r)))
8593 len = r->offsets[0];
8594 npar = r->nparens+1;
8596 Newxc(ret, sizeof(regexp) + (len+1)*sizeof(regnode), char, regexp);
8597 Copy(r->program, ret->program, len+1, regnode);
8599 Newx(ret->startp, npar, I32);
8600 Copy(r->startp, ret->startp, npar, I32);
8601 Newx(ret->endp, npar, I32);
8602 Copy(r->startp, ret->startp, npar, I32);
8604 Newx(ret->swap, 1, regexp_paren_ofs);
8605 /* no need to copy these */
8606 Newx(ret->swap->startp, npar, I32);
8607 Newx(ret->swap->endp, npar, I32);
8612 Newx(ret->substrs, 1, struct reg_substr_data);
8613 for (s = ret->substrs->data, i = 0; i < 3; i++, s++) {
8614 s->min_offset = r->substrs->data[i].min_offset;
8615 s->max_offset = r->substrs->data[i].max_offset;
8616 s->end_shift = r->substrs->data[i].end_shift;
8617 s->substr = sv_dup_inc(r->substrs->data[i].substr, param);
8618 s->utf8_substr = sv_dup_inc(r->substrs->data[i].utf8_substr, param);
8621 ret->regstclass = NULL;
8624 const int count = r->data->count;
8627 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
8628 char, struct reg_data);
8629 Newx(d->what, count, U8);
8632 for (i = 0; i < count; i++) {
8633 d->what[i] = r->data->what[i];
8634 switch (d->what[i]) {
8635 /* legal options are one of: sSfpont
8636 see also regcomp.h and pregfree() */
8639 d->data[i] = sv_dup_inc((SV *)r->data->data[i], param);
8642 d->data[i] = av_dup_inc((AV *)r->data->data[i], param);
8645 /* This is cheating. */
8646 Newx(d->data[i], 1, struct regnode_charclass_class);
8647 StructCopy(r->data->data[i], d->data[i],
8648 struct regnode_charclass_class);
8649 ret->regstclass = (regnode*)d->data[i];
8652 /* Compiled op trees are readonly, and can thus be
8653 shared without duplication. */
8655 d->data[i] = (void*)OpREFCNT_inc((OP*)r->data->data[i]);
8659 d->data[i] = r->data->data[i];
8662 d->data[i] = r->data->data[i];
8664 ((reg_trie_data*)d->data[i])->refcount++;
8668 d->data[i] = r->data->data[i];
8670 ((reg_ac_data*)d->data[i])->refcount++;
8672 /* Trie stclasses are readonly and can thus be shared
8673 * without duplication. We free the stclass in pregfree
8674 * when the corresponding reg_ac_data struct is freed.
8676 ret->regstclass= r->regstclass;
8679 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", r->data->what[i]);
8688 Newx(ret->offsets, 2*len+1, U32);
8689 Copy(r->offsets, ret->offsets, 2*len+1, U32);
8691 ret->precomp = SAVEPVN(r->precomp, r->prelen);
8692 ret->refcnt = r->refcnt;
8693 ret->minlen = r->minlen;
8694 ret->minlenret = r->minlenret;
8695 ret->prelen = r->prelen;
8696 ret->nparens = r->nparens;
8697 ret->lastparen = r->lastparen;
8698 ret->lastcloseparen = r->lastcloseparen;
8699 ret->reganch = r->reganch;
8701 ret->sublen = r->sublen;
8703 ret->engine = r->engine;
8705 ret->paren_names = hv_dup_inc(r->paren_names, param);
8707 if (RX_MATCH_COPIED(ret))
8708 ret->subbeg = SAVEPVN(r->subbeg, r->sublen);
8711 #ifdef PERL_OLD_COPY_ON_WRITE
8712 ret->saved_copy = NULL;
8715 ptr_table_store(PL_ptr_table, r, ret);
8723 converts a regexp embedded in a MAGIC struct to its stringified form,
8724 caching the converted form in the struct and returns the cached
8727 If lp is nonnull then it is used to return the length of the
8730 If flags is nonnull and the returned string contains UTF8 then
8731 (flags & 1) will be true.
8733 If haseval is nonnull then it is used to return whether the pattern
8736 Normally called via macro:
8738 CALLREG_STRINGIFY(mg,0,0);
8742 CALLREG_AS_STR(mg,lp,flags,haseval)
8744 See sv_2pv_flags() in sv.c for an example of internal usage.
8749 Perl_reg_stringify(pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval ) {
8751 const regexp * const re = (regexp *)mg->mg_obj;
8754 const char *fptr = "msix";
8759 bool need_newline = 0;
8760 U16 reganch = (U16)((re->reganch & PMf_COMPILETIME) >> 12);
8762 while((ch = *fptr++)) {
8764 reflags[left++] = ch;
8767 reflags[right--] = ch;
8772 reflags[left] = '-';
8776 mg->mg_len = re->prelen + 4 + left;
8778 * If /x was used, we have to worry about a regex ending with a
8779 * comment later being embedded within another regex. If so, we don't
8780 * want this regex's "commentization" to leak out to the right part of
8781 * the enclosing regex, we must cap it with a newline.
8783 * So, if /x was used, we scan backwards from the end of the regex. If
8784 * we find a '#' before we find a newline, we need to add a newline
8785 * ourself. If we find a '\n' first (or if we don't find '#' or '\n'),
8786 * we don't need to add anything. -jfriedl
8788 if (PMf_EXTENDED & re->reganch) {
8789 const char *endptr = re->precomp + re->prelen;
8790 while (endptr >= re->precomp) {
8791 const char c = *(endptr--);
8793 break; /* don't need another */
8795 /* we end while in a comment, so we need a newline */
8796 mg->mg_len++; /* save space for it */
8797 need_newline = 1; /* note to add it */
8803 Newx(mg->mg_ptr, mg->mg_len + 1 + left, char);
8804 mg->mg_ptr[0] = '(';
8805 mg->mg_ptr[1] = '?';
8806 Copy(reflags, mg->mg_ptr+2, left, char);
8807 *(mg->mg_ptr+left+2) = ':';
8808 Copy(re->precomp, mg->mg_ptr+3+left, re->prelen, char);
8810 mg->mg_ptr[mg->mg_len - 2] = '\n';
8811 mg->mg_ptr[mg->mg_len - 1] = ')';
8812 mg->mg_ptr[mg->mg_len] = 0;
8815 *haseval = re->program[0].next_off;
8817 *flags = ((re->reganch & ROPT_UTF8) ? 1 : 0);
8825 #ifndef PERL_IN_XSUB_RE
8827 - regnext - dig the "next" pointer out of a node
8830 Perl_regnext(pTHX_ register regnode *p)
8833 register I32 offset;
8835 if (p == &PL_regdummy)
8838 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
8847 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
8850 STRLEN l1 = strlen(pat1);
8851 STRLEN l2 = strlen(pat2);
8854 const char *message;
8860 Copy(pat1, buf, l1 , char);
8861 Copy(pat2, buf + l1, l2 , char);
8862 buf[l1 + l2] = '\n';
8863 buf[l1 + l2 + 1] = '\0';
8865 /* ANSI variant takes additional second argument */
8866 va_start(args, pat2);
8870 msv = vmess(buf, &args);
8872 message = SvPV_const(msv,l1);
8875 Copy(message, buf, l1 , char);
8876 buf[l1-1] = '\0'; /* Overwrite \n */
8877 Perl_croak(aTHX_ "%s", buf);
8880 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
8882 #ifndef PERL_IN_XSUB_RE
8884 Perl_save_re_context(pTHX)
8888 struct re_save_state *state;
8890 SAVEVPTR(PL_curcop);
8891 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
8893 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
8894 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
8895 SSPUSHINT(SAVEt_RE_STATE);
8897 Copy(&PL_reg_state, state, 1, struct re_save_state);
8899 PL_reg_start_tmp = 0;
8900 PL_reg_start_tmpl = 0;
8901 PL_reg_oldsaved = NULL;
8902 PL_reg_oldsavedlen = 0;
8904 PL_reg_leftiter = 0;
8905 PL_reg_poscache = NULL;
8906 PL_reg_poscache_size = 0;
8907 #ifdef PERL_OLD_COPY_ON_WRITE
8911 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
8913 const REGEXP * const rx = PM_GETRE(PL_curpm);
8916 for (i = 1; i <= rx->nparens; i++) {
8917 char digits[TYPE_CHARS(long)];
8918 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
8919 GV *const *const gvp
8920 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
8923 GV * const gv = *gvp;
8924 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
8934 clear_re(pTHX_ void *r)
8937 ReREFCNT_dec((regexp *)r);
8943 S_put_byte(pTHX_ SV *sv, int c)
8945 if (isCNTRL(c) || c == 255 || !isPRINT(c))
8946 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
8947 else if (c == '-' || c == ']' || c == '\\' || c == '^')
8948 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
8950 Perl_sv_catpvf(aTHX_ sv, "%c", c);
8954 #define CLEAR_OPTSTART \
8955 if (optstart) STMT_START { \
8956 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%d nodes)\n", node - optstart)); \
8960 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
8962 STATIC const regnode *
8963 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
8964 const regnode *last, const regnode *plast,
8965 SV* sv, I32 indent, U32 depth)
8968 register U8 op = PSEUDO; /* Arbitrary non-END op. */
8969 register const regnode *next;
8970 const regnode *optstart= NULL;
8971 GET_RE_DEBUG_FLAGS_DECL;
8973 #ifdef DEBUG_DUMPUNTIL
8974 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
8975 last ? last-start : 0,plast ? plast-start : 0);
8978 if (plast && plast < last)
8981 while (PL_regkind[op] != END && (!last || node < last)) {
8982 /* While that wasn't END last time... */
8988 next = regnext((regnode *)node);
8991 if (OP(node) == OPTIMIZED) {
8992 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
8999 regprop(r, sv, node);
9000 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
9001 (int)(2*indent + 1), "", SvPVX_const(sv));
9003 if (OP(node) != OPTIMIZED) {
9004 if (next == NULL) /* Next ptr. */
9005 PerlIO_printf(Perl_debug_log, "(0)");
9006 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
9007 PerlIO_printf(Perl_debug_log, "(FAIL)");
9009 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
9011 /*if (PL_regkind[(U8)op] != TRIE)*/
9012 (void)PerlIO_putc(Perl_debug_log, '\n');
9016 if (PL_regkind[(U8)op] == BRANCHJ) {
9019 register const regnode *nnode = (OP(next) == LONGJMP
9020 ? regnext((regnode *)next)
9022 if (last && nnode > last)
9024 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
9027 else if (PL_regkind[(U8)op] == BRANCH) {
9029 DUMPUNTIL(NEXTOPER(node), next);
9031 else if ( PL_regkind[(U8)op] == TRIE ) {
9032 const regnode *this_trie = node;
9033 const char op = OP(node);
9034 const I32 n = ARG(node);
9035 const reg_ac_data * const ac = op>=AHOCORASICK ?
9036 (reg_ac_data *)r->data->data[n] :
9038 const reg_trie_data * const trie = op<AHOCORASICK ?
9039 (reg_trie_data*)r->data->data[n] :
9041 const regnode *nextbranch= NULL;
9043 sv_setpvn(sv, "", 0);
9044 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
9045 SV ** const elem_ptr = av_fetch(trie->words,word_idx,0);
9047 PerlIO_printf(Perl_debug_log, "%*s%s ",
9048 (int)(2*(indent+3)), "",
9049 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
9050 PL_colors[0], PL_colors[1],
9051 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
9052 PERL_PV_PRETTY_ELIPSES |
9058 U16 dist= trie->jump[word_idx+1];
9059 PerlIO_printf(Perl_debug_log, "(%u)\n",
9060 (dist ? this_trie + dist : next) - start);
9063 nextbranch= this_trie + trie->jump[0];
9064 DUMPUNTIL(this_trie + dist, nextbranch);
9066 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
9067 nextbranch= regnext((regnode *)nextbranch);
9069 PerlIO_printf(Perl_debug_log, "\n");
9072 if (last && next > last)
9077 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
9078 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
9079 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
9081 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
9083 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
9085 else if ( op == PLUS || op == STAR) {
9086 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
9088 else if (op == ANYOF) {
9089 /* arglen 1 + class block */
9090 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
9091 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
9092 node = NEXTOPER(node);
9094 else if (PL_regkind[(U8)op] == EXACT) {
9095 /* Literal string, where present. */
9096 node += NODE_SZ_STR(node) - 1;
9097 node = NEXTOPER(node);
9100 node = NEXTOPER(node);
9101 node += regarglen[(U8)op];
9103 if (op == CURLYX || op == OPEN)
9105 else if (op == WHILEM)
9109 #ifdef DEBUG_DUMPUNTIL
9110 PerlIO_printf(Perl_debug_log, "--- %d\n",indent);
9115 #endif /* DEBUGGING */
9119 * c-indentation-style: bsd
9121 * indent-tabs-mode: t
9124 * ex: set ts=8 sts=4 sw=4 noet: