5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
8 /* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
37 * pregcomp and pregexec -- regsub and regerror are not used in perl
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
57 **** Alterations to Henry's code are...
59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
71 #define PERL_IN_REGCOMP_C
74 #ifndef PERL_IN_XSUB_RE
79 #ifdef PERL_IN_XSUB_RE
90 # if defined(BUGGY_MSC6)
91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
92 # pragma optimize("a",off)
93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
94 # pragma optimize("w",on )
95 # endif /* BUGGY_MSC6 */
102 typedef struct RExC_state_t {
103 U32 flags; /* are we folding, multilining? */
104 char *precomp; /* uncompiled string. */
106 char *start; /* Start of input for compile */
107 char *end; /* End of input for compile */
108 char *parse; /* Input-scan pointer. */
109 I32 whilem_seen; /* number of WHILEM in this expr */
110 regnode *emit_start; /* Start of emitted-code area */
111 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
112 I32 naughty; /* How bad is this pattern? */
113 I32 sawback; /* Did we see \1, ...? */
115 I32 size; /* Code size. */
116 I32 npar; /* Capture buffer count, (OPEN). */
117 I32 cpar; /* Capture buffer count, (CLOSE). */
118 I32 nestroot; /* root parens we are in - used by accept */
122 regnode **open_parens; /* pointers to open parens */
123 regnode **close_parens; /* pointers to close parens */
124 regnode *opend; /* END node in program */
126 HV *charnames; /* cache of named sequences */
127 HV *paren_names; /* Paren names */
128 regnode **recurse; /* Recurse regops */
129 I32 recurse_count; /* Number of recurse regops */
131 char *starttry; /* -Dr: where regtry was called. */
132 #define RExC_starttry (pRExC_state->starttry)
135 const char *lastparse;
137 #define RExC_lastparse (pRExC_state->lastparse)
138 #define RExC_lastnum (pRExC_state->lastnum)
142 #define RExC_flags (pRExC_state->flags)
143 #define RExC_precomp (pRExC_state->precomp)
144 #define RExC_rx (pRExC_state->rx)
145 #define RExC_start (pRExC_state->start)
146 #define RExC_end (pRExC_state->end)
147 #define RExC_parse (pRExC_state->parse)
148 #define RExC_whilem_seen (pRExC_state->whilem_seen)
149 #define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
150 #define RExC_emit (pRExC_state->emit)
151 #define RExC_emit_start (pRExC_state->emit_start)
152 #define RExC_naughty (pRExC_state->naughty)
153 #define RExC_sawback (pRExC_state->sawback)
154 #define RExC_seen (pRExC_state->seen)
155 #define RExC_size (pRExC_state->size)
156 #define RExC_npar (pRExC_state->npar)
157 #define RExC_cpar (pRExC_state->cpar)
158 #define RExC_nestroot (pRExC_state->nestroot)
159 #define RExC_extralen (pRExC_state->extralen)
160 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
161 #define RExC_seen_evals (pRExC_state->seen_evals)
162 #define RExC_utf8 (pRExC_state->utf8)
163 #define RExC_charnames (pRExC_state->charnames)
164 #define RExC_open_parens (pRExC_state->open_parens)
165 #define RExC_close_parens (pRExC_state->close_parens)
166 #define RExC_opend (pRExC_state->opend)
167 #define RExC_paren_names (pRExC_state->paren_names)
168 #define RExC_recurse (pRExC_state->recurse)
169 #define RExC_recurse_count (pRExC_state->recurse_count)
171 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
172 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
173 ((*s) == '{' && regcurly(s)))
176 #undef SPSTART /* dratted cpp namespace... */
179 * Flags to be passed up and down.
181 #define WORST 0 /* Worst case. */
182 #define HASWIDTH 0x1 /* Known to match non-null strings. */
183 #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
184 #define SPSTART 0x4 /* Starts with * or +. */
185 #define TRYAGAIN 0x8 /* Weeded out a declaration. */
187 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
189 /* whether trie related optimizations are enabled */
190 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
191 #define TRIE_STUDY_OPT
192 #define FULL_TRIE_STUDY
198 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
199 #define PBITVAL(paren) (1 << ((paren) & 7))
200 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
201 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
202 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
205 /* About scan_data_t.
207 During optimisation we recurse through the regexp program performing
208 various inplace (keyhole style) optimisations. In addition study_chunk
209 and scan_commit populate this data structure with information about
210 what strings MUST appear in the pattern. We look for the longest
211 string that must appear for at a fixed location, and we look for the
212 longest string that may appear at a floating location. So for instance
217 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
218 strings (because they follow a .* construct). study_chunk will identify
219 both FOO and BAR as being the longest fixed and floating strings respectively.
221 The strings can be composites, for instance
225 will result in a composite fixed substring 'foo'.
227 For each string some basic information is maintained:
229 - offset or min_offset
230 This is the position the string must appear at, or not before.
231 It also implicitly (when combined with minlenp) tells us how many
232 character must match before the string we are searching.
233 Likewise when combined with minlenp and the length of the string
234 tells us how many characters must appear after the string we have
238 Only used for floating strings. This is the rightmost point that
239 the string can appear at. Ifset to I32 max it indicates that the
240 string can occur infinitely far to the right.
243 A pointer to the minimum length of the pattern that the string
244 was found inside. This is important as in the case of positive
245 lookahead or positive lookbehind we can have multiple patterns
250 The minimum length of the pattern overall is 3, the minimum length
251 of the lookahead part is 3, but the minimum length of the part that
252 will actually match is 1. So 'FOO's minimum length is 3, but the
253 minimum length for the F is 1. This is important as the minimum length
254 is used to determine offsets in front of and behind the string being
255 looked for. Since strings can be composites this is the length of the
256 pattern at the time it was commited with a scan_commit. Note that
257 the length is calculated by study_chunk, so that the minimum lengths
258 are not known until the full pattern has been compiled, thus the
259 pointer to the value.
263 In the case of lookbehind the string being searched for can be
264 offset past the start point of the final matching string.
265 If this value was just blithely removed from the min_offset it would
266 invalidate some of the calculations for how many chars must match
267 before or after (as they are derived from min_offset and minlen and
268 the length of the string being searched for).
269 When the final pattern is compiled and the data is moved from the
270 scan_data_t structure into the regexp structure the information
271 about lookbehind is factored in, with the information that would
272 have been lost precalculated in the end_shift field for the
275 The fields pos_min and pos_delta are used to store the minimum offset
276 and the delta to the maximum offset at the current point in the pattern.
280 typedef struct scan_data_t {
281 /*I32 len_min; unused */
282 /*I32 len_delta; unused */
286 I32 last_end; /* min value, <0 unless valid. */
289 SV **longest; /* Either &l_fixed, or &l_float. */
290 SV *longest_fixed; /* longest fixed string found in pattern */
291 I32 offset_fixed; /* offset where it starts */
292 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
293 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
294 SV *longest_float; /* longest floating string found in pattern */
295 I32 offset_float_min; /* earliest point in string it can appear */
296 I32 offset_float_max; /* latest point in string it can appear */
297 I32 *minlen_float; /* pointer to the minlen relevent to the string */
298 I32 lookbehind_float; /* is the position of the string modified by LB */
302 struct regnode_charclass_class *start_class;
306 * Forward declarations for pregcomp()'s friends.
309 static const scan_data_t zero_scan_data =
310 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
312 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
313 #define SF_BEFORE_SEOL 0x0001
314 #define SF_BEFORE_MEOL 0x0002
315 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
316 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
319 # define SF_FIX_SHIFT_EOL (0+2)
320 # define SF_FL_SHIFT_EOL (0+4)
322 # define SF_FIX_SHIFT_EOL (+2)
323 # define SF_FL_SHIFT_EOL (+4)
326 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
327 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
329 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
330 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
331 #define SF_IS_INF 0x0040
332 #define SF_HAS_PAR 0x0080
333 #define SF_IN_PAR 0x0100
334 #define SF_HAS_EVAL 0x0200
335 #define SCF_DO_SUBSTR 0x0400
336 #define SCF_DO_STCLASS_AND 0x0800
337 #define SCF_DO_STCLASS_OR 0x1000
338 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
339 #define SCF_WHILEM_VISITED_POS 0x2000
341 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
342 #define SCF_SEEN_ACCEPT 0x8000
344 #define UTF (RExC_utf8 != 0)
345 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
346 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
348 #define OOB_UNICODE 12345678
349 #define OOB_NAMEDCLASS -1
351 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
352 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
355 /* length of regex to show in messages that don't mark a position within */
356 #define RegexLengthToShowInErrorMessages 127
359 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
360 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
361 * op/pragma/warn/regcomp.
363 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
364 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
366 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
369 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
370 * arg. Show regex, up to a maximum length. If it's too long, chop and add
373 #define FAIL(msg) STMT_START { \
374 const char *ellipses = ""; \
375 IV len = RExC_end - RExC_precomp; \
378 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
379 if (len > RegexLengthToShowInErrorMessages) { \
380 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
381 len = RegexLengthToShowInErrorMessages - 10; \
384 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
385 msg, (int)len, RExC_precomp, ellipses); \
389 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
391 #define Simple_vFAIL(m) STMT_START { \
392 const IV offset = RExC_parse - RExC_precomp; \
393 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
394 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
398 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
400 #define vFAIL(m) STMT_START { \
402 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
407 * Like Simple_vFAIL(), but accepts two arguments.
409 #define Simple_vFAIL2(m,a1) STMT_START { \
410 const IV offset = RExC_parse - RExC_precomp; \
411 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
412 (int)offset, RExC_precomp, RExC_precomp + offset); \
416 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
418 #define vFAIL2(m,a1) STMT_START { \
420 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
421 Simple_vFAIL2(m, a1); \
426 * Like Simple_vFAIL(), but accepts three arguments.
428 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
429 const IV offset = RExC_parse - RExC_precomp; \
430 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
431 (int)offset, RExC_precomp, RExC_precomp + offset); \
435 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
437 #define vFAIL3(m,a1,a2) STMT_START { \
439 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
440 Simple_vFAIL3(m, a1, a2); \
444 * Like Simple_vFAIL(), but accepts four arguments.
446 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
447 const IV offset = RExC_parse - RExC_precomp; \
448 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
449 (int)offset, RExC_precomp, RExC_precomp + offset); \
452 #define vWARN(loc,m) STMT_START { \
453 const IV offset = loc - RExC_precomp; \
454 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
455 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
458 #define vWARNdep(loc,m) STMT_START { \
459 const IV offset = loc - RExC_precomp; \
460 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
461 "%s" REPORT_LOCATION, \
462 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
466 #define vWARN2(loc, m, a1) STMT_START { \
467 const IV offset = loc - RExC_precomp; \
468 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
469 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
472 #define vWARN3(loc, m, a1, a2) STMT_START { \
473 const IV offset = loc - RExC_precomp; \
474 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
475 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
478 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
479 const IV offset = loc - RExC_precomp; \
480 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
481 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
484 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
485 const IV offset = loc - RExC_precomp; \
486 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
487 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
491 /* Allow for side effects in s */
492 #define REGC(c,s) STMT_START { \
493 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
496 /* Macros for recording node offsets. 20001227 mjd@plover.com
497 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
498 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
499 * Element 0 holds the number n.
500 * Position is 1 indexed.
503 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
505 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
506 __LINE__, (int)(node), (int)(byte))); \
508 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
510 RExC_offsets[2*(node)-1] = (byte); \
515 #define Set_Node_Offset(node,byte) \
516 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
517 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
519 #define Set_Node_Length_To_R(node,len) STMT_START { \
521 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
522 __LINE__, (int)(node), (int)(len))); \
524 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
526 RExC_offsets[2*(node)] = (len); \
531 #define Set_Node_Length(node,len) \
532 Set_Node_Length_To_R((node)-RExC_emit_start, len)
533 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
534 #define Set_Node_Cur_Length(node) \
535 Set_Node_Length(node, RExC_parse - parse_start)
537 /* Get offsets and lengths */
538 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
539 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
541 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
542 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
543 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
547 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
548 #define EXPERIMENTAL_INPLACESCAN
551 #define DEBUG_STUDYDATA(data,depth) \
552 DEBUG_OPTIMISE_MORE_r(if(data){ \
553 PerlIO_printf(Perl_debug_log, \
554 "%*s"/* Len:%"IVdf"/%"IVdf" */"Pos:%"IVdf"/%"IVdf \
555 " Flags: %"IVdf" Whilem_c: %"IVdf" Lcp: %"IVdf" ", \
556 (int)(depth)*2, "", \
557 (IV)((data)->pos_min), \
558 (IV)((data)->pos_delta), \
559 (IV)((data)->flags), \
560 (IV)((data)->whilem_c), \
561 (IV)((data)->last_closep ? *((data)->last_closep) : -1) \
563 if ((data)->last_found) \
564 PerlIO_printf(Perl_debug_log, \
565 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
566 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
567 SvPVX_const((data)->last_found), \
568 (IV)((data)->last_end), \
569 (IV)((data)->last_start_min), \
570 (IV)((data)->last_start_max), \
571 ((data)->longest && \
572 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
573 SvPVX_const((data)->longest_fixed), \
574 (IV)((data)->offset_fixed), \
575 ((data)->longest && \
576 (data)->longest==&((data)->longest_float)) ? "*" : "", \
577 SvPVX_const((data)->longest_float), \
578 (IV)((data)->offset_float_min), \
579 (IV)((data)->offset_float_max) \
581 PerlIO_printf(Perl_debug_log,"\n"); \
584 static void clear_re(pTHX_ void *r);
586 /* Mark that we cannot extend a found fixed substring at this point.
587 Update the longest found anchored substring and the longest found
588 floating substrings if needed. */
591 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp)
593 const STRLEN l = CHR_SVLEN(data->last_found);
594 const STRLEN old_l = CHR_SVLEN(*data->longest);
595 GET_RE_DEBUG_FLAGS_DECL;
597 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
598 SvSetMagicSV(*data->longest, data->last_found);
599 if (*data->longest == data->longest_fixed) {
600 data->offset_fixed = l ? data->last_start_min : data->pos_min;
601 if (data->flags & SF_BEFORE_EOL)
603 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
605 data->flags &= ~SF_FIX_BEFORE_EOL;
606 data->minlen_fixed=minlenp;
607 data->lookbehind_fixed=0;
610 data->offset_float_min = l ? data->last_start_min : data->pos_min;
611 data->offset_float_max = (l
612 ? data->last_start_max
613 : data->pos_min + data->pos_delta);
614 if ((U32)data->offset_float_max > (U32)I32_MAX)
615 data->offset_float_max = I32_MAX;
616 if (data->flags & SF_BEFORE_EOL)
618 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
620 data->flags &= ~SF_FL_BEFORE_EOL;
621 data->minlen_float=minlenp;
622 data->lookbehind_float=0;
625 SvCUR_set(data->last_found, 0);
627 SV * const sv = data->last_found;
628 if (SvUTF8(sv) && SvMAGICAL(sv)) {
629 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
635 data->flags &= ~SF_BEFORE_EOL;
636 DEBUG_STUDYDATA(data,0);
639 /* Can match anything (initialization) */
641 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
643 ANYOF_CLASS_ZERO(cl);
644 ANYOF_BITMAP_SETALL(cl);
645 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
647 cl->flags |= ANYOF_LOCALE;
650 /* Can match anything (initialization) */
652 S_cl_is_anything(const struct regnode_charclass_class *cl)
656 for (value = 0; value <= ANYOF_MAX; value += 2)
657 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
659 if (!(cl->flags & ANYOF_UNICODE_ALL))
661 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
666 /* Can match anything (initialization) */
668 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
670 Zero(cl, 1, struct regnode_charclass_class);
672 cl_anything(pRExC_state, cl);
676 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
678 Zero(cl, 1, struct regnode_charclass_class);
680 cl_anything(pRExC_state, cl);
682 cl->flags |= ANYOF_LOCALE;
685 /* 'And' a given class with another one. Can create false positives */
686 /* We assume that cl is not inverted */
688 S_cl_and(struct regnode_charclass_class *cl,
689 const struct regnode_charclass_class *and_with)
692 assert(and_with->type == ANYOF);
693 if (!(and_with->flags & ANYOF_CLASS)
694 && !(cl->flags & ANYOF_CLASS)
695 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
696 && !(and_with->flags & ANYOF_FOLD)
697 && !(cl->flags & ANYOF_FOLD)) {
700 if (and_with->flags & ANYOF_INVERT)
701 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
702 cl->bitmap[i] &= ~and_with->bitmap[i];
704 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
705 cl->bitmap[i] &= and_with->bitmap[i];
706 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
707 if (!(and_with->flags & ANYOF_EOS))
708 cl->flags &= ~ANYOF_EOS;
710 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
711 !(and_with->flags & ANYOF_INVERT)) {
712 cl->flags &= ~ANYOF_UNICODE_ALL;
713 cl->flags |= ANYOF_UNICODE;
714 ARG_SET(cl, ARG(and_with));
716 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
717 !(and_with->flags & ANYOF_INVERT))
718 cl->flags &= ~ANYOF_UNICODE_ALL;
719 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
720 !(and_with->flags & ANYOF_INVERT))
721 cl->flags &= ~ANYOF_UNICODE;
724 /* 'OR' a given class with another one. Can create false positives */
725 /* We assume that cl is not inverted */
727 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
729 if (or_with->flags & ANYOF_INVERT) {
731 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
732 * <= (B1 | !B2) | (CL1 | !CL2)
733 * which is wasteful if CL2 is small, but we ignore CL2:
734 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
735 * XXXX Can we handle case-fold? Unclear:
736 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
737 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
739 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
740 && !(or_with->flags & ANYOF_FOLD)
741 && !(cl->flags & ANYOF_FOLD) ) {
744 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
745 cl->bitmap[i] |= ~or_with->bitmap[i];
746 } /* XXXX: logic is complicated otherwise */
748 cl_anything(pRExC_state, cl);
751 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
752 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
753 && (!(or_with->flags & ANYOF_FOLD)
754 || (cl->flags & ANYOF_FOLD)) ) {
757 /* OR char bitmap and class bitmap separately */
758 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
759 cl->bitmap[i] |= or_with->bitmap[i];
760 if (or_with->flags & ANYOF_CLASS) {
761 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
762 cl->classflags[i] |= or_with->classflags[i];
763 cl->flags |= ANYOF_CLASS;
766 else { /* XXXX: logic is complicated, leave it along for a moment. */
767 cl_anything(pRExC_state, cl);
770 if (or_with->flags & ANYOF_EOS)
771 cl->flags |= ANYOF_EOS;
773 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
774 ARG(cl) != ARG(or_with)) {
775 cl->flags |= ANYOF_UNICODE_ALL;
776 cl->flags &= ~ANYOF_UNICODE;
778 if (or_with->flags & ANYOF_UNICODE_ALL) {
779 cl->flags |= ANYOF_UNICODE_ALL;
780 cl->flags &= ~ANYOF_UNICODE;
784 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
785 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
786 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
787 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
793 dump_trie_interim_list(trie,next_alloc)
794 dump_trie_interim_table(trie,next_alloc)
796 These routines dump out a trie in a somewhat readable format.
797 The _interim_ variants are used for debugging the interim
798 tables that are used to generate the final compressed
799 representation which is what dump_trie expects.
801 Part of the reason for their existance is to provide a form
802 of documentation as to how the different representations function.
808 Dumps the final compressed table form of the trie to Perl_debug_log.
809 Used for debugging make_trie().
813 S_dump_trie(pTHX_ const struct _reg_trie_data *trie,U32 depth)
816 SV *sv=sv_newmortal();
817 int colwidth= trie->widecharmap ? 6 : 4;
818 GET_RE_DEBUG_FLAGS_DECL;
821 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
822 (int)depth * 2 + 2,"",
823 "Match","Base","Ofs" );
825 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
826 SV ** const tmp = av_fetch( trie->revcharmap, state, 0);
828 PerlIO_printf( Perl_debug_log, "%*s",
830 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
831 PL_colors[0], PL_colors[1],
832 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
833 PERL_PV_ESCAPE_FIRSTCHAR
838 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
839 (int)depth * 2 + 2,"");
841 for( state = 0 ; state < trie->uniquecharcount ; state++ )
842 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
843 PerlIO_printf( Perl_debug_log, "\n");
845 for( state = 1 ; state < trie->statecount ; state++ ) {
846 const U32 base = trie->states[ state ].trans.base;
848 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
850 if ( trie->states[ state ].wordnum ) {
851 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
853 PerlIO_printf( Perl_debug_log, "%6s", "" );
856 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
861 while( ( base + ofs < trie->uniquecharcount ) ||
862 ( base + ofs - trie->uniquecharcount < trie->lasttrans
863 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
866 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
868 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
869 if ( ( base + ofs >= trie->uniquecharcount ) &&
870 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
871 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
873 PerlIO_printf( Perl_debug_log, "%*"UVXf,
875 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
877 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
881 PerlIO_printf( Perl_debug_log, "]");
884 PerlIO_printf( Perl_debug_log, "\n" );
888 dump_trie_interim_list(trie,next_alloc)
889 Dumps a fully constructed but uncompressed trie in list form.
890 List tries normally only are used for construction when the number of
891 possible chars (trie->uniquecharcount) is very high.
892 Used for debugging make_trie().
895 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc,U32 depth)
898 SV *sv=sv_newmortal();
899 int colwidth= trie->widecharmap ? 6 : 4;
900 GET_RE_DEBUG_FLAGS_DECL;
901 /* print out the table precompression. */
902 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
903 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
904 "------:-----+-----------------\n" );
906 for( state=1 ; state < next_alloc ; state ++ ) {
909 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
910 (int)depth * 2 + 2,"", (UV)state );
911 if ( ! trie->states[ state ].wordnum ) {
912 PerlIO_printf( Perl_debug_log, "%5s| ","");
914 PerlIO_printf( Perl_debug_log, "W%4x| ",
915 trie->states[ state ].wordnum
918 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
919 SV ** const tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
921 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
923 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
924 PL_colors[0], PL_colors[1],
925 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
926 PERL_PV_ESCAPE_FIRSTCHAR
928 TRIE_LIST_ITEM(state,charid).forid,
929 (UV)TRIE_LIST_ITEM(state,charid).newstate
932 PerlIO_printf(Perl_debug_log, "\n%*s| ",
933 (int)((depth * 2) + 14), "");
936 PerlIO_printf( Perl_debug_log, "\n");
941 dump_trie_interim_table(trie,next_alloc)
942 Dumps a fully constructed but uncompressed trie in table form.
943 This is the normal DFA style state transition table, with a few
944 twists to facilitate compression later.
945 Used for debugging make_trie().
948 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc, U32 depth)
952 SV *sv=sv_newmortal();
953 int colwidth= trie->widecharmap ? 6 : 4;
954 GET_RE_DEBUG_FLAGS_DECL;
957 print out the table precompression so that we can do a visual check
958 that they are identical.
961 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
963 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
964 SV ** const tmp = av_fetch( trie->revcharmap, charid, 0);
966 PerlIO_printf( Perl_debug_log, "%*s",
968 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
969 PL_colors[0], PL_colors[1],
970 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
971 PERL_PV_ESCAPE_FIRSTCHAR
977 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
979 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
980 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
983 PerlIO_printf( Perl_debug_log, "\n" );
985 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
987 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
988 (int)depth * 2 + 2,"",
989 (UV)TRIE_NODENUM( state ) );
991 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
992 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
994 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
996 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
998 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
999 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1001 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1002 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1009 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1010 startbranch: the first branch in the whole branch sequence
1011 first : start branch of sequence of branch-exact nodes.
1012 May be the same as startbranch
1013 last : Thing following the last branch.
1014 May be the same as tail.
1015 tail : item following the branch sequence
1016 count : words in the sequence
1017 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1018 depth : indent depth
1020 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1022 A trie is an N'ary tree where the branches are determined by digital
1023 decomposition of the key. IE, at the root node you look up the 1st character and
1024 follow that branch repeat until you find the end of the branches. Nodes can be
1025 marked as "accepting" meaning they represent a complete word. Eg:
1029 would convert into the following structure. Numbers represent states, letters
1030 following numbers represent valid transitions on the letter from that state, if
1031 the number is in square brackets it represents an accepting state, otherwise it
1032 will be in parenthesis.
1034 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1038 (1) +-i->(6)-+-s->[7]
1040 +-s->(3)-+-h->(4)-+-e->[5]
1042 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1044 This shows that when matching against the string 'hers' we will begin at state 1
1045 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1046 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1047 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1048 single traverse. We store a mapping from accepting to state to which word was
1049 matched, and then when we have multiple possibilities we try to complete the
1050 rest of the regex in the order in which they occured in the alternation.
1052 The only prior NFA like behaviour that would be changed by the TRIE support is
1053 the silent ignoring of duplicate alternations which are of the form:
1055 / (DUPE|DUPE) X? (?{ ... }) Y /x
1057 Thus EVAL blocks follwing a trie may be called a different number of times with
1058 and without the optimisation. With the optimisations dupes will be silently
1059 ignored. This inconsistant behaviour of EVAL type nodes is well established as
1060 the following demonstrates:
1062 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1064 which prints out 'word' three times, but
1066 'words'=~/(word|word|word)(?{ print $1 })S/
1068 which doesnt print it out at all. This is due to other optimisations kicking in.
1070 Example of what happens on a structural level:
1072 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1074 1: CURLYM[1] {1,32767}(18)
1085 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1086 and should turn into:
1088 1: CURLYM[1] {1,32767}(18)
1090 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1098 Cases where tail != last would be like /(?foo|bar)baz/:
1108 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1109 and would end up looking like:
1112 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1119 d = uvuni_to_utf8_flags(d, uv, 0);
1121 is the recommended Unicode-aware way of saying
1126 #define TRIE_STORE_REVCHAR \
1128 SV *tmp = newSVpvs(""); \
1129 if (UTF) SvUTF8_on(tmp); \
1130 Perl_sv_catpvf( aTHX_ tmp, "%c", (int)uvc ); \
1131 av_push( TRIE_REVCHARMAP(trie), tmp ); \
1134 #define TRIE_READ_CHAR STMT_START { \
1138 if ( foldlen > 0 ) { \
1139 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1144 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1145 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1146 foldlen -= UNISKIP( uvc ); \
1147 scan = foldbuf + UNISKIP( uvc ); \
1150 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1160 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1161 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1162 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1163 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1165 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1166 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1167 TRIE_LIST_CUR( state )++; \
1170 #define TRIE_LIST_NEW(state) STMT_START { \
1171 Newxz( trie->states[ state ].trans.list, \
1172 4, reg_trie_trans_le ); \
1173 TRIE_LIST_CUR( state ) = 1; \
1174 TRIE_LIST_LEN( state ) = 4; \
1177 #define TRIE_HANDLE_WORD(state) STMT_START { \
1178 U16 dupe= trie->states[ state ].wordnum; \
1179 regnode * const noper_next = regnext( noper ); \
1181 if (trie->wordlen) \
1182 trie->wordlen[ curword ] = wordlen; \
1184 /* store the word for dumping */ \
1186 if (OP(noper) != NOTHING) \
1187 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
1189 tmp = newSVpvn( "", 0 ); \
1190 if ( UTF ) SvUTF8_on( tmp ); \
1191 av_push( trie->words, tmp ); \
1196 if ( noper_next < tail ) { \
1198 Newxz( trie->jump, word_count + 1, U16); \
1199 trie->jump[curword] = (U16)(noper_next - convert); \
1201 jumper = noper_next; \
1203 nextbranch= regnext(cur); \
1207 /* So it's a dupe. This means we need to maintain a */\
1208 /* linked-list from the first to the next. */\
1209 /* we only allocate the nextword buffer when there */\
1210 /* a dupe, so first time we have to do the allocation */\
1211 if (!trie->nextword) \
1212 Newxz( trie->nextword, word_count + 1, U16); \
1213 while ( trie->nextword[dupe] ) \
1214 dupe= trie->nextword[dupe]; \
1215 trie->nextword[dupe]= curword; \
1217 /* we haven't inserted this word yet. */ \
1218 trie->states[ state ].wordnum = curword; \
1223 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1224 ( ( base + charid >= ucharcount \
1225 && base + charid < ubound \
1226 && state == trie->trans[ base - ucharcount + charid ].check \
1227 && trie->trans[ base - ucharcount + charid ].next ) \
1228 ? trie->trans[ base - ucharcount + charid ].next \
1229 : ( state==1 ? special : 0 ) \
1233 #define MADE_JUMP_TRIE 2
1234 #define MADE_EXACT_TRIE 4
1237 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1240 /* first pass, loop through and scan words */
1241 reg_trie_data *trie;
1243 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1248 regnode *jumper = NULL;
1249 regnode *nextbranch = NULL;
1250 regnode *convert = NULL;
1251 /* we just use folder as a flag in utf8 */
1252 const U8 * const folder = ( flags == EXACTF
1254 : ( flags == EXACTFL
1260 const U32 data_slot = add_data( pRExC_state, 1, "t" );
1261 SV *re_trie_maxbuff;
1263 /* these are only used during construction but are useful during
1264 * debugging so we store them in the struct when debugging.
1266 STRLEN trie_charcount=0;
1267 AV *trie_revcharmap;
1269 GET_RE_DEBUG_FLAGS_DECL;
1271 PERL_UNUSED_ARG(depth);
1274 Newxz( trie, 1, reg_trie_data );
1276 trie->startstate = 1;
1277 trie->wordcount = word_count;
1278 RExC_rx->data->data[ data_slot ] = (void*)trie;
1279 Newxz( trie->charmap, 256, U16 );
1280 if (!(UTF && folder))
1281 Newxz( trie->bitmap, ANYOF_BITMAP_SIZE, char );
1283 trie->words = newAV();
1285 TRIE_REVCHARMAP(trie) = newAV();
1287 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1288 if (!SvIOK(re_trie_maxbuff)) {
1289 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1292 PerlIO_printf( Perl_debug_log,
1293 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1294 (int)depth * 2 + 2, "",
1295 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1296 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1300 /* Find the node we are going to overwrite */
1301 if ( first == startbranch && OP( last ) != BRANCH ) {
1302 /* whole branch chain */
1305 /* branch sub-chain */
1306 convert = NEXTOPER( first );
1309 /* -- First loop and Setup --
1311 We first traverse the branches and scan each word to determine if it
1312 contains widechars, and how many unique chars there are, this is
1313 important as we have to build a table with at least as many columns as we
1316 We use an array of integers to represent the character codes 0..255
1317 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1318 native representation of the character value as the key and IV's for the
1321 *TODO* If we keep track of how many times each character is used we can
1322 remap the columns so that the table compression later on is more
1323 efficient in terms of memory by ensuring most common value is in the
1324 middle and the least common are on the outside. IMO this would be better
1325 than a most to least common mapping as theres a decent chance the most
1326 common letter will share a node with the least common, meaning the node
1327 will not be compressable. With a middle is most common approach the worst
1328 case is when we have the least common nodes twice.
1332 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1333 regnode * const noper = NEXTOPER( cur );
1334 const U8 *uc = (U8*)STRING( noper );
1335 const U8 * const e = uc + STR_LEN( noper );
1337 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1338 const U8 *scan = (U8*)NULL;
1339 U32 wordlen = 0; /* required init */
1342 if (OP(noper) == NOTHING) {
1347 TRIE_BITMAP_SET(trie,*uc);
1348 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1350 for ( ; uc < e ; uc += len ) {
1351 TRIE_CHARCOUNT(trie)++;
1355 if ( !trie->charmap[ uvc ] ) {
1356 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1358 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1363 if ( !trie->widecharmap )
1364 trie->widecharmap = newHV();
1366 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1369 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1371 if ( !SvTRUE( *svpp ) ) {
1372 sv_setiv( *svpp, ++trie->uniquecharcount );
1377 if( cur == first ) {
1380 } else if (chars < trie->minlen) {
1382 } else if (chars > trie->maxlen) {
1386 } /* end first pass */
1387 DEBUG_TRIE_COMPILE_r(
1388 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1389 (int)depth * 2 + 2,"",
1390 ( trie->widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1391 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1392 (int)trie->minlen, (int)trie->maxlen )
1394 Newxz( trie->wordlen, word_count, U32 );
1397 We now know what we are dealing with in terms of unique chars and
1398 string sizes so we can calculate how much memory a naive
1399 representation using a flat table will take. If it's over a reasonable
1400 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1401 conservative but potentially much slower representation using an array
1404 At the end we convert both representations into the same compressed
1405 form that will be used in regexec.c for matching with. The latter
1406 is a form that cannot be used to construct with but has memory
1407 properties similar to the list form and access properties similar
1408 to the table form making it both suitable for fast searches and
1409 small enough that its feasable to store for the duration of a program.
1411 See the comment in the code where the compressed table is produced
1412 inplace from the flat tabe representation for an explanation of how
1413 the compression works.
1418 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1420 Second Pass -- Array Of Lists Representation
1422 Each state will be represented by a list of charid:state records
1423 (reg_trie_trans_le) the first such element holds the CUR and LEN
1424 points of the allocated array. (See defines above).
1426 We build the initial structure using the lists, and then convert
1427 it into the compressed table form which allows faster lookups
1428 (but cant be modified once converted).
1431 STRLEN transcount = 1;
1433 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1434 "%*sCompiling trie using list compiler\n",
1435 (int)depth * 2 + 2, ""));
1437 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
1441 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1443 regnode * const noper = NEXTOPER( cur );
1444 U8 *uc = (U8*)STRING( noper );
1445 const U8 * const e = uc + STR_LEN( noper );
1446 U32 state = 1; /* required init */
1447 U16 charid = 0; /* sanity init */
1448 U8 *scan = (U8*)NULL; /* sanity init */
1449 STRLEN foldlen = 0; /* required init */
1450 U32 wordlen = 0; /* required init */
1451 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1453 if (OP(noper) != NOTHING) {
1454 for ( ; uc < e ; uc += len ) {
1459 charid = trie->charmap[ uvc ];
1461 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1465 charid=(U16)SvIV( *svpp );
1468 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1475 if ( !trie->states[ state ].trans.list ) {
1476 TRIE_LIST_NEW( state );
1478 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1479 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1480 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1485 newstate = next_alloc++;
1486 TRIE_LIST_PUSH( state, charid, newstate );
1491 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1495 TRIE_HANDLE_WORD(state);
1497 } /* end second pass */
1499 /* next alloc is the NEXT state to be allocated */
1500 trie->statecount = next_alloc;
1501 Renew( trie->states, next_alloc, reg_trie_state );
1503 /* and now dump it out before we compress it */
1504 DEBUG_TRIE_COMPILE_MORE_r(
1505 dump_trie_interim_list(trie,next_alloc,depth+1)
1508 Newxz( trie->trans, transcount ,reg_trie_trans );
1515 for( state=1 ; state < next_alloc ; state ++ ) {
1519 DEBUG_TRIE_COMPILE_MORE_r(
1520 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1524 if (trie->states[state].trans.list) {
1525 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1529 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1530 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1531 if ( forid < minid ) {
1533 } else if ( forid > maxid ) {
1537 if ( transcount < tp + maxid - minid + 1) {
1539 Renew( trie->trans, transcount, reg_trie_trans );
1540 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1542 base = trie->uniquecharcount + tp - minid;
1543 if ( maxid == minid ) {
1545 for ( ; zp < tp ; zp++ ) {
1546 if ( ! trie->trans[ zp ].next ) {
1547 base = trie->uniquecharcount + zp - minid;
1548 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1549 trie->trans[ zp ].check = state;
1555 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1556 trie->trans[ tp ].check = state;
1561 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1562 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1563 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1564 trie->trans[ tid ].check = state;
1566 tp += ( maxid - minid + 1 );
1568 Safefree(trie->states[ state ].trans.list);
1571 DEBUG_TRIE_COMPILE_MORE_r(
1572 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1575 trie->states[ state ].trans.base=base;
1577 trie->lasttrans = tp + 1;
1581 Second Pass -- Flat Table Representation.
1583 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1584 We know that we will need Charcount+1 trans at most to store the data
1585 (one row per char at worst case) So we preallocate both structures
1586 assuming worst case.
1588 We then construct the trie using only the .next slots of the entry
1591 We use the .check field of the first entry of the node temporarily to
1592 make compression both faster and easier by keeping track of how many non
1593 zero fields are in the node.
1595 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1598 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1599 number representing the first entry of the node, and state as a
1600 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1601 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1602 are 2 entrys per node. eg:
1610 The table is internally in the right hand, idx form. However as we also
1611 have to deal with the states array which is indexed by nodenum we have to
1612 use TRIE_NODENUM() to convert.
1615 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1616 "%*sCompiling trie using table compiler\n",
1617 (int)depth * 2 + 2, ""));
1619 Newxz( trie->trans, ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1,
1621 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
1622 next_alloc = trie->uniquecharcount + 1;
1625 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1627 regnode * const noper = NEXTOPER( cur );
1628 const U8 *uc = (U8*)STRING( noper );
1629 const U8 * const e = uc + STR_LEN( noper );
1631 U32 state = 1; /* required init */
1633 U16 charid = 0; /* sanity init */
1634 U32 accept_state = 0; /* sanity init */
1635 U8 *scan = (U8*)NULL; /* sanity init */
1637 STRLEN foldlen = 0; /* required init */
1638 U32 wordlen = 0; /* required init */
1639 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1641 if ( OP(noper) != NOTHING ) {
1642 for ( ; uc < e ; uc += len ) {
1647 charid = trie->charmap[ uvc ];
1649 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1650 charid = svpp ? (U16)SvIV(*svpp) : 0;
1654 if ( !trie->trans[ state + charid ].next ) {
1655 trie->trans[ state + charid ].next = next_alloc;
1656 trie->trans[ state ].check++;
1657 next_alloc += trie->uniquecharcount;
1659 state = trie->trans[ state + charid ].next;
1661 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1663 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1666 accept_state = TRIE_NODENUM( state );
1667 TRIE_HANDLE_WORD(accept_state);
1669 } /* end second pass */
1671 /* and now dump it out before we compress it */
1672 DEBUG_TRIE_COMPILE_MORE_r(
1673 dump_trie_interim_table(trie,next_alloc,depth+1)
1678 * Inplace compress the table.*
1680 For sparse data sets the table constructed by the trie algorithm will
1681 be mostly 0/FAIL transitions or to put it another way mostly empty.
1682 (Note that leaf nodes will not contain any transitions.)
1684 This algorithm compresses the tables by eliminating most such
1685 transitions, at the cost of a modest bit of extra work during lookup:
1687 - Each states[] entry contains a .base field which indicates the
1688 index in the state[] array wheres its transition data is stored.
1690 - If .base is 0 there are no valid transitions from that node.
1692 - If .base is nonzero then charid is added to it to find an entry in
1695 -If trans[states[state].base+charid].check!=state then the
1696 transition is taken to be a 0/Fail transition. Thus if there are fail
1697 transitions at the front of the node then the .base offset will point
1698 somewhere inside the previous nodes data (or maybe even into a node
1699 even earlier), but the .check field determines if the transition is
1703 The following process inplace converts the table to the compressed
1704 table: We first do not compress the root node 1,and mark its all its
1705 .check pointers as 1 and set its .base pointer as 1 as well. This
1706 allows to do a DFA construction from the compressed table later, and
1707 ensures that any .base pointers we calculate later are greater than
1710 - We set 'pos' to indicate the first entry of the second node.
1712 - We then iterate over the columns of the node, finding the first and
1713 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1714 and set the .check pointers accordingly, and advance pos
1715 appropriately and repreat for the next node. Note that when we copy
1716 the next pointers we have to convert them from the original
1717 NODEIDX form to NODENUM form as the former is not valid post
1720 - If a node has no transitions used we mark its base as 0 and do not
1721 advance the pos pointer.
1723 - If a node only has one transition we use a second pointer into the
1724 structure to fill in allocated fail transitions from other states.
1725 This pointer is independent of the main pointer and scans forward
1726 looking for null transitions that are allocated to a state. When it
1727 finds one it writes the single transition into the "hole". If the
1728 pointer doesnt find one the single transition is appended as normal.
1730 - Once compressed we can Renew/realloc the structures to release the
1733 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1734 specifically Fig 3.47 and the associated pseudocode.
1738 const U32 laststate = TRIE_NODENUM( next_alloc );
1741 trie->statecount = laststate;
1743 for ( state = 1 ; state < laststate ; state++ ) {
1745 const U32 stateidx = TRIE_NODEIDX( state );
1746 const U32 o_used = trie->trans[ stateidx ].check;
1747 U32 used = trie->trans[ stateidx ].check;
1748 trie->trans[ stateidx ].check = 0;
1750 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1751 if ( flag || trie->trans[ stateidx + charid ].next ) {
1752 if ( trie->trans[ stateidx + charid ].next ) {
1754 for ( ; zp < pos ; zp++ ) {
1755 if ( ! trie->trans[ zp ].next ) {
1759 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1760 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1761 trie->trans[ zp ].check = state;
1762 if ( ++zp > pos ) pos = zp;
1769 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1771 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1772 trie->trans[ pos ].check = state;
1777 trie->lasttrans = pos + 1;
1778 Renew( trie->states, laststate, reg_trie_state);
1779 DEBUG_TRIE_COMPILE_MORE_r(
1780 PerlIO_printf( Perl_debug_log,
1781 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1782 (int)depth * 2 + 2,"",
1783 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1786 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1789 } /* end table compress */
1791 DEBUG_TRIE_COMPILE_MORE_r(
1792 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1793 (int)depth * 2 + 2, "",
1794 (UV)trie->statecount,
1795 (UV)trie->lasttrans)
1797 /* resize the trans array to remove unused space */
1798 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
1800 /* and now dump out the compressed format */
1801 DEBUG_TRIE_COMPILE_r(
1802 dump_trie(trie,depth+1)
1805 { /* Modify the program and insert the new TRIE node*/
1806 U8 nodetype =(U8)(flags & 0xFF);
1810 regnode *optimize = NULL;
1812 U32 mjd_nodelen = 0;
1815 This means we convert either the first branch or the first Exact,
1816 depending on whether the thing following (in 'last') is a branch
1817 or not and whther first is the startbranch (ie is it a sub part of
1818 the alternation or is it the whole thing.)
1819 Assuming its a sub part we conver the EXACT otherwise we convert
1820 the whole branch sequence, including the first.
1822 /* Find the node we are going to overwrite */
1823 if ( first != startbranch || OP( last ) == BRANCH ) {
1824 /* branch sub-chain */
1825 NEXT_OFF( first ) = (U16)(last - first);
1827 mjd_offset= Node_Offset((convert));
1828 mjd_nodelen= Node_Length((convert));
1830 /* whole branch chain */
1833 const regnode *nop = NEXTOPER( convert );
1834 mjd_offset= Node_Offset((nop));
1835 mjd_nodelen= Node_Length((nop));
1840 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1841 (int)depth * 2 + 2, "",
1842 (UV)mjd_offset, (UV)mjd_nodelen)
1845 /* But first we check to see if there is a common prefix we can
1846 split out as an EXACT and put in front of the TRIE node. */
1847 trie->startstate= 1;
1848 if ( trie->bitmap && !trie->widecharmap && !trie->jump ) {
1850 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
1854 const U32 base = trie->states[ state ].trans.base;
1856 if ( trie->states[state].wordnum )
1859 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1860 if ( ( base + ofs >= trie->uniquecharcount ) &&
1861 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1862 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1864 if ( ++count > 1 ) {
1865 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), ofs, 0);
1866 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
1867 if ( state == 1 ) break;
1869 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1871 PerlIO_printf(Perl_debug_log,
1872 "%*sNew Start State=%"UVuf" Class: [",
1873 (int)depth * 2 + 2, "",
1876 SV ** const tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1877 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
1879 TRIE_BITMAP_SET(trie,*ch);
1881 TRIE_BITMAP_SET(trie, folder[ *ch ]);
1883 PerlIO_printf(Perl_debug_log, (char*)ch)
1887 TRIE_BITMAP_SET(trie,*ch);
1889 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1890 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1896 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1897 const char *ch = SvPV_nolen_const( *tmp );
1899 PerlIO_printf( Perl_debug_log,
1900 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1901 (int)depth * 2 + 2, "",
1902 (UV)state, (UV)idx, ch)
1905 OP( convert ) = nodetype;
1906 str=STRING(convert);
1915 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
1921 regnode *n = convert+NODE_SZ_STR(convert);
1922 NEXT_OFF(convert) = NODE_SZ_STR(convert);
1923 trie->startstate = state;
1924 trie->minlen -= (state - 1);
1925 trie->maxlen -= (state - 1);
1927 regnode *fix = convert;
1929 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1930 while( ++fix < n ) {
1931 Set_Node_Offset_Length(fix, 0, 0);
1937 NEXT_OFF(convert) = (U16)(tail - convert);
1938 DEBUG_r(optimize= n);
1944 if ( trie->maxlen ) {
1945 NEXT_OFF( convert ) = (U16)(tail - convert);
1946 ARG_SET( convert, data_slot );
1947 /* Store the offset to the first unabsorbed branch in
1948 jump[0], which is otherwise unused by the jump logic.
1949 We use this when dumping a trie and during optimisation. */
1951 trie->jump[0] = (U16)(nextbranch - convert);
1954 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1955 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
1957 OP( convert ) = TRIEC;
1958 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
1959 Safefree(trie->bitmap);
1962 OP( convert ) = TRIE;
1964 /* store the type in the flags */
1965 convert->flags = nodetype;
1969 + regarglen[ OP( convert ) ];
1971 /* XXX We really should free up the resource in trie now,
1972 as we won't use them - (which resources?) dmq */
1974 /* needed for dumping*/
1975 DEBUG_r(if (optimize) {
1976 regnode *opt = convert;
1977 while ( ++opt < optimize) {
1978 Set_Node_Offset_Length(opt,0,0);
1981 Try to clean up some of the debris left after the
1984 while( optimize < jumper ) {
1985 mjd_nodelen += Node_Length((optimize));
1986 OP( optimize ) = OPTIMIZED;
1987 Set_Node_Offset_Length(optimize,0,0);
1990 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
1992 } /* end node insert */
1994 SvREFCNT_dec(TRIE_REVCHARMAP(trie));
1998 : trie->startstate>1
2004 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2006 /* The Trie is constructed and compressed now so we can build a fail array now if its needed
2008 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2009 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2012 We find the fail state for each state in the trie, this state is the longest proper
2013 suffix of the current states 'word' that is also a proper prefix of another word in our
2014 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2015 the DFA not to have to restart after its tried and failed a word at a given point, it
2016 simply continues as though it had been matching the other word in the first place.
2018 'abcdgu'=~/abcdefg|cdgu/
2019 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2020 fail, which would bring use to the state representing 'd' in the second word where we would
2021 try 'g' and succeed, prodceding to match 'cdgu'.
2023 /* add a fail transition */
2024 reg_trie_data *trie=(reg_trie_data *)RExC_rx->data->data[ARG(source)];
2026 const U32 ucharcount = trie->uniquecharcount;
2027 const U32 numstates = trie->statecount;
2028 const U32 ubound = trie->lasttrans + ucharcount;
2032 U32 base = trie->states[ 1 ].trans.base;
2035 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2036 GET_RE_DEBUG_FLAGS_DECL;
2038 PERL_UNUSED_ARG(depth);
2042 ARG_SET( stclass, data_slot );
2043 Newxz( aho, 1, reg_ac_data );
2044 RExC_rx->data->data[ data_slot ] = (void*)aho;
2046 aho->states=(reg_trie_state *)savepvn((const char*)trie->states,
2047 numstates * sizeof(reg_trie_state));
2048 Newxz( q, numstates, U32);
2049 Newxz( aho->fail, numstates, U32 );
2052 /* initialize fail[0..1] to be 1 so that we always have
2053 a valid final fail state */
2054 fail[ 0 ] = fail[ 1 ] = 1;
2056 for ( charid = 0; charid < ucharcount ; charid++ ) {
2057 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2059 q[ q_write ] = newstate;
2060 /* set to point at the root */
2061 fail[ q[ q_write++ ] ]=1;
2064 while ( q_read < q_write) {
2065 const U32 cur = q[ q_read++ % numstates ];
2066 base = trie->states[ cur ].trans.base;
2068 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2069 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2071 U32 fail_state = cur;
2074 fail_state = fail[ fail_state ];
2075 fail_base = aho->states[ fail_state ].trans.base;
2076 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2078 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2079 fail[ ch_state ] = fail_state;
2080 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2082 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2084 q[ q_write++ % numstates] = ch_state;
2088 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2089 when we fail in state 1, this allows us to use the
2090 charclass scan to find a valid start char. This is based on the principle
2091 that theres a good chance the string being searched contains lots of stuff
2092 that cant be a start char.
2094 fail[ 0 ] = fail[ 1 ] = 0;
2095 DEBUG_TRIE_COMPILE_r({
2096 PerlIO_printf(Perl_debug_log,
2097 "%*sStclass Failtable (%"UVuf" states): 0",
2098 (int)(depth * 2), "", (UV)numstates
2100 for( q_read=1; q_read<numstates; q_read++ ) {
2101 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2103 PerlIO_printf(Perl_debug_log, "\n");
2106 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2111 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2112 * These need to be revisited when a newer toolchain becomes available.
2114 #if defined(__sparc64__) && defined(__GNUC__)
2115 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2116 # undef SPARC64_GCC_WORKAROUND
2117 # define SPARC64_GCC_WORKAROUND 1
2121 #define DEBUG_PEEP(str,scan,depth) \
2122 DEBUG_OPTIMISE_r({if (scan){ \
2123 SV * const mysv=sv_newmortal(); \
2124 regnode *Next = regnext(scan); \
2125 regprop(RExC_rx, mysv, scan); \
2126 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2127 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2128 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2135 #define JOIN_EXACT(scan,min,flags) \
2136 if (PL_regkind[OP(scan)] == EXACT) \
2137 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2140 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2141 /* Merge several consecutive EXACTish nodes into one. */
2142 regnode *n = regnext(scan);
2144 regnode *next = scan + NODE_SZ_STR(scan);
2148 regnode *stop = scan;
2149 GET_RE_DEBUG_FLAGS_DECL;
2151 PERL_UNUSED_ARG(depth);
2153 #ifndef EXPERIMENTAL_INPLACESCAN
2154 PERL_UNUSED_ARG(flags);
2155 PERL_UNUSED_ARG(val);
2157 DEBUG_PEEP("join",scan,depth);
2159 /* Skip NOTHING, merge EXACT*. */
2161 ( PL_regkind[OP(n)] == NOTHING ||
2162 (stringok && (OP(n) == OP(scan))))
2164 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2166 if (OP(n) == TAIL || n > next)
2168 if (PL_regkind[OP(n)] == NOTHING) {
2169 DEBUG_PEEP("skip:",n,depth);
2170 NEXT_OFF(scan) += NEXT_OFF(n);
2171 next = n + NODE_STEP_REGNODE;
2178 else if (stringok) {
2179 const unsigned int oldl = STR_LEN(scan);
2180 regnode * const nnext = regnext(n);
2182 DEBUG_PEEP("merg",n,depth);
2185 if (oldl + STR_LEN(n) > U8_MAX)
2187 NEXT_OFF(scan) += NEXT_OFF(n);
2188 STR_LEN(scan) += STR_LEN(n);
2189 next = n + NODE_SZ_STR(n);
2190 /* Now we can overwrite *n : */
2191 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2199 #ifdef EXPERIMENTAL_INPLACESCAN
2200 if (flags && !NEXT_OFF(n)) {
2201 DEBUG_PEEP("atch", val, depth);
2202 if (reg_off_by_arg[OP(n)]) {
2203 ARG_SET(n, val - n);
2206 NEXT_OFF(n) = val - n;
2213 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2215 Two problematic code points in Unicode casefolding of EXACT nodes:
2217 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2218 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2224 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2225 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2227 This means that in case-insensitive matching (or "loose matching",
2228 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2229 length of the above casefolded versions) can match a target string
2230 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2231 This would rather mess up the minimum length computation.
2233 What we'll do is to look for the tail four bytes, and then peek
2234 at the preceding two bytes to see whether we need to decrease
2235 the minimum length by four (six minus two).
2237 Thanks to the design of UTF-8, there cannot be false matches:
2238 A sequence of valid UTF-8 bytes cannot be a subsequence of
2239 another valid sequence of UTF-8 bytes.
2242 char * const s0 = STRING(scan), *s, *t;
2243 char * const s1 = s0 + STR_LEN(scan) - 1;
2244 char * const s2 = s1 - 4;
2245 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2246 const char t0[] = "\xaf\x49\xaf\x42";
2248 const char t0[] = "\xcc\x88\xcc\x81";
2250 const char * const t1 = t0 + 3;
2253 s < s2 && (t = ninstr(s, s1, t0, t1));
2256 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2257 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2259 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2260 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2268 n = scan + NODE_SZ_STR(scan);
2270 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2277 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2281 /* REx optimizer. Converts nodes into quickier variants "in place".
2282 Finds fixed substrings. */
2284 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2285 to the position after last scanned or to NULL. */
2287 #define INIT_AND_WITHP \
2288 assert(!and_withp); \
2289 Newx(and_withp,1,struct regnode_charclass_class); \
2290 SAVEFREEPV(and_withp)
2292 /* this is a chain of data about sub patterns we are processing that
2293 need to be handled seperately/specially in study_chunk. Its so
2294 we can simulate recursion without losing state. */
2296 typedef struct scan_frame {
2297 regnode *last; /* last node to process in this frame */
2298 regnode *next; /* next node to process when last is reached */
2299 struct scan_frame *prev; /*previous frame*/
2300 I32 stop; /* what stopparen do we use */
2304 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2305 I32 *minlenp, I32 *deltap,
2310 struct regnode_charclass_class *and_withp,
2311 U32 flags, U32 depth)
2312 /* scanp: Start here (read-write). */
2313 /* deltap: Write maxlen-minlen here. */
2314 /* last: Stop before this one. */
2315 /* data: string data about the pattern */
2316 /* stopparen: treat close N as END */
2317 /* recursed: which subroutines have we recursed into */
2318 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2321 I32 min = 0, pars = 0, code;
2322 regnode *scan = *scanp, *next;
2324 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2325 int is_inf_internal = 0; /* The studied chunk is infinite */
2326 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2327 scan_data_t data_fake;
2328 SV *re_trie_maxbuff = NULL;
2329 regnode *first_non_open = scan;
2330 I32 stopmin = I32_MAX;
2331 scan_frame last_frame= { last, NULL, NULL, stopparen };
2332 scan_frame *frame=&last_frame;
2334 GET_RE_DEBUG_FLAGS_DECL;
2337 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2341 while (first_non_open && OP(first_non_open) == OPEN)
2342 first_non_open=regnext(first_non_open);
2347 DEBUG_PEEP("FBEG",scan,depth);
2348 while ( scan && OP(scan) != END && scan < frame->last ) {
2349 /* Peephole optimizer: */
2350 DEBUG_STUDYDATA(data,depth);
2351 DEBUG_PEEP("Peep",scan,depth);
2352 JOIN_EXACT(scan,&min,0);
2354 /* Follow the next-chain of the current node and optimize
2355 away all the NOTHINGs from it. */
2356 if (OP(scan) != CURLYX) {
2357 const int max = (reg_off_by_arg[OP(scan)]
2359 /* I32 may be smaller than U16 on CRAYs! */
2360 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2361 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2365 /* Skip NOTHING and LONGJMP. */
2366 while ((n = regnext(n))
2367 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2368 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2369 && off + noff < max)
2371 if (reg_off_by_arg[OP(scan)])
2374 NEXT_OFF(scan) = off;
2377 /* The principal pseudo-switch. Cannot be a switch, since we
2378 look into several different things. */
2379 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2380 || OP(scan) == IFTHEN) {
2381 next = regnext(scan);
2383 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2385 if (OP(next) == code || code == IFTHEN) {
2386 /* NOTE - There is similar code to this block below for handling
2387 TRIE nodes on a re-study. If you change stuff here check there
2389 I32 max1 = 0, min1 = I32_MAX, num = 0;
2390 struct regnode_charclass_class accum;
2391 regnode * const startbranch=scan;
2393 if (flags & SCF_DO_SUBSTR)
2394 scan_commit(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2395 if (flags & SCF_DO_STCLASS)
2396 cl_init_zero(pRExC_state, &accum);
2398 while (OP(scan) == code) {
2399 I32 deltanext, minnext, f = 0, fake;
2400 struct regnode_charclass_class this_class;
2403 data_fake.flags = 0;
2405 data_fake.whilem_c = data->whilem_c;
2406 data_fake.last_closep = data->last_closep;
2409 data_fake.last_closep = &fake;
2410 next = regnext(scan);
2411 scan = NEXTOPER(scan);
2413 scan = NEXTOPER(scan);
2414 if (flags & SCF_DO_STCLASS) {
2415 cl_init(pRExC_state, &this_class);
2416 data_fake.start_class = &this_class;
2417 f = SCF_DO_STCLASS_AND;
2419 if (flags & SCF_WHILEM_VISITED_POS)
2420 f |= SCF_WHILEM_VISITED_POS;
2422 /* we suppose the run is continuous, last=next...*/
2423 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2425 stopparen, recursed, NULL, f,depth+1);
2428 if (max1 < minnext + deltanext)
2429 max1 = minnext + deltanext;
2430 if (deltanext == I32_MAX)
2431 is_inf = is_inf_internal = 1;
2433 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2435 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2436 if ( stopmin > minnext)
2437 stopmin = min + min1;
2438 flags &= ~SCF_DO_SUBSTR;
2440 data->flags |= SCF_SEEN_ACCEPT;
2443 if (data_fake.flags & SF_HAS_EVAL)
2444 data->flags |= SF_HAS_EVAL;
2445 data->whilem_c = data_fake.whilem_c;
2447 if (flags & SCF_DO_STCLASS)
2448 cl_or(pRExC_state, &accum, &this_class);
2450 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2452 if (flags & SCF_DO_SUBSTR) {
2453 data->pos_min += min1;
2454 data->pos_delta += max1 - min1;
2455 if (max1 != min1 || is_inf)
2456 data->longest = &(data->longest_float);
2459 delta += max1 - min1;
2460 if (flags & SCF_DO_STCLASS_OR) {
2461 cl_or(pRExC_state, data->start_class, &accum);
2463 cl_and(data->start_class, and_withp);
2464 flags &= ~SCF_DO_STCLASS;
2467 else if (flags & SCF_DO_STCLASS_AND) {
2469 cl_and(data->start_class, &accum);
2470 flags &= ~SCF_DO_STCLASS;
2473 /* Switch to OR mode: cache the old value of
2474 * data->start_class */
2476 StructCopy(data->start_class, and_withp,
2477 struct regnode_charclass_class);
2478 flags &= ~SCF_DO_STCLASS_AND;
2479 StructCopy(&accum, data->start_class,
2480 struct regnode_charclass_class);
2481 flags |= SCF_DO_STCLASS_OR;
2482 data->start_class->flags |= ANYOF_EOS;
2486 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2489 Assuming this was/is a branch we are dealing with: 'scan' now
2490 points at the item that follows the branch sequence, whatever
2491 it is. We now start at the beginning of the sequence and look
2498 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2500 If we can find such a subseqence we need to turn the first
2501 element into a trie and then add the subsequent branch exact
2502 strings to the trie.
2506 1. patterns where the whole set of branch can be converted.
2508 2. patterns where only a subset can be converted.
2510 In case 1 we can replace the whole set with a single regop
2511 for the trie. In case 2 we need to keep the start and end
2514 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2515 becomes BRANCH TRIE; BRANCH X;
2517 There is an additional case, that being where there is a
2518 common prefix, which gets split out into an EXACT like node
2519 preceding the TRIE node.
2521 If x(1..n)==tail then we can do a simple trie, if not we make
2522 a "jump" trie, such that when we match the appropriate word
2523 we "jump" to the appopriate tail node. Essentailly we turn
2524 a nested if into a case structure of sorts.
2529 if (!re_trie_maxbuff) {
2530 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2531 if (!SvIOK(re_trie_maxbuff))
2532 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2534 if ( SvIV(re_trie_maxbuff)>=0 ) {
2536 regnode *first = (regnode *)NULL;
2537 regnode *last = (regnode *)NULL;
2538 regnode *tail = scan;
2543 SV * const mysv = sv_newmortal(); /* for dumping */
2545 /* var tail is used because there may be a TAIL
2546 regop in the way. Ie, the exacts will point to the
2547 thing following the TAIL, but the last branch will
2548 point at the TAIL. So we advance tail. If we
2549 have nested (?:) we may have to move through several
2553 while ( OP( tail ) == TAIL ) {
2554 /* this is the TAIL generated by (?:) */
2555 tail = regnext( tail );
2560 regprop(RExC_rx, mysv, tail );
2561 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2562 (int)depth * 2 + 2, "",
2563 "Looking for TRIE'able sequences. Tail node is: ",
2564 SvPV_nolen_const( mysv )
2570 step through the branches, cur represents each
2571 branch, noper is the first thing to be matched
2572 as part of that branch and noper_next is the
2573 regnext() of that node. if noper is an EXACT
2574 and noper_next is the same as scan (our current
2575 position in the regex) then the EXACT branch is
2576 a possible optimization target. Once we have
2577 two or more consequetive such branches we can
2578 create a trie of the EXACT's contents and stich
2579 it in place. If the sequence represents all of
2580 the branches we eliminate the whole thing and
2581 replace it with a single TRIE. If it is a
2582 subsequence then we need to stitch it in. This
2583 means the first branch has to remain, and needs
2584 to be repointed at the item on the branch chain
2585 following the last branch optimized. This could
2586 be either a BRANCH, in which case the
2587 subsequence is internal, or it could be the
2588 item following the branch sequence in which
2589 case the subsequence is at the end.
2593 /* dont use tail as the end marker for this traverse */
2594 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2595 regnode * const noper = NEXTOPER( cur );
2596 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
2597 regnode * const noper_next = regnext( noper );
2601 regprop(RExC_rx, mysv, cur);
2602 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2603 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2605 regprop(RExC_rx, mysv, noper);
2606 PerlIO_printf( Perl_debug_log, " -> %s",
2607 SvPV_nolen_const(mysv));
2610 regprop(RExC_rx, mysv, noper_next );
2611 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2612 SvPV_nolen_const(mysv));
2614 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2615 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2617 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2618 : PL_regkind[ OP( noper ) ] == EXACT )
2619 || OP(noper) == NOTHING )
2621 && noper_next == tail
2626 if ( !first || optype == NOTHING ) {
2627 if (!first) first = cur;
2628 optype = OP( noper );
2634 make_trie( pRExC_state,
2635 startbranch, first, cur, tail, count,
2638 if ( PL_regkind[ OP( noper ) ] == EXACT
2640 && noper_next == tail
2645 optype = OP( noper );
2655 regprop(RExC_rx, mysv, cur);
2656 PerlIO_printf( Perl_debug_log,
2657 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2658 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2662 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
2663 #ifdef TRIE_STUDY_OPT
2664 if ( ((made == MADE_EXACT_TRIE &&
2665 startbranch == first)
2666 || ( first_non_open == first )) &&
2668 flags |= SCF_TRIE_RESTUDY;
2669 if ( startbranch == first
2672 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2682 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2683 scan = NEXTOPER(NEXTOPER(scan));
2684 } else /* single branch is optimized. */
2685 scan = NEXTOPER(scan);
2687 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2688 scan_frame *newframe = NULL;
2693 if (OP(scan) != SUSPEND) {
2694 /* set the pointer */
2695 if (OP(scan) == GOSUB) {
2697 RExC_recurse[ARG2L(scan)] = scan;
2698 start = RExC_open_parens[paren-1];
2699 end = RExC_close_parens[paren-1];
2702 start = RExC_rx->program + 1;
2706 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2707 SAVEFREEPV(recursed);
2709 if (!PAREN_TEST(recursed,paren+1)) {
2710 PAREN_SET(recursed,paren+1);
2711 Newx(newframe,1,scan_frame);
2713 if (flags & SCF_DO_SUBSTR) {
2714 scan_commit(pRExC_state,data,minlenp);
2715 data->longest = &(data->longest_float);
2717 is_inf = is_inf_internal = 1;
2718 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2719 cl_anything(pRExC_state, data->start_class);
2720 flags &= ~SCF_DO_STCLASS;
2723 Newx(newframe,1,scan_frame);
2726 end = regnext(scan);
2731 SAVEFREEPV(newframe);
2732 newframe->next = regnext(scan);
2733 newframe->last = end;
2734 newframe->stop = stopparen;
2735 newframe->prev = frame;
2742 else if (OP(scan) == EXACT) {
2743 I32 l = STR_LEN(scan);
2746 const U8 * const s = (U8*)STRING(scan);
2747 l = utf8_length(s, s + l);
2748 uc = utf8_to_uvchr(s, NULL);
2750 uc = *((U8*)STRING(scan));
2753 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2754 /* The code below prefers earlier match for fixed
2755 offset, later match for variable offset. */
2756 if (data->last_end == -1) { /* Update the start info. */
2757 data->last_start_min = data->pos_min;
2758 data->last_start_max = is_inf
2759 ? I32_MAX : data->pos_min + data->pos_delta;
2761 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2763 SvUTF8_on(data->last_found);
2765 SV * const sv = data->last_found;
2766 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2767 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2768 if (mg && mg->mg_len >= 0)
2769 mg->mg_len += utf8_length((U8*)STRING(scan),
2770 (U8*)STRING(scan)+STR_LEN(scan));
2772 data->last_end = data->pos_min + l;
2773 data->pos_min += l; /* As in the first entry. */
2774 data->flags &= ~SF_BEFORE_EOL;
2776 if (flags & SCF_DO_STCLASS_AND) {
2777 /* Check whether it is compatible with what we know already! */
2781 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2782 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2783 && (!(data->start_class->flags & ANYOF_FOLD)
2784 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2787 ANYOF_CLASS_ZERO(data->start_class);
2788 ANYOF_BITMAP_ZERO(data->start_class);
2790 ANYOF_BITMAP_SET(data->start_class, uc);
2791 data->start_class->flags &= ~ANYOF_EOS;
2793 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2795 else if (flags & SCF_DO_STCLASS_OR) {
2796 /* false positive possible if the class is case-folded */
2798 ANYOF_BITMAP_SET(data->start_class, uc);
2800 data->start_class->flags |= ANYOF_UNICODE_ALL;
2801 data->start_class->flags &= ~ANYOF_EOS;
2802 cl_and(data->start_class, and_withp);
2804 flags &= ~SCF_DO_STCLASS;
2806 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
2807 I32 l = STR_LEN(scan);
2808 UV uc = *((U8*)STRING(scan));
2810 /* Search for fixed substrings supports EXACT only. */
2811 if (flags & SCF_DO_SUBSTR) {
2813 scan_commit(pRExC_state, data, minlenp);
2816 const U8 * const s = (U8 *)STRING(scan);
2817 l = utf8_length(s, s + l);
2818 uc = utf8_to_uvchr(s, NULL);
2821 if (flags & SCF_DO_SUBSTR)
2823 if (flags & SCF_DO_STCLASS_AND) {
2824 /* Check whether it is compatible with what we know already! */
2828 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2829 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2830 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2832 ANYOF_CLASS_ZERO(data->start_class);
2833 ANYOF_BITMAP_ZERO(data->start_class);
2835 ANYOF_BITMAP_SET(data->start_class, uc);
2836 data->start_class->flags &= ~ANYOF_EOS;
2837 data->start_class->flags |= ANYOF_FOLD;
2838 if (OP(scan) == EXACTFL)
2839 data->start_class->flags |= ANYOF_LOCALE;
2842 else if (flags & SCF_DO_STCLASS_OR) {
2843 if (data->start_class->flags & ANYOF_FOLD) {
2844 /* false positive possible if the class is case-folded.
2845 Assume that the locale settings are the same... */
2847 ANYOF_BITMAP_SET(data->start_class, uc);
2848 data->start_class->flags &= ~ANYOF_EOS;
2850 cl_and(data->start_class, and_withp);
2852 flags &= ~SCF_DO_STCLASS;
2854 else if (strchr((const char*)PL_varies,OP(scan))) {
2855 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2856 I32 f = flags, pos_before = 0;
2857 regnode * const oscan = scan;
2858 struct regnode_charclass_class this_class;
2859 struct regnode_charclass_class *oclass = NULL;
2860 I32 next_is_eval = 0;
2862 switch (PL_regkind[OP(scan)]) {
2863 case WHILEM: /* End of (?:...)* . */
2864 scan = NEXTOPER(scan);
2867 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2868 next = NEXTOPER(scan);
2869 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2871 maxcount = REG_INFTY;
2872 next = regnext(scan);
2873 scan = NEXTOPER(scan);
2877 if (flags & SCF_DO_SUBSTR)
2882 if (flags & SCF_DO_STCLASS) {
2884 maxcount = REG_INFTY;
2885 next = regnext(scan);
2886 scan = NEXTOPER(scan);
2889 is_inf = is_inf_internal = 1;
2890 scan = regnext(scan);
2891 if (flags & SCF_DO_SUBSTR) {
2892 scan_commit(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
2893 data->longest = &(data->longest_float);
2895 goto optimize_curly_tail;
2897 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
2898 && (scan->flags == stopparen))
2903 mincount = ARG1(scan);
2904 maxcount = ARG2(scan);
2906 next = regnext(scan);
2907 if (OP(scan) == CURLYX) {
2908 I32 lp = (data ? *(data->last_closep) : 0);
2909 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
2911 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2912 next_is_eval = (OP(scan) == EVAL);
2914 if (flags & SCF_DO_SUBSTR) {
2915 if (mincount == 0) scan_commit(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
2916 pos_before = data->pos_min;
2920 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2922 data->flags |= SF_IS_INF;
2924 if (flags & SCF_DO_STCLASS) {
2925 cl_init(pRExC_state, &this_class);
2926 oclass = data->start_class;
2927 data->start_class = &this_class;
2928 f |= SCF_DO_STCLASS_AND;
2929 f &= ~SCF_DO_STCLASS_OR;
2931 /* These are the cases when once a subexpression
2932 fails at a particular position, it cannot succeed
2933 even after backtracking at the enclosing scope.
2935 XXXX what if minimal match and we are at the
2936 initial run of {n,m}? */
2937 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2938 f &= ~SCF_WHILEM_VISITED_POS;
2940 /* This will finish on WHILEM, setting scan, or on NULL: */
2941 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2942 last, data, stopparen, recursed, NULL,
2944 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
2946 if (flags & SCF_DO_STCLASS)
2947 data->start_class = oclass;
2948 if (mincount == 0 || minnext == 0) {
2949 if (flags & SCF_DO_STCLASS_OR) {
2950 cl_or(pRExC_state, data->start_class, &this_class);
2952 else if (flags & SCF_DO_STCLASS_AND) {
2953 /* Switch to OR mode: cache the old value of
2954 * data->start_class */
2956 StructCopy(data->start_class, and_withp,
2957 struct regnode_charclass_class);
2958 flags &= ~SCF_DO_STCLASS_AND;
2959 StructCopy(&this_class, data->start_class,
2960 struct regnode_charclass_class);
2961 flags |= SCF_DO_STCLASS_OR;
2962 data->start_class->flags |= ANYOF_EOS;
2964 } else { /* Non-zero len */
2965 if (flags & SCF_DO_STCLASS_OR) {
2966 cl_or(pRExC_state, data->start_class, &this_class);
2967 cl_and(data->start_class, and_withp);
2969 else if (flags & SCF_DO_STCLASS_AND)
2970 cl_and(data->start_class, &this_class);
2971 flags &= ~SCF_DO_STCLASS;
2973 if (!scan) /* It was not CURLYX, but CURLY. */
2975 if ( /* ? quantifier ok, except for (?{ ... }) */
2976 (next_is_eval || !(mincount == 0 && maxcount == 1))
2977 && (minnext == 0) && (deltanext == 0)
2978 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
2979 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2980 && ckWARN(WARN_REGEXP))
2983 "Quantifier unexpected on zero-length expression");
2986 min += minnext * mincount;
2987 is_inf_internal |= ((maxcount == REG_INFTY
2988 && (minnext + deltanext) > 0)
2989 || deltanext == I32_MAX);
2990 is_inf |= is_inf_internal;
2991 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2993 /* Try powerful optimization CURLYX => CURLYN. */
2994 if ( OP(oscan) == CURLYX && data
2995 && data->flags & SF_IN_PAR
2996 && !(data->flags & SF_HAS_EVAL)
2997 && !deltanext && minnext == 1 ) {
2998 /* Try to optimize to CURLYN. */
2999 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3000 regnode * const nxt1 = nxt;
3007 if (!strchr((const char*)PL_simple,OP(nxt))
3008 && !(PL_regkind[OP(nxt)] == EXACT
3009 && STR_LEN(nxt) == 1))
3015 if (OP(nxt) != CLOSE)
3017 if (RExC_open_parens) {
3018 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3019 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3021 /* Now we know that nxt2 is the only contents: */
3022 oscan->flags = (U8)ARG(nxt);
3024 OP(nxt1) = NOTHING; /* was OPEN. */
3027 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3028 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
3029 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
3030 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3031 OP(nxt + 1) = OPTIMIZED; /* was count. */
3032 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
3037 /* Try optimization CURLYX => CURLYM. */
3038 if ( OP(oscan) == CURLYX && data
3039 && !(data->flags & SF_HAS_PAR)
3040 && !(data->flags & SF_HAS_EVAL)
3041 && !deltanext /* atom is fixed width */
3042 && minnext != 0 /* CURLYM can't handle zero width */
3044 /* XXXX How to optimize if data == 0? */
3045 /* Optimize to a simpler form. */
3046 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3050 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3051 && (OP(nxt2) != WHILEM))
3053 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3054 /* Need to optimize away parenths. */
3055 if (data->flags & SF_IN_PAR) {
3056 /* Set the parenth number. */
3057 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3059 if (OP(nxt) != CLOSE)
3060 FAIL("Panic opt close");
3061 oscan->flags = (U8)ARG(nxt);
3062 if (RExC_open_parens) {
3063 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3064 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3066 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3067 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3070 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3071 OP(nxt + 1) = OPTIMIZED; /* was count. */
3072 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3073 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
3076 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3077 regnode *nnxt = regnext(nxt1);
3080 if (reg_off_by_arg[OP(nxt1)])
3081 ARG_SET(nxt1, nxt2 - nxt1);
3082 else if (nxt2 - nxt1 < U16_MAX)
3083 NEXT_OFF(nxt1) = nxt2 - nxt1;
3085 OP(nxt) = NOTHING; /* Cannot beautify */
3090 /* Optimize again: */
3091 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3092 NULL, stopparen, recursed, NULL, 0,depth+1);
3097 else if ((OP(oscan) == CURLYX)
3098 && (flags & SCF_WHILEM_VISITED_POS)
3099 /* See the comment on a similar expression above.
3100 However, this time it not a subexpression
3101 we care about, but the expression itself. */
3102 && (maxcount == REG_INFTY)
3103 && data && ++data->whilem_c < 16) {
3104 /* This stays as CURLYX, we can put the count/of pair. */
3105 /* Find WHILEM (as in regexec.c) */
3106 regnode *nxt = oscan + NEXT_OFF(oscan);
3108 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3110 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3111 | (RExC_whilem_seen << 4)); /* On WHILEM */
3113 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3115 if (flags & SCF_DO_SUBSTR) {
3116 SV *last_str = NULL;
3117 int counted = mincount != 0;
3119 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3120 #if defined(SPARC64_GCC_WORKAROUND)
3123 const char *s = NULL;
3126 if (pos_before >= data->last_start_min)
3129 b = data->last_start_min;
3132 s = SvPV_const(data->last_found, l);
3133 old = b - data->last_start_min;
3136 I32 b = pos_before >= data->last_start_min
3137 ? pos_before : data->last_start_min;
3139 const char * const s = SvPV_const(data->last_found, l);
3140 I32 old = b - data->last_start_min;
3144 old = utf8_hop((U8*)s, old) - (U8*)s;
3147 /* Get the added string: */
3148 last_str = newSVpvn(s + old, l);
3150 SvUTF8_on(last_str);
3151 if (deltanext == 0 && pos_before == b) {
3152 /* What was added is a constant string */
3154 SvGROW(last_str, (mincount * l) + 1);
3155 repeatcpy(SvPVX(last_str) + l,
3156 SvPVX_const(last_str), l, mincount - 1);
3157 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3158 /* Add additional parts. */
3159 SvCUR_set(data->last_found,
3160 SvCUR(data->last_found) - l);
3161 sv_catsv(data->last_found, last_str);
3163 SV * sv = data->last_found;
3165 SvUTF8(sv) && SvMAGICAL(sv) ?
3166 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3167 if (mg && mg->mg_len >= 0)
3168 mg->mg_len += CHR_SVLEN(last_str);
3170 data->last_end += l * (mincount - 1);
3173 /* start offset must point into the last copy */
3174 data->last_start_min += minnext * (mincount - 1);
3175 data->last_start_max += is_inf ? I32_MAX
3176 : (maxcount - 1) * (minnext + data->pos_delta);
3179 /* It is counted once already... */
3180 data->pos_min += minnext * (mincount - counted);
3181 data->pos_delta += - counted * deltanext +
3182 (minnext + deltanext) * maxcount - minnext * mincount;
3183 if (mincount != maxcount) {
3184 /* Cannot extend fixed substrings found inside
3186 scan_commit(pRExC_state,data,minlenp);
3187 if (mincount && last_str) {
3188 SV * const sv = data->last_found;
3189 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3190 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3194 sv_setsv(sv, last_str);
3195 data->last_end = data->pos_min;
3196 data->last_start_min =
3197 data->pos_min - CHR_SVLEN(last_str);
3198 data->last_start_max = is_inf
3200 : data->pos_min + data->pos_delta
3201 - CHR_SVLEN(last_str);
3203 data->longest = &(data->longest_float);
3205 SvREFCNT_dec(last_str);
3207 if (data && (fl & SF_HAS_EVAL))
3208 data->flags |= SF_HAS_EVAL;
3209 optimize_curly_tail:
3210 if (OP(oscan) != CURLYX) {
3211 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3213 NEXT_OFF(oscan) += NEXT_OFF(next);
3216 default: /* REF and CLUMP only? */
3217 if (flags & SCF_DO_SUBSTR) {
3218 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
3219 data->longest = &(data->longest_float);
3221 is_inf = is_inf_internal = 1;
3222 if (flags & SCF_DO_STCLASS_OR)
3223 cl_anything(pRExC_state, data->start_class);
3224 flags &= ~SCF_DO_STCLASS;
3228 else if (strchr((const char*)PL_simple,OP(scan))) {
3231 if (flags & SCF_DO_SUBSTR) {
3232 scan_commit(pRExC_state,data,minlenp);
3236 if (flags & SCF_DO_STCLASS) {
3237 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3239 /* Some of the logic below assumes that switching
3240 locale on will only add false positives. */
3241 switch (PL_regkind[OP(scan)]) {
3245 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3246 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3247 cl_anything(pRExC_state, data->start_class);
3250 if (OP(scan) == SANY)
3252 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3253 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3254 || (data->start_class->flags & ANYOF_CLASS));
3255 cl_anything(pRExC_state, data->start_class);
3257 if (flags & SCF_DO_STCLASS_AND || !value)
3258 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3261 if (flags & SCF_DO_STCLASS_AND)
3262 cl_and(data->start_class,
3263 (struct regnode_charclass_class*)scan);
3265 cl_or(pRExC_state, data->start_class,
3266 (struct regnode_charclass_class*)scan);
3269 if (flags & SCF_DO_STCLASS_AND) {
3270 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3271 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3272 for (value = 0; value < 256; value++)
3273 if (!isALNUM(value))
3274 ANYOF_BITMAP_CLEAR(data->start_class, value);
3278 if (data->start_class->flags & ANYOF_LOCALE)
3279 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3281 for (value = 0; value < 256; value++)
3283 ANYOF_BITMAP_SET(data->start_class, value);
3288 if (flags & SCF_DO_STCLASS_AND) {
3289 if (data->start_class->flags & ANYOF_LOCALE)
3290 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3293 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3294 data->start_class->flags |= ANYOF_LOCALE;
3298 if (flags & SCF_DO_STCLASS_AND) {
3299 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3300 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3301 for (value = 0; value < 256; value++)
3303 ANYOF_BITMAP_CLEAR(data->start_class, value);
3307 if (data->start_class->flags & ANYOF_LOCALE)
3308 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3310 for (value = 0; value < 256; value++)
3311 if (!isALNUM(value))
3312 ANYOF_BITMAP_SET(data->start_class, value);
3317 if (flags & SCF_DO_STCLASS_AND) {
3318 if (data->start_class->flags & ANYOF_LOCALE)
3319 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3322 data->start_class->flags |= ANYOF_LOCALE;
3323 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3327 if (flags & SCF_DO_STCLASS_AND) {
3328 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3329 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3330 for (value = 0; value < 256; value++)
3331 if (!isSPACE(value))
3332 ANYOF_BITMAP_CLEAR(data->start_class, value);
3336 if (data->start_class->flags & ANYOF_LOCALE)
3337 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3339 for (value = 0; value < 256; value++)
3341 ANYOF_BITMAP_SET(data->start_class, value);
3346 if (flags & SCF_DO_STCLASS_AND) {
3347 if (data->start_class->flags & ANYOF_LOCALE)
3348 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3351 data->start_class->flags |= ANYOF_LOCALE;
3352 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3356 if (flags & SCF_DO_STCLASS_AND) {
3357 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3358 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3359 for (value = 0; value < 256; value++)
3361 ANYOF_BITMAP_CLEAR(data->start_class, value);
3365 if (data->start_class->flags & ANYOF_LOCALE)
3366 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3368 for (value = 0; value < 256; value++)
3369 if (!isSPACE(value))
3370 ANYOF_BITMAP_SET(data->start_class, value);
3375 if (flags & SCF_DO_STCLASS_AND) {
3376 if (data->start_class->flags & ANYOF_LOCALE) {
3377 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3378 for (value = 0; value < 256; value++)
3379 if (!isSPACE(value))
3380 ANYOF_BITMAP_CLEAR(data->start_class, value);
3384 data->start_class->flags |= ANYOF_LOCALE;
3385 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3389 if (flags & SCF_DO_STCLASS_AND) {
3390 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3391 for (value = 0; value < 256; value++)
3392 if (!isDIGIT(value))
3393 ANYOF_BITMAP_CLEAR(data->start_class, value);
3396 if (data->start_class->flags & ANYOF_LOCALE)
3397 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3399 for (value = 0; value < 256; value++)
3401 ANYOF_BITMAP_SET(data->start_class, value);
3406 if (flags & SCF_DO_STCLASS_AND) {
3407 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3408 for (value = 0; value < 256; value++)
3410 ANYOF_BITMAP_CLEAR(data->start_class, value);
3413 if (data->start_class->flags & ANYOF_LOCALE)
3414 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3416 for (value = 0; value < 256; value++)
3417 if (!isDIGIT(value))
3418 ANYOF_BITMAP_SET(data->start_class, value);
3423 if (flags & SCF_DO_STCLASS_OR)
3424 cl_and(data->start_class, and_withp);
3425 flags &= ~SCF_DO_STCLASS;
3428 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3429 data->flags |= (OP(scan) == MEOL
3433 else if ( PL_regkind[OP(scan)] == BRANCHJ
3434 /* Lookbehind, or need to calculate parens/evals/stclass: */
3435 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3436 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3437 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3438 || OP(scan) == UNLESSM )
3440 /* Negative Lookahead/lookbehind
3441 In this case we can't do fixed string optimisation.
3444 I32 deltanext, minnext, fake = 0;
3446 struct regnode_charclass_class intrnl;
3449 data_fake.flags = 0;
3451 data_fake.whilem_c = data->whilem_c;
3452 data_fake.last_closep = data->last_closep;
3455 data_fake.last_closep = &fake;
3456 if ( flags & SCF_DO_STCLASS && !scan->flags
3457 && OP(scan) == IFMATCH ) { /* Lookahead */
3458 cl_init(pRExC_state, &intrnl);
3459 data_fake.start_class = &intrnl;
3460 f |= SCF_DO_STCLASS_AND;
3462 if (flags & SCF_WHILEM_VISITED_POS)
3463 f |= SCF_WHILEM_VISITED_POS;
3464 next = regnext(scan);
3465 nscan = NEXTOPER(NEXTOPER(scan));
3466 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3467 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3470 vFAIL("Variable length lookbehind not implemented");
3472 else if (minnext > (I32)U8_MAX) {
3473 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3475 scan->flags = (U8)minnext;
3478 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3480 if (data_fake.flags & SF_HAS_EVAL)
3481 data->flags |= SF_HAS_EVAL;
3482 data->whilem_c = data_fake.whilem_c;
3484 if (f & SCF_DO_STCLASS_AND) {
3485 const int was = (data->start_class->flags & ANYOF_EOS);
3487 cl_and(data->start_class, &intrnl);
3489 data->start_class->flags |= ANYOF_EOS;
3492 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3494 /* Positive Lookahead/lookbehind
3495 In this case we can do fixed string optimisation,
3496 but we must be careful about it. Note in the case of
3497 lookbehind the positions will be offset by the minimum
3498 length of the pattern, something we won't know about
3499 until after the recurse.
3501 I32 deltanext, fake = 0;
3503 struct regnode_charclass_class intrnl;
3505 /* We use SAVEFREEPV so that when the full compile
3506 is finished perl will clean up the allocated
3507 minlens when its all done. This was we don't
3508 have to worry about freeing them when we know
3509 they wont be used, which would be a pain.
3512 Newx( minnextp, 1, I32 );
3513 SAVEFREEPV(minnextp);
3516 StructCopy(data, &data_fake, scan_data_t);
3517 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3520 scan_commit(pRExC_state, &data_fake,minlenp);
3521 data_fake.last_found=newSVsv(data->last_found);
3525 data_fake.last_closep = &fake;
3526 data_fake.flags = 0;
3528 data_fake.flags |= SF_IS_INF;
3529 if ( flags & SCF_DO_STCLASS && !scan->flags
3530 && OP(scan) == IFMATCH ) { /* Lookahead */
3531 cl_init(pRExC_state, &intrnl);
3532 data_fake.start_class = &intrnl;
3533 f |= SCF_DO_STCLASS_AND;
3535 if (flags & SCF_WHILEM_VISITED_POS)
3536 f |= SCF_WHILEM_VISITED_POS;
3537 next = regnext(scan);
3538 nscan = NEXTOPER(NEXTOPER(scan));
3540 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3541 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3544 vFAIL("Variable length lookbehind not implemented");
3546 else if (*minnextp > (I32)U8_MAX) {
3547 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3549 scan->flags = (U8)*minnextp;
3554 if (f & SCF_DO_STCLASS_AND) {
3555 const int was = (data->start_class->flags & ANYOF_EOS);
3557 cl_and(data->start_class, &intrnl);
3559 data->start_class->flags |= ANYOF_EOS;
3562 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3564 if (data_fake.flags & SF_HAS_EVAL)
3565 data->flags |= SF_HAS_EVAL;
3566 data->whilem_c = data_fake.whilem_c;
3567 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3568 if (RExC_rx->minlen<*minnextp)
3569 RExC_rx->minlen=*minnextp;
3570 scan_commit(pRExC_state, &data_fake, minnextp);
3571 SvREFCNT_dec(data_fake.last_found);
3573 if ( data_fake.minlen_fixed != minlenp )
3575 data->offset_fixed= data_fake.offset_fixed;
3576 data->minlen_fixed= data_fake.minlen_fixed;
3577 data->lookbehind_fixed+= scan->flags;
3579 if ( data_fake.minlen_float != minlenp )
3581 data->minlen_float= data_fake.minlen_float;
3582 data->offset_float_min=data_fake.offset_float_min;
3583 data->offset_float_max=data_fake.offset_float_max;
3584 data->lookbehind_float+= scan->flags;
3593 else if (OP(scan) == OPEN) {
3594 if (stopparen != (I32)ARG(scan))
3597 else if (OP(scan) == CLOSE) {
3598 if (stopparen == (I32)ARG(scan)) {
3601 if ((I32)ARG(scan) == is_par) {
3602 next = regnext(scan);
3604 if ( next && (OP(next) != WHILEM) && next < last)
3605 is_par = 0; /* Disable optimization */
3608 *(data->last_closep) = ARG(scan);
3610 else if (OP(scan) == EVAL) {
3612 data->flags |= SF_HAS_EVAL;
3614 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3615 if (flags & SCF_DO_SUBSTR) {
3616 scan_commit(pRExC_state,data,minlenp);
3617 flags &= ~SCF_DO_SUBSTR;
3619 if (data && OP(scan)==ACCEPT) {
3620 data->flags |= SCF_SEEN_ACCEPT;
3625 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3627 if (flags & SCF_DO_SUBSTR) {
3628 scan_commit(pRExC_state,data,minlenp);
3629 data->longest = &(data->longest_float);
3631 is_inf = is_inf_internal = 1;
3632 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3633 cl_anything(pRExC_state, data->start_class);
3634 flags &= ~SCF_DO_STCLASS;
3636 #ifdef TRIE_STUDY_OPT
3637 #ifdef FULL_TRIE_STUDY
3638 else if (PL_regkind[OP(scan)] == TRIE) {
3639 /* NOTE - There is similar code to this block above for handling
3640 BRANCH nodes on the initial study. If you change stuff here
3642 regnode *trie_node= scan;
3643 regnode *tail= regnext(scan);
3644 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3645 I32 max1 = 0, min1 = I32_MAX;
3646 struct regnode_charclass_class accum;
3648 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
3649 scan_commit(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
3650 if (flags & SCF_DO_STCLASS)
3651 cl_init_zero(pRExC_state, &accum);
3657 const regnode *nextbranch= NULL;
3660 for ( word=1 ; word <= trie->wordcount ; word++)
3662 I32 deltanext=0, minnext=0, f = 0, fake;
3663 struct regnode_charclass_class this_class;
3665 data_fake.flags = 0;
3667 data_fake.whilem_c = data->whilem_c;
3668 data_fake.last_closep = data->last_closep;
3671 data_fake.last_closep = &fake;
3673 if (flags & SCF_DO_STCLASS) {
3674 cl_init(pRExC_state, &this_class);
3675 data_fake.start_class = &this_class;
3676 f = SCF_DO_STCLASS_AND;
3678 if (flags & SCF_WHILEM_VISITED_POS)
3679 f |= SCF_WHILEM_VISITED_POS;
3681 if (trie->jump[word]) {
3683 nextbranch = trie_node + trie->jump[0];
3684 scan= trie_node + trie->jump[word];
3685 /* We go from the jump point to the branch that follows
3686 it. Note this means we need the vestigal unused branches
3687 even though they arent otherwise used.
3689 minnext = study_chunk(pRExC_state, &scan, minlenp,
3690 &deltanext, (regnode *)nextbranch, &data_fake,
3691 stopparen, recursed, NULL, f,depth+1);
3693 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3694 nextbranch= regnext((regnode*)nextbranch);
3696 if (min1 > (I32)(minnext + trie->minlen))
3697 min1 = minnext + trie->minlen;
3698 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3699 max1 = minnext + deltanext + trie->maxlen;
3700 if (deltanext == I32_MAX)
3701 is_inf = is_inf_internal = 1;
3703 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3705 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3706 if ( stopmin > min + min1)
3707 stopmin = min + min1;
3708 flags &= ~SCF_DO_SUBSTR;
3710 data->flags |= SCF_SEEN_ACCEPT;
3713 if (data_fake.flags & SF_HAS_EVAL)
3714 data->flags |= SF_HAS_EVAL;
3715 data->whilem_c = data_fake.whilem_c;
3717 if (flags & SCF_DO_STCLASS)
3718 cl_or(pRExC_state, &accum, &this_class);
3721 if (flags & SCF_DO_SUBSTR) {
3722 data->pos_min += min1;
3723 data->pos_delta += max1 - min1;
3724 if (max1 != min1 || is_inf)
3725 data->longest = &(data->longest_float);
3728 delta += max1 - min1;
3729 if (flags & SCF_DO_STCLASS_OR) {
3730 cl_or(pRExC_state, data->start_class, &accum);
3732 cl_and(data->start_class, and_withp);
3733 flags &= ~SCF_DO_STCLASS;
3736 else if (flags & SCF_DO_STCLASS_AND) {
3738 cl_and(data->start_class, &accum);
3739 flags &= ~SCF_DO_STCLASS;
3742 /* Switch to OR mode: cache the old value of
3743 * data->start_class */
3745 StructCopy(data->start_class, and_withp,
3746 struct regnode_charclass_class);
3747 flags &= ~SCF_DO_STCLASS_AND;
3748 StructCopy(&accum, data->start_class,
3749 struct regnode_charclass_class);
3750 flags |= SCF_DO_STCLASS_OR;
3751 data->start_class->flags |= ANYOF_EOS;
3758 else if (PL_regkind[OP(scan)] == TRIE) {
3759 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3762 min += trie->minlen;
3763 delta += (trie->maxlen - trie->minlen);
3764 flags &= ~SCF_DO_STCLASS; /* xxx */
3765 if (flags & SCF_DO_SUBSTR) {
3766 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
3767 data->pos_min += trie->minlen;
3768 data->pos_delta += (trie->maxlen - trie->minlen);
3769 if (trie->maxlen != trie->minlen)
3770 data->longest = &(data->longest_float);
3772 if (trie->jump) /* no more substrings -- for now /grr*/
3773 flags &= ~SCF_DO_SUBSTR;
3775 #endif /* old or new */
3776 #endif /* TRIE_STUDY_OPT */
3777 /* Else: zero-length, ignore. */
3778 scan = regnext(scan);
3780 DEBUG_PEEP("FEND",scan,depth);
3782 stopparen = frame->stop;
3783 frame = frame->prev;
3789 *deltap = is_inf_internal ? I32_MAX : delta;
3790 if (flags & SCF_DO_SUBSTR && is_inf)
3791 data->pos_delta = I32_MAX - data->pos_min;
3792 if (is_par > (I32)U8_MAX)
3794 if (is_par && pars==1 && data) {
3795 data->flags |= SF_IN_PAR;
3796 data->flags &= ~SF_HAS_PAR;
3798 else if (pars && data) {
3799 data->flags |= SF_HAS_PAR;
3800 data->flags &= ~SF_IN_PAR;
3802 if (flags & SCF_DO_STCLASS_OR)
3803 cl_and(data->start_class, and_withp);
3804 if (flags & SCF_TRIE_RESTUDY)
3805 data->flags |= SCF_TRIE_RESTUDY;
3807 DEBUG_STUDYDATA(data,depth);
3809 return min < stopmin ? min : stopmin;
3813 S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
3815 if (RExC_rx->data) {
3816 const U32 count = RExC_rx->data->count;
3817 Renewc(RExC_rx->data,
3818 sizeof(*RExC_rx->data) + sizeof(void*) * (count + n - 1),
3819 char, struct reg_data);
3820 Renew(RExC_rx->data->what, count + n, U8);
3821 RExC_rx->data->count += n;
3824 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
3825 char, struct reg_data);
3826 Newx(RExC_rx->data->what, n, U8);
3827 RExC_rx->data->count = n;
3829 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
3830 return RExC_rx->data->count - n;
3833 #ifndef PERL_IN_XSUB_RE
3835 Perl_reginitcolors(pTHX)
3838 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
3840 char *t = savepv(s);
3844 t = strchr(t, '\t');
3850 PL_colors[i] = t = (char *)"";
3855 PL_colors[i++] = (char *)"";
3862 #ifdef TRIE_STUDY_OPT
3863 #define CHECK_RESTUDY_GOTO \
3865 (data.flags & SCF_TRIE_RESTUDY) \
3869 #define CHECK_RESTUDY_GOTO
3873 - pregcomp - compile a regular expression into internal code
3875 * We can't allocate space until we know how big the compiled form will be,
3876 * but we can't compile it (and thus know how big it is) until we've got a
3877 * place to put the code. So we cheat: we compile it twice, once with code
3878 * generation turned off and size counting turned on, and once "for real".
3879 * This also means that we don't allocate space until we are sure that the
3880 * thing really will compile successfully, and we never have to move the
3881 * code and thus invalidate pointers into it. (Note that it has to be in
3882 * one piece because free() must be able to free it all.) [NB: not true in perl]
3884 * Beware that the optimization-preparation code in here knows about some
3885 * of the structure of the compiled regexp. [I'll say.]
3890 #ifndef PERL_IN_XSUB_RE
3891 #define RE_ENGINE_PTR &PL_core_reg_engine
3893 extern const struct regexp_engine my_reg_engine;
3894 #define RE_ENGINE_PTR &my_reg_engine
3896 /* these make a few things look better, to avoid indentation */
3897 #define BEGIN_BLOCK {
3901 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
3904 GET_RE_DEBUG_FLAGS_DECL;
3905 DEBUG_r(if (!PL_colorset) reginitcolors());
3906 #ifndef PERL_IN_XSUB_RE
3908 /* Dispatch a request to compile a regexp to correct
3910 HV * const table = GvHV(PL_hintgv);
3912 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
3913 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
3914 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
3916 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
3919 return CALLREGCOMP_ENG(eng, exp, xend, pm);
3933 RExC_state_t RExC_state;
3934 RExC_state_t * const pRExC_state = &RExC_state;
3935 #ifdef TRIE_STUDY_OPT
3937 RExC_state_t copyRExC_state;
3940 FAIL("NULL regexp argument");
3942 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
3946 SV *dsv= sv_newmortal();
3947 RE_PV_QUOTED_DECL(s, RExC_utf8,
3948 dsv, RExC_precomp, (xend - exp), 60);
3949 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
3950 PL_colors[4],PL_colors[5],s);
3952 RExC_flags = pm->op_pmflags;
3956 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
3957 RExC_seen_evals = 0;
3960 /* First pass: determine size, legality. */
3969 RExC_emit = &PL_regdummy;
3970 RExC_whilem_seen = 0;
3971 RExC_charnames = NULL;
3972 RExC_open_parens = NULL;
3973 RExC_close_parens = NULL;
3975 RExC_paren_names = NULL;
3976 RExC_recurse = NULL;
3977 RExC_recurse_count = 0;
3979 #if 0 /* REGC() is (currently) a NOP at the first pass.
3980 * Clever compilers notice this and complain. --jhi */
3981 REGC((U8)REG_MAGIC, (char*)RExC_emit);
3983 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
3984 if (reg(pRExC_state, 0, &flags,1) == NULL) {
3985 RExC_precomp = NULL;
3989 PerlIO_printf(Perl_debug_log,
3990 "Required size %"IVdf" nodes\n"
3991 "Starting second pass (creation)\n",
3994 RExC_lastparse=NULL;
3996 /* Small enough for pointer-storage convention?
3997 If extralen==0, this means that we will not need long jumps. */
3998 if (RExC_size >= 0x10000L && RExC_extralen)
3999 RExC_size += RExC_extralen;
4002 if (RExC_whilem_seen > 15)
4003 RExC_whilem_seen = 15;
4006 /* Make room for a sentinel value at the end of the program */
4010 /* Allocate space and zero-initialize. Note, the two step process
4011 of zeroing when in debug mode, thus anything assigned has to
4012 happen after that */
4013 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
4016 FAIL("Regexp out of space");
4018 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
4019 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
4021 /* initialization begins here */
4022 r->engine= RE_ENGINE_PTR;
4024 r->prelen = xend - exp;
4025 r->precomp = savepvn(RExC_precomp, r->prelen);
4027 #ifdef PERL_OLD_COPY_ON_WRITE
4028 r->saved_copy = NULL;
4030 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
4031 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4032 r->lastparen = 0; /* mg.c reads this. */
4034 r->substrs = 0; /* Useful during FAIL. */
4035 r->startp = 0; /* Useful during FAIL. */
4040 if (RExC_seen & REG_SEEN_RECURSE) {
4041 Newxz(RExC_open_parens, RExC_npar,regnode *);
4042 SAVEFREEPV(RExC_open_parens);
4043 Newxz(RExC_close_parens,RExC_npar,regnode *);
4044 SAVEFREEPV(RExC_close_parens);
4047 /* Useful during FAIL. */
4048 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4050 r->offsets[0] = RExC_size;
4052 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
4053 "%s %"UVuf" bytes for offset annotations.\n",
4054 r->offsets ? "Got" : "Couldn't get",
4055 (UV)((2*RExC_size+1) * sizeof(U32))));
4059 /* Second pass: emit code. */
4060 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
4066 RExC_emit_start = r->program;
4067 RExC_emit = r->program;
4069 /* put a sentinal on the end of the program so we can check for
4071 r->program[RExC_size].type = 255;
4073 /* Store the count of eval-groups for security checks: */
4074 RExC_emit->next_off = (RExC_seen_evals > (I32)U16_MAX) ? U16_MAX : (U16)RExC_seen_evals;
4075 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
4077 if (reg(pRExC_state, 0, &flags,1) == NULL)
4080 /* XXXX To minimize changes to RE engine we always allocate
4081 3-units-long substrs field. */
4082 Newx(r->substrs, 1, struct reg_substr_data);
4083 if (RExC_recurse_count) {
4084 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4085 SAVEFREEPV(RExC_recurse);
4089 r->minlen = minlen = sawplus = sawopen = 0;
4090 Zero(r->substrs, 1, struct reg_substr_data);
4092 #ifdef TRIE_STUDY_OPT
4095 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
4097 RExC_state = copyRExC_state;
4098 if (seen & REG_TOP_LEVEL_BRANCHES)
4099 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4101 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
4102 if (data.last_found) {
4103 SvREFCNT_dec(data.longest_fixed);
4104 SvREFCNT_dec(data.longest_float);
4105 SvREFCNT_dec(data.last_found);
4107 StructCopy(&zero_scan_data, &data, scan_data_t);
4109 StructCopy(&zero_scan_data, &data, scan_data_t);
4110 copyRExC_state = RExC_state;
4113 StructCopy(&zero_scan_data, &data, scan_data_t);
4116 /* Dig out information for optimizations. */
4117 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
4118 pm->op_pmflags = RExC_flags;
4120 r->reganch |= ROPT_UTF8; /* Unicode in it? */
4121 r->regstclass = NULL;
4122 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
4123 r->reganch |= ROPT_NAUGHTY;
4124 scan = r->program + 1; /* First BRANCH. */
4126 /* testing for BRANCH here tells us whether there is "must appear"
4127 data in the pattern. If there is then we can use it for optimisations */
4128 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
4130 STRLEN longest_float_length, longest_fixed_length;
4131 struct regnode_charclass_class ch_class; /* pointed to by data */
4133 I32 last_close = 0; /* pointed to by data */
4136 /* Skip introductions and multiplicators >= 1. */
4137 while ((OP(first) == OPEN && (sawopen = 1)) ||
4138 /* An OR of *one* alternative - should not happen now. */
4139 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
4140 /* for now we can't handle lookbehind IFMATCH*/
4141 (OP(first) == IFMATCH && !first->flags) ||
4142 (OP(first) == PLUS) ||
4143 (OP(first) == MINMOD) ||
4144 /* An {n,m} with n>0 */
4145 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) )
4148 if (OP(first) == PLUS)
4151 first += regarglen[OP(first)];
4152 if (OP(first) == IFMATCH) {
4153 first = NEXTOPER(first);
4154 first += EXTRA_STEP_2ARGS;
4155 } else /* XXX possible optimisation for /(?=)/ */
4156 first = NEXTOPER(first);
4159 /* Starting-point info. */
4161 DEBUG_PEEP("first:",first,0);
4162 /* Ignore EXACT as we deal with it later. */
4163 if (PL_regkind[OP(first)] == EXACT) {
4164 if (OP(first) == EXACT)
4165 NOOP; /* Empty, get anchored substr later. */
4166 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
4167 r->regstclass = first;
4170 else if (PL_regkind[OP(first)] == TRIE &&
4171 ((reg_trie_data *)r->data->data[ ARG(first) ])->minlen>0)
4174 /* this can happen only on restudy */
4175 if ( OP(first) == TRIE ) {
4176 struct regnode_1 *trieop;
4177 Newxz(trieop,1,struct regnode_1);
4178 StructCopy(first,trieop,struct regnode_1);
4179 trie_op=(regnode *)trieop;
4181 struct regnode_charclass *trieop;
4182 Newxz(trieop,1,struct regnode_charclass);
4183 StructCopy(first,trieop,struct regnode_charclass);
4184 trie_op=(regnode *)trieop;
4187 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4188 r->regstclass = trie_op;
4191 else if (strchr((const char*)PL_simple,OP(first)))
4192 r->regstclass = first;
4193 else if (PL_regkind[OP(first)] == BOUND ||
4194 PL_regkind[OP(first)] == NBOUND)
4195 r->regstclass = first;
4196 else if (PL_regkind[OP(first)] == BOL) {
4197 r->reganch |= (OP(first) == MBOL
4199 : (OP(first) == SBOL
4202 first = NEXTOPER(first);
4205 else if (OP(first) == GPOS) {
4206 r->reganch |= ROPT_ANCH_GPOS;
4207 first = NEXTOPER(first);
4210 else if (!sawopen && (OP(first) == STAR &&
4211 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
4212 !(r->reganch & ROPT_ANCH) )
4214 /* turn .* into ^.* with an implied $*=1 */
4216 (OP(NEXTOPER(first)) == REG_ANY)
4219 r->reganch |= type | ROPT_IMPLICIT;
4220 first = NEXTOPER(first);
4223 if (sawplus && (!sawopen || !RExC_sawback)
4224 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
4225 /* x+ must match at the 1st pos of run of x's */
4226 r->reganch |= ROPT_SKIP;
4228 /* Scan is after the zeroth branch, first is atomic matcher. */
4229 #ifdef TRIE_STUDY_OPT
4232 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4233 (IV)(first - scan + 1))
4237 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4238 (IV)(first - scan + 1))
4244 * If there's something expensive in the r.e., find the
4245 * longest literal string that must appear and make it the
4246 * regmust. Resolve ties in favor of later strings, since
4247 * the regstart check works with the beginning of the r.e.
4248 * and avoiding duplication strengthens checking. Not a
4249 * strong reason, but sufficient in the absence of others.
4250 * [Now we resolve ties in favor of the earlier string if
4251 * it happens that c_offset_min has been invalidated, since the
4252 * earlier string may buy us something the later one won't.]
4255 data.longest_fixed = newSVpvs("");
4256 data.longest_float = newSVpvs("");
4257 data.last_found = newSVpvs("");
4258 data.longest = &(data.longest_fixed);
4260 if (!r->regstclass) {
4261 cl_init(pRExC_state, &ch_class);
4262 data.start_class = &ch_class;
4263 stclass_flag = SCF_DO_STCLASS_AND;
4264 } else /* XXXX Check for BOUND? */
4266 data.last_closep = &last_close;
4268 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
4269 &data, -1, NULL, NULL,
4270 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
4276 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
4277 && data.last_start_min == 0 && data.last_end > 0
4278 && !RExC_seen_zerolen
4279 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
4280 r->reganch |= ROPT_CHECK_ALL;
4281 scan_commit(pRExC_state, &data,&minlen);
4282 SvREFCNT_dec(data.last_found);
4284 /* Note that code very similar to this but for anchored string
4285 follows immediately below, changes may need to be made to both.
4288 longest_float_length = CHR_SVLEN(data.longest_float);
4289 if (longest_float_length
4290 || (data.flags & SF_FL_BEFORE_EOL
4291 && (!(data.flags & SF_FL_BEFORE_MEOL)
4292 || (RExC_flags & PMf_MULTILINE))))
4296 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
4297 && data.offset_fixed == data.offset_float_min
4298 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4299 goto remove_float; /* As in (a)+. */
4301 /* copy the information about the longest float from the reg_scan_data
4302 over to the program. */
4303 if (SvUTF8(data.longest_float)) {
4304 r->float_utf8 = data.longest_float;
4305 r->float_substr = NULL;
4307 r->float_substr = data.longest_float;
4308 r->float_utf8 = NULL;
4310 /* float_end_shift is how many chars that must be matched that
4311 follow this item. We calculate it ahead of time as once the
4312 lookbehind offset is added in we lose the ability to correctly
4314 ml = data.minlen_float ? *(data.minlen_float)
4315 : (I32)longest_float_length;
4316 r->float_end_shift = ml - data.offset_float_min
4317 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4318 + data.lookbehind_float;
4319 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
4320 r->float_max_offset = data.offset_float_max;
4321 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
4322 r->float_max_offset -= data.lookbehind_float;
4324 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4325 && (!(data.flags & SF_FL_BEFORE_MEOL)
4326 || (RExC_flags & PMf_MULTILINE)));
4327 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
4331 r->float_substr = r->float_utf8 = NULL;
4332 SvREFCNT_dec(data.longest_float);
4333 longest_float_length = 0;
4336 /* Note that code very similar to this but for floating string
4337 is immediately above, changes may need to be made to both.
4340 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
4341 if (longest_fixed_length
4342 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4343 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4344 || (RExC_flags & PMf_MULTILINE))))
4348 /* copy the information about the longest fixed
4349 from the reg_scan_data over to the program. */
4350 if (SvUTF8(data.longest_fixed)) {
4351 r->anchored_utf8 = data.longest_fixed;
4352 r->anchored_substr = NULL;
4354 r->anchored_substr = data.longest_fixed;
4355 r->anchored_utf8 = NULL;
4357 /* fixed_end_shift is how many chars that must be matched that
4358 follow this item. We calculate it ahead of time as once the
4359 lookbehind offset is added in we lose the ability to correctly
4361 ml = data.minlen_fixed ? *(data.minlen_fixed)
4362 : (I32)longest_fixed_length;
4363 r->anchored_end_shift = ml - data.offset_fixed
4364 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4365 + data.lookbehind_fixed;
4366 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4368 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4369 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4370 || (RExC_flags & PMf_MULTILINE)));
4371 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
4374 r->anchored_substr = r->anchored_utf8 = NULL;
4375 SvREFCNT_dec(data.longest_fixed);
4376 longest_fixed_length = 0;
4379 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
4380 r->regstclass = NULL;
4381 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4383 && !(data.start_class->flags & ANYOF_EOS)
4384 && !cl_is_anything(data.start_class))
4386 const I32 n = add_data(pRExC_state, 1, "f");
4388 Newx(RExC_rx->data->data[n], 1,
4389 struct regnode_charclass_class);
4390 StructCopy(data.start_class,
4391 (struct regnode_charclass_class*)RExC_rx->data->data[n],
4392 struct regnode_charclass_class);
4393 r->regstclass = (regnode*)RExC_rx->data->data[n];
4394 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
4395 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
4396 regprop(r, sv, (regnode*)data.start_class);
4397 PerlIO_printf(Perl_debug_log,
4398 "synthetic stclass \"%s\".\n",
4399 SvPVX_const(sv));});
4402 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
4403 if (longest_fixed_length > longest_float_length) {
4404 r->check_end_shift = r->anchored_end_shift;
4405 r->check_substr = r->anchored_substr;
4406 r->check_utf8 = r->anchored_utf8;
4407 r->check_offset_min = r->check_offset_max = r->anchored_offset;
4408 if (r->reganch & ROPT_ANCH_SINGLE)
4409 r->reganch |= ROPT_NOSCAN;
4412 r->check_end_shift = r->float_end_shift;
4413 r->check_substr = r->float_substr;
4414 r->check_utf8 = r->float_utf8;
4415 r->check_offset_min = r->float_min_offset;
4416 r->check_offset_max = r->float_max_offset;
4418 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4419 This should be changed ASAP! */
4420 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
4421 r->reganch |= RE_USE_INTUIT;
4422 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
4423 r->reganch |= RE_INTUIT_TAIL;
4425 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4426 if ( (STRLEN)minlen < longest_float_length )
4427 minlen= longest_float_length;
4428 if ( (STRLEN)minlen < longest_fixed_length )
4429 minlen= longest_fixed_length;
4433 /* Several toplevels. Best we can is to set minlen. */
4435 struct regnode_charclass_class ch_class;
4438 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
4440 scan = r->program + 1;
4441 cl_init(pRExC_state, &ch_class);
4442 data.start_class = &ch_class;
4443 data.last_closep = &last_close;
4446 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
4447 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
4451 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
4452 = r->float_substr = r->float_utf8 = NULL;
4453 if (!(data.start_class->flags & ANYOF_EOS)
4454 && !cl_is_anything(data.start_class))
4456 const I32 n = add_data(pRExC_state, 1, "f");
4458 Newx(RExC_rx->data->data[n], 1,
4459 struct regnode_charclass_class);
4460 StructCopy(data.start_class,
4461 (struct regnode_charclass_class*)RExC_rx->data->data[n],
4462 struct regnode_charclass_class);
4463 r->regstclass = (regnode*)RExC_rx->data->data[n];
4464 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
4465 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
4466 regprop(r, sv, (regnode*)data.start_class);
4467 PerlIO_printf(Perl_debug_log,
4468 "synthetic stclass \"%s\".\n",
4469 SvPVX_const(sv));});
4473 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4474 the "real" pattern. */
4476 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
4479 r->minlenret = minlen;
4480 if (r->minlen < minlen)
4483 if (RExC_seen & REG_SEEN_GPOS)
4484 r->reganch |= ROPT_GPOS_SEEN;
4485 if (RExC_seen & REG_SEEN_LOOKBEHIND)
4486 r->reganch |= ROPT_LOOKBEHIND_SEEN;
4487 if (RExC_seen & REG_SEEN_EVAL)
4488 r->reganch |= ROPT_EVAL_SEEN;
4489 if (RExC_seen & REG_SEEN_CANY)
4490 r->reganch |= ROPT_CANY_SEEN;
4491 if (RExC_seen & REG_SEEN_VERBARG)
4492 r->reganch |= ROPT_VERBARG_SEEN;
4493 if (RExC_seen & REG_SEEN_CUTGROUP)
4494 r->reganch |= ROPT_CUTGROUP_SEEN;
4495 if (RExC_paren_names)
4496 r->paren_names = (HV*)SvREFCNT_inc(RExC_paren_names);
4498 r->paren_names = NULL;
4500 if (RExC_recurse_count) {
4501 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
4502 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
4503 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
4506 Newxz(r->startp, RExC_npar, I32);
4507 Newxz(r->endp, RExC_npar, I32);
4508 /* assume we don't need to swap parens around before we match */
4511 PerlIO_printf(Perl_debug_log,"Final program:\n");
4514 DEBUG_OFFSETS_r(if (r->offsets) {
4515 const U32 len = r->offsets[0];
4517 GET_RE_DEBUG_FLAGS_DECL;
4518 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
4519 for (i = 1; i <= len; i++) {
4520 if (r->offsets[i*2-1] || r->offsets[i*2])
4521 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
4522 (UV)i, (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
4524 PerlIO_printf(Perl_debug_log, "\n");
4530 #undef CORE_ONLY_BLOCK
4532 #undef RE_ENGINE_PTR
4534 #ifndef PERL_IN_XSUB_RE
4536 Perl_reg_named_buff_sv(pTHX_ SV* namesv)
4538 I32 parno = 0; /* no match */
4540 const REGEXP * const rx = PM_GETRE(PL_curpm);
4541 if (rx && rx->paren_names) {
4542 HE *he_str = hv_fetch_ent( rx->paren_names, namesv, 0, 0 );
4545 SV* sv_dat=HeVAL(he_str);
4546 I32 *nums=(I32*)SvPVX(sv_dat);
4547 for ( i=0; i<SvIVX(sv_dat); i++ ) {
4548 if ((I32)(rx->lastparen) >= nums[i] &&
4549 rx->endp[nums[i]] != -1)
4562 SV *sv= sv_newmortal();
4563 Perl_sv_setpvf(aTHX_ sv, "%"IVdf,(IV)parno);
4564 gv_paren= Perl_gv_fetchsv(aTHX_ sv, GV_ADD, SVt_PVGV);
4565 return GvSVn(gv_paren);
4570 /* Scans the name of a named buffer from the pattern.
4571 * If flags is REG_RSN_RETURN_NULL returns null.
4572 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
4573 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
4574 * to the parsed name as looked up in the RExC_paren_names hash.
4575 * If there is an error throws a vFAIL().. type exception.
4578 #define REG_RSN_RETURN_NULL 0
4579 #define REG_RSN_RETURN_NAME 1
4580 #define REG_RSN_RETURN_DATA 2
4583 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) {
4584 char *name_start = RExC_parse;
4587 while( isIDFIRST_uni(utf8n_to_uvchr((U8*)RExC_parse,
4588 RExC_end - RExC_parse, &numlen, UTF8_ALLOW_DEFAULT)))
4590 RExC_parse += numlen;
4593 while( isIDFIRST(*RExC_parse) )
4597 SV* sv_name = sv_2mortal(Perl_newSVpvn(aTHX_ name_start,
4598 (int)(RExC_parse - name_start)));
4601 if ( flags == REG_RSN_RETURN_NAME)
4603 else if (flags==REG_RSN_RETURN_DATA) {
4606 if ( ! sv_name ) /* should not happen*/
4607 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
4608 if (RExC_paren_names)
4609 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
4611 sv_dat = HeVAL(he_str);
4613 vFAIL("Reference to nonexistent named group");
4617 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
4624 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
4625 int rem=(int)(RExC_end - RExC_parse); \
4634 if (RExC_lastparse!=RExC_parse) \
4635 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
4638 iscut ? "..." : "<" \
4641 PerlIO_printf(Perl_debug_log,"%16s",""); \
4646 num=REG_NODE_NUM(RExC_emit); \
4647 if (RExC_lastnum!=num) \
4648 PerlIO_printf(Perl_debug_log,"|%4d",num); \
4650 PerlIO_printf(Perl_debug_log,"|%4s",""); \
4651 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
4652 (int)((depth*2)), "", \
4656 RExC_lastparse=RExC_parse; \
4661 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
4662 DEBUG_PARSE_MSG((funcname)); \
4663 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
4665 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
4666 DEBUG_PARSE_MSG((funcname)); \
4667 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
4670 - reg - regular expression, i.e. main body or parenthesized thing
4672 * Caller must absorb opening parenthesis.
4674 * Combining parenthesis handling with the base level of regular expression
4675 * is a trifle forced, but the need to tie the tails of the branches to what
4676 * follows makes it hard to avoid.
4678 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
4680 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
4682 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
4685 /* this idea is borrowed from STR_WITH_LEN in handy.h */
4686 #define CHECK_WORD(s,v,l) \
4687 (((sizeof(s)-1)==(l)) && (strnEQ(start_verb, (s ""), (sizeof(s)-1))))
4690 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
4691 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
4694 register regnode *ret; /* Will be the head of the group. */
4695 register regnode *br;
4696 register regnode *lastbr;
4697 register regnode *ender = NULL;
4698 register I32 parno = 0;
4700 const I32 oregflags = RExC_flags;
4701 bool have_branch = 0;
4704 /* for (?g), (?gc), and (?o) warnings; warning
4705 about (?c) will warn about (?g) -- japhy */
4707 #define WASTED_O 0x01
4708 #define WASTED_G 0x02
4709 #define WASTED_C 0x04
4710 #define WASTED_GC (0x02|0x04)
4711 I32 wastedflags = 0x00;
4713 char * parse_start = RExC_parse; /* MJD */
4714 char * const oregcomp_parse = RExC_parse;
4716 GET_RE_DEBUG_FLAGS_DECL;
4717 DEBUG_PARSE("reg ");
4720 *flagp = 0; /* Tentatively. */
4723 /* Make an OPEN node, if parenthesized. */
4725 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
4726 char *start_verb = RExC_parse;
4727 STRLEN verb_len = 0;
4728 char *start_arg = NULL;
4729 unsigned char op = 0;
4731 int internal_argval = 0; /* internal_argval is only useful if !argok */
4732 while ( *RExC_parse && *RExC_parse != ')' ) {
4733 if ( *RExC_parse == ':' ) {
4734 start_arg = RExC_parse + 1;
4740 verb_len = RExC_parse - start_verb;
4743 while ( *RExC_parse && *RExC_parse != ')' )
4745 if ( *RExC_parse != ')' )
4746 vFAIL("Unterminated verb pattern argument");
4747 if ( RExC_parse == start_arg )
4750 if ( *RExC_parse != ')' )
4751 vFAIL("Unterminated verb pattern");
4754 switch ( *start_verb ) {
4755 case 'A': /* (*ACCEPT) */
4756 if ( CHECK_WORD("ACCEPT",start_verb,verb_len) ) {
4758 internal_argval = RExC_nestroot;
4761 case 'C': /* (*COMMIT) */
4762 if ( CHECK_WORD("COMMIT",start_verb,verb_len) )
4765 case 'F': /* (*FAIL) */
4766 if ( verb_len==1 || CHECK_WORD("FAIL",start_verb,verb_len) ) {
4771 case ':': /* (*:NAME) */
4772 case 'M': /* (*MARK:NAME) */
4773 if ( verb_len==0 || CHECK_WORD("MARK",start_verb,verb_len) ) {
4778 case 'P': /* (*PRUNE) */
4779 if ( CHECK_WORD("PRUNE",start_verb,verb_len) )
4782 case 'S': /* (*SKIP) */
4783 if ( CHECK_WORD("SKIP",start_verb,verb_len) )
4786 case 'T': /* (*THEN) */
4787 /* [19:06] <TimToady> :: is then */
4788 if ( CHECK_WORD("THEN",start_verb,verb_len) ) {
4790 RExC_seen |= REG_SEEN_CUTGROUP;
4796 vFAIL3("Unknown verb pattern '%.*s'",
4797 verb_len, start_verb);
4800 if ( start_arg && internal_argval ) {
4801 vFAIL3("Verb pattern '%.*s' may not have an argument",
4802 verb_len, start_verb);
4803 } else if ( argok < 0 && !start_arg ) {
4804 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
4805 verb_len, start_verb);
4807 ret = reganode(pRExC_state, op, internal_argval);
4808 if ( ! internal_argval && ! SIZE_ONLY ) {
4810 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
4811 ARG(ret) = add_data( pRExC_state, 1, "S" );
4812 RExC_rx->data->data[ARG(ret)]=(void*)sv;
4819 if (!internal_argval)
4820 RExC_seen |= REG_SEEN_VERBARG;
4821 } else if ( start_arg ) {
4822 vFAIL3("Verb pattern '%.*s' may not have an argument",
4823 verb_len, start_verb);
4825 ret = reg_node(pRExC_state, op);
4827 nextchar(pRExC_state);
4830 if (*RExC_parse == '?') { /* (?...) */
4831 U32 posflags = 0, negflags = 0;
4832 U32 *flagsp = &posflags;
4833 bool is_logical = 0;
4834 const char * const seqstart = RExC_parse;
4837 paren = *RExC_parse++;
4838 ret = NULL; /* For look-ahead/behind. */
4841 case '<': /* (?<...) */
4842 if (*RExC_parse == '!')
4844 else if (*RExC_parse != '=')
4849 case '\'': /* (?'...') */
4850 name_start= RExC_parse;
4851 svname = reg_scan_name(pRExC_state,
4852 SIZE_ONLY ? /* reverse test from the others */
4853 REG_RSN_RETURN_NAME :
4854 REG_RSN_RETURN_NULL);
4855 if (RExC_parse == name_start)
4857 if (*RExC_parse != paren)
4858 vFAIL2("Sequence (?%c... not terminated",
4859 paren=='>' ? '<' : paren);
4863 if (!svname) /* shouldnt happen */
4865 "panic: reg_scan_name returned NULL");
4866 if (!RExC_paren_names) {
4867 RExC_paren_names= newHV();
4868 sv_2mortal((SV*)RExC_paren_names);
4870 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
4872 sv_dat = HeVAL(he_str);
4874 /* croak baby croak */
4876 "panic: paren_name hash element allocation failed");
4877 } else if ( SvPOK(sv_dat) ) {
4878 IV count=SvIV(sv_dat);
4879 I32 *pv=(I32*)SvGROW(sv_dat,SvCUR(sv_dat)+sizeof(I32)+1);
4880 SvCUR_set(sv_dat,SvCUR(sv_dat)+sizeof(I32));
4881 pv[count]=RExC_npar;
4884 (void)SvUPGRADE(sv_dat,SVt_PVNV);
4885 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
4890 /*sv_dump(sv_dat);*/
4892 nextchar(pRExC_state);
4894 goto capturing_parens;
4896 RExC_seen |= REG_SEEN_LOOKBEHIND;
4898 case '=': /* (?=...) */
4899 case '!': /* (?!...) */
4900 RExC_seen_zerolen++;
4901 if (*RExC_parse == ')') {
4902 ret=reg_node(pRExC_state, OPFAIL);
4903 nextchar(pRExC_state);
4906 case ':': /* (?:...) */
4907 case '>': /* (?>...) */
4909 case '$': /* (?$...) */
4910 case '@': /* (?@...) */
4911 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
4913 case '#': /* (?#...) */
4914 while (*RExC_parse && *RExC_parse != ')')
4916 if (*RExC_parse != ')')
4917 FAIL("Sequence (?#... not terminated");
4918 nextchar(pRExC_state);
4921 case '0' : /* (?0) */
4922 case 'R' : /* (?R) */
4923 if (*RExC_parse != ')')
4924 FAIL("Sequence (?R) not terminated");
4925 ret = reg_node(pRExC_state, GOSTART);
4926 nextchar(pRExC_state);
4929 { /* named and numeric backreferences */
4932 case '&': /* (?&NAME) */
4933 parse_start = RExC_parse - 1;
4935 SV *sv_dat = reg_scan_name(pRExC_state,
4936 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
4937 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
4939 goto gen_recurse_regop;
4942 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
4944 vFAIL("Illegal pattern");
4946 goto parse_recursion;
4948 case '-': /* (?-1) */
4949 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
4950 RExC_parse--; /* rewind to let it be handled later */
4954 case '1': case '2': case '3': case '4': /* (?1) */
4955 case '5': case '6': case '7': case '8': case '9':
4958 num = atoi(RExC_parse);
4959 parse_start = RExC_parse - 1; /* MJD */
4960 if (*RExC_parse == '-')
4962 while (isDIGIT(*RExC_parse))
4964 if (*RExC_parse!=')')
4965 vFAIL("Expecting close bracket");
4968 if ( paren == '-' ) {
4970 Diagram of capture buffer numbering.
4971 Top line is the normal capture buffer numbers
4972 Botton line is the negative indexing as from
4976 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
4980 num = RExC_npar + num;
4983 vFAIL("Reference to nonexistent group");
4985 } else if ( paren == '+' ) {
4986 num = RExC_npar + num - 1;
4989 ret = reganode(pRExC_state, GOSUB, num);
4991 if (num > (I32)RExC_rx->nparens) {
4993 vFAIL("Reference to nonexistent group");
4995 ARG2L_SET( ret, RExC_recurse_count++);
4997 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
4998 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
5002 RExC_seen |= REG_SEEN_RECURSE;
5003 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
5004 Set_Node_Offset(ret, parse_start); /* MJD */
5006 nextchar(pRExC_state);
5008 } /* named and numeric backreferences */
5011 case 'p': /* (?p...) */
5012 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
5013 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
5015 case '?': /* (??...) */
5017 if (*RExC_parse != '{')
5019 paren = *RExC_parse++;
5021 case '{': /* (?{...}) */
5023 I32 count = 1, n = 0;
5025 char *s = RExC_parse;
5027 RExC_seen_zerolen++;
5028 RExC_seen |= REG_SEEN_EVAL;
5029 while (count && (c = *RExC_parse)) {
5040 if (*RExC_parse != ')') {
5042 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
5046 OP_4tree *sop, *rop;
5047 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
5050 Perl_save_re_context(aTHX);
5051 rop = sv_compile_2op(sv, &sop, "re", &pad);
5052 sop->op_private |= OPpREFCOUNTED;
5053 /* re_dup will OpREFCNT_inc */
5054 OpREFCNT_set(sop, 1);
5057 n = add_data(pRExC_state, 3, "nop");
5058 RExC_rx->data->data[n] = (void*)rop;
5059 RExC_rx->data->data[n+1] = (void*)sop;
5060 RExC_rx->data->data[n+2] = (void*)pad;
5063 else { /* First pass */
5064 if (PL_reginterp_cnt < ++RExC_seen_evals
5066 /* No compiled RE interpolated, has runtime
5067 components ===> unsafe. */
5068 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5069 if (PL_tainting && PL_tainted)
5070 FAIL("Eval-group in insecure regular expression");
5071 #if PERL_VERSION > 8
5072 if (IN_PERL_COMPILETIME)
5077 nextchar(pRExC_state);
5079 ret = reg_node(pRExC_state, LOGICAL);
5082 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
5083 /* deal with the length of this later - MJD */
5086 ret = reganode(pRExC_state, EVAL, n);
5087 Set_Node_Length(ret, RExC_parse - parse_start + 1);
5088 Set_Node_Offset(ret, parse_start);
5091 case '(': /* (?(?{...})...) and (?(?=...)...) */
5094 if (RExC_parse[0] == '?') { /* (?(?...)) */
5095 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
5096 || RExC_parse[1] == '<'
5097 || RExC_parse[1] == '{') { /* Lookahead or eval. */
5100 ret = reg_node(pRExC_state, LOGICAL);
5103 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
5107 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
5108 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
5110 char ch = RExC_parse[0] == '<' ? '>' : '\'';
5111 char *name_start= RExC_parse++;
5113 SV *sv_dat=reg_scan_name(pRExC_state,
5114 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5115 if (RExC_parse == name_start || *RExC_parse != ch)
5116 vFAIL2("Sequence (?(%c... not terminated",
5117 (ch == '>' ? '<' : ch));
5120 num = add_data( pRExC_state, 1, "S" );
5121 RExC_rx->data->data[num]=(void*)sv_dat;
5122 SvREFCNT_inc(sv_dat);
5124 ret = reganode(pRExC_state,NGROUPP,num);
5125 goto insert_if_check_paren;
5127 else if (RExC_parse[0] == 'D' &&
5128 RExC_parse[1] == 'E' &&
5129 RExC_parse[2] == 'F' &&
5130 RExC_parse[3] == 'I' &&
5131 RExC_parse[4] == 'N' &&
5132 RExC_parse[5] == 'E')
5134 ret = reganode(pRExC_state,DEFINEP,0);
5137 goto insert_if_check_paren;
5139 else if (RExC_parse[0] == 'R') {
5142 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5143 parno = atoi(RExC_parse++);
5144 while (isDIGIT(*RExC_parse))
5146 } else if (RExC_parse[0] == '&') {
5149 sv_dat = reg_scan_name(pRExC_state,
5150 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5151 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5153 ret = reganode(pRExC_state,INSUBP,parno);
5154 goto insert_if_check_paren;
5156 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5159 parno = atoi(RExC_parse++);
5161 while (isDIGIT(*RExC_parse))
5163 ret = reganode(pRExC_state, GROUPP, parno);
5165 insert_if_check_paren:
5166 if ((c = *nextchar(pRExC_state)) != ')')
5167 vFAIL("Switch condition not recognized");
5169 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
5170 br = regbranch(pRExC_state, &flags, 1,depth+1);
5172 br = reganode(pRExC_state, LONGJMP, 0);
5174 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
5175 c = *nextchar(pRExC_state);
5180 vFAIL("(?(DEFINE)....) does not allow branches");
5181 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
5182 regbranch(pRExC_state, &flags, 1,depth+1);
5183 REGTAIL(pRExC_state, ret, lastbr);
5186 c = *nextchar(pRExC_state);
5191 vFAIL("Switch (?(condition)... contains too many branches");
5192 ender = reg_node(pRExC_state, TAIL);
5193 REGTAIL(pRExC_state, br, ender);
5195 REGTAIL(pRExC_state, lastbr, ender);
5196 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
5199 REGTAIL(pRExC_state, ret, ender);
5203 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
5207 RExC_parse--; /* for vFAIL to print correctly */
5208 vFAIL("Sequence (? incomplete");
5212 parse_flags: /* (?i) */
5213 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
5214 /* (?g), (?gc) and (?o) are useless here
5215 and must be globally applied -- japhy */
5217 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
5218 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5219 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
5220 if (! (wastedflags & wflagbit) ) {
5221 wastedflags |= wflagbit;
5224 "Useless (%s%c) - %suse /%c modifier",
5225 flagsp == &negflags ? "?-" : "?",
5227 flagsp == &negflags ? "don't " : "",
5233 else if (*RExC_parse == 'c') {
5234 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5235 if (! (wastedflags & WASTED_C) ) {
5236 wastedflags |= WASTED_GC;
5239 "Useless (%sc) - %suse /gc modifier",
5240 flagsp == &negflags ? "?-" : "?",
5241 flagsp == &negflags ? "don't " : ""
5246 else { pmflag(flagsp, *RExC_parse); }
5250 if (*RExC_parse == '-') {
5252 wastedflags = 0; /* reset so (?g-c) warns twice */
5256 RExC_flags |= posflags;
5257 RExC_flags &= ~negflags;
5258 if (*RExC_parse == ':') {
5264 if (*RExC_parse != ')') {
5266 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5268 nextchar(pRExC_state);
5278 ret = reganode(pRExC_state, OPEN, parno);
5281 RExC_nestroot = parno;
5282 if (RExC_seen & REG_SEEN_RECURSE) {
5283 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5284 "Setting open paren #%"IVdf" to %d\n",
5285 (IV)parno, REG_NODE_NUM(ret)));
5286 RExC_open_parens[parno-1]= ret;
5289 Set_Node_Length(ret, 1); /* MJD */
5290 Set_Node_Offset(ret, RExC_parse); /* MJD */
5297 /* Pick up the branches, linking them together. */
5298 parse_start = RExC_parse; /* MJD */
5299 br = regbranch(pRExC_state, &flags, 1,depth+1);
5300 /* branch_len = (paren != 0); */
5304 if (*RExC_parse == '|') {
5305 if (!SIZE_ONLY && RExC_extralen) {
5306 reginsert(pRExC_state, BRANCHJ, br, depth+1);
5309 reginsert(pRExC_state, BRANCH, br, depth+1);
5310 Set_Node_Length(br, paren != 0);
5311 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
5315 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
5317 else if (paren == ':') {
5318 *flagp |= flags&SIMPLE;
5320 if (is_open) { /* Starts with OPEN. */
5321 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
5323 else if (paren != '?') /* Not Conditional */
5325 *flagp |= flags & (SPSTART | HASWIDTH);
5327 while (*RExC_parse == '|') {
5328 if (!SIZE_ONLY && RExC_extralen) {
5329 ender = reganode(pRExC_state, LONGJMP,0);
5330 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
5333 RExC_extralen += 2; /* Account for LONGJMP. */
5334 nextchar(pRExC_state);
5335 br = regbranch(pRExC_state, &flags, 0, depth+1);
5339 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
5343 *flagp |= flags&SPSTART;
5346 if (have_branch || paren != ':') {
5347 /* Make a closing node, and hook it on the end. */
5350 ender = reg_node(pRExC_state, TAIL);
5354 ender = reganode(pRExC_state, CLOSE, parno);
5355 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
5356 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5357 "Setting close paren #%"IVdf" to %d\n",
5358 (IV)parno, REG_NODE_NUM(ender)));
5359 RExC_close_parens[parno-1]= ender;
5360 if (RExC_nestroot == parno)
5363 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
5364 Set_Node_Length(ender,1); /* MJD */
5370 *flagp &= ~HASWIDTH;
5373 ender = reg_node(pRExC_state, SUCCEED);
5376 ender = reg_node(pRExC_state, END);
5378 assert(!RExC_opend); /* there can only be one! */
5383 REGTAIL(pRExC_state, lastbr, ender);
5385 if (have_branch && !SIZE_ONLY) {
5387 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
5389 /* Hook the tails of the branches to the closing node. */
5390 for (br = ret; br; br = regnext(br)) {
5391 const U8 op = PL_regkind[OP(br)];
5393 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
5395 else if (op == BRANCHJ) {
5396 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
5404 static const char parens[] = "=!<,>";
5406 if (paren && (p = strchr(parens, paren))) {
5407 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
5408 int flag = (p - parens) > 1;
5411 node = SUSPEND, flag = 0;
5412 reginsert(pRExC_state, node,ret, depth+1);
5413 Set_Node_Cur_Length(ret);
5414 Set_Node_Offset(ret, parse_start + 1);
5416 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
5420 /* Check for proper termination. */
5422 RExC_flags = oregflags;
5423 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
5424 RExC_parse = oregcomp_parse;
5425 vFAIL("Unmatched (");
5428 else if (!paren && RExC_parse < RExC_end) {
5429 if (*RExC_parse == ')') {
5431 vFAIL("Unmatched )");
5434 FAIL("Junk on end of regexp"); /* "Can't happen". */
5442 - regbranch - one alternative of an | operator
5444 * Implements the concatenation operator.
5447 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
5450 register regnode *ret;
5451 register regnode *chain = NULL;
5452 register regnode *latest;
5453 I32 flags = 0, c = 0;
5454 GET_RE_DEBUG_FLAGS_DECL;
5455 DEBUG_PARSE("brnc");
5459 if (!SIZE_ONLY && RExC_extralen)
5460 ret = reganode(pRExC_state, BRANCHJ,0);
5462 ret = reg_node(pRExC_state, BRANCH);
5463 Set_Node_Length(ret, 1);
5467 if (!first && SIZE_ONLY)
5468 RExC_extralen += 1; /* BRANCHJ */
5470 *flagp = WORST; /* Tentatively. */
5473 nextchar(pRExC_state);
5474 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
5476 latest = regpiece(pRExC_state, &flags,depth+1);
5477 if (latest == NULL) {
5478 if (flags & TRYAGAIN)
5482 else if (ret == NULL)
5484 *flagp |= flags&HASWIDTH;
5485 if (chain == NULL) /* First piece. */
5486 *flagp |= flags&SPSTART;
5489 REGTAIL(pRExC_state, chain, latest);
5494 if (chain == NULL) { /* Loop ran zero times. */
5495 chain = reg_node(pRExC_state, NOTHING);
5500 *flagp |= flags&SIMPLE;
5507 - regpiece - something followed by possible [*+?]
5509 * Note that the branching code sequences used for ? and the general cases
5510 * of * and + are somewhat optimized: they use the same NOTHING node as
5511 * both the endmarker for their branch list and the body of the last branch.
5512 * It might seem that this node could be dispensed with entirely, but the
5513 * endmarker role is not redundant.
5516 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
5519 register regnode *ret;
5521 register char *next;
5523 const char * const origparse = RExC_parse;
5525 I32 max = REG_INFTY;
5527 const char *maxpos = NULL;
5528 GET_RE_DEBUG_FLAGS_DECL;
5529 DEBUG_PARSE("piec");
5531 ret = regatom(pRExC_state, &flags,depth+1);
5533 if (flags & TRYAGAIN)
5540 if (op == '{' && regcurly(RExC_parse)) {
5542 parse_start = RExC_parse; /* MJD */
5543 next = RExC_parse + 1;
5544 while (isDIGIT(*next) || *next == ',') {
5553 if (*next == '}') { /* got one */
5557 min = atoi(RExC_parse);
5561 maxpos = RExC_parse;
5563 if (!max && *maxpos != '0')
5564 max = REG_INFTY; /* meaning "infinity" */
5565 else if (max >= REG_INFTY)
5566 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
5568 nextchar(pRExC_state);
5571 if ((flags&SIMPLE)) {
5572 RExC_naughty += 2 + RExC_naughty / 2;
5573 reginsert(pRExC_state, CURLY, ret, depth+1);
5574 Set_Node_Offset(ret, parse_start+1); /* MJD */
5575 Set_Node_Cur_Length(ret);
5578 regnode * const w = reg_node(pRExC_state, WHILEM);
5581 REGTAIL(pRExC_state, ret, w);
5582 if (!SIZE_ONLY && RExC_extralen) {
5583 reginsert(pRExC_state, LONGJMP,ret, depth+1);
5584 reginsert(pRExC_state, NOTHING,ret, depth+1);
5585 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
5587 reginsert(pRExC_state, CURLYX,ret, depth+1);
5589 Set_Node_Offset(ret, parse_start+1);
5590 Set_Node_Length(ret,
5591 op == '{' ? (RExC_parse - parse_start) : 1);
5593 if (!SIZE_ONLY && RExC_extralen)
5594 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
5595 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
5597 RExC_whilem_seen++, RExC_extralen += 3;
5598 RExC_naughty += 4 + RExC_naughty; /* compound interest */
5606 if (max && max < min)
5607 vFAIL("Can't do {n,m} with n > m");
5609 ARG1_SET(ret, (U16)min);
5610 ARG2_SET(ret, (U16)max);
5622 #if 0 /* Now runtime fix should be reliable. */
5624 /* if this is reinstated, don't forget to put this back into perldiag:
5626 =item Regexp *+ operand could be empty at {#} in regex m/%s/
5628 (F) The part of the regexp subject to either the * or + quantifier
5629 could match an empty string. The {#} shows in the regular
5630 expression about where the problem was discovered.
5634 if (!(flags&HASWIDTH) && op != '?')
5635 vFAIL("Regexp *+ operand could be empty");
5638 parse_start = RExC_parse;
5639 nextchar(pRExC_state);
5641 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
5643 if (op == '*' && (flags&SIMPLE)) {
5644 reginsert(pRExC_state, STAR, ret, depth+1);
5648 else if (op == '*') {
5652 else if (op == '+' && (flags&SIMPLE)) {
5653 reginsert(pRExC_state, PLUS, ret, depth+1);
5657 else if (op == '+') {
5661 else if (op == '?') {
5666 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
5668 "%.*s matches null string many times",
5669 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
5673 if (RExC_parse < RExC_end && *RExC_parse == '?') {
5674 nextchar(pRExC_state);
5675 reginsert(pRExC_state, MINMOD, ret, depth+1);
5676 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
5678 #ifndef REG_ALLOW_MINMOD_SUSPEND
5681 if (RExC_parse < RExC_end && *RExC_parse == '+') {
5683 nextchar(pRExC_state);
5684 ender = reg_node(pRExC_state, SUCCEED);
5685 REGTAIL(pRExC_state, ret, ender);
5686 reginsert(pRExC_state, SUSPEND, ret, depth+1);
5688 ender = reg_node(pRExC_state, TAIL);
5689 REGTAIL(pRExC_state, ret, ender);
5693 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
5695 vFAIL("Nested quantifiers");
5702 /* reg_namedseq(pRExC_state,UVp)
5704 This is expected to be called by a parser routine that has
5705 recognized'\N' and needs to handle the rest. RExC_parse is
5706 expected to point at the first char following the N at the time
5709 If valuep is non-null then it is assumed that we are parsing inside
5710 of a charclass definition and the first codepoint in the resolved
5711 string is returned via *valuep and the routine will return NULL.
5712 In this mode if a multichar string is returned from the charnames
5713 handler a warning will be issued, and only the first char in the
5714 sequence will be examined. If the string returned is zero length
5715 then the value of *valuep is undefined and NON-NULL will
5716 be returned to indicate failure. (This will NOT be a valid pointer
5719 If value is null then it is assumed that we are parsing normal text
5720 and inserts a new EXACT node into the program containing the resolved
5721 string and returns a pointer to the new node. If the string is
5722 zerolength a NOTHING node is emitted.
5724 On success RExC_parse is set to the char following the endbrace.
5725 Parsing failures will generate a fatal errorvia vFAIL(...)
5727 NOTE: We cache all results from the charnames handler locally in
5728 the RExC_charnames hash (created on first use) to prevent a charnames
5729 handler from playing silly-buggers and returning a short string and
5730 then a long string for a given pattern. Since the regexp program
5731 size is calculated during an initial parse this would result
5732 in a buffer overrun so we cache to prevent the charname result from
5733 changing during the course of the parse.
5737 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep)
5739 char * name; /* start of the content of the name */
5740 char * endbrace; /* endbrace following the name */
5743 STRLEN len; /* this has various purposes throughout the code */
5744 bool cached = 0; /* if this is true then we shouldn't refcount dev sv_str */
5745 regnode *ret = NULL;
5747 if (*RExC_parse != '{') {
5748 vFAIL("Missing braces on \\N{}");
5750 name = RExC_parse+1;
5751 endbrace = strchr(RExC_parse, '}');
5754 vFAIL("Missing right brace on \\N{}");
5756 RExC_parse = endbrace + 1;
5759 /* RExC_parse points at the beginning brace,
5760 endbrace points at the last */
5761 if ( name[0]=='U' && name[1]=='+' ) {
5762 /* its a "unicode hex" notation {U+89AB} */
5763 I32 fl = PERL_SCAN_ALLOW_UNDERSCORES
5764 | PERL_SCAN_DISALLOW_PREFIX
5765 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
5767 len = (STRLEN)(endbrace - name - 2);
5768 cp = grok_hex(name + 2, &len, &fl, NULL);
5769 if ( len != (STRLEN)(endbrace - name - 2) ) {
5778 sv_str= Perl_newSVpvf_nocontext("%c",(int)cp);
5780 /* fetch the charnames handler for this scope */
5781 HV * const table = GvHV(PL_hintgv);
5783 hv_fetchs(table, "charnames", FALSE) :
5785 SV *cv= cvp ? *cvp : NULL;
5788 /* create an SV with the name as argument */
5789 sv_name = newSVpvn(name, endbrace - name);
5791 if (!table || !(PL_hints & HINT_LOCALIZE_HH)) {
5792 vFAIL2("Constant(\\N{%s}) unknown: "
5793 "(possibly a missing \"use charnames ...\")",
5796 if (!cvp || !SvOK(*cvp)) { /* when $^H{charnames} = undef; */
5797 vFAIL2("Constant(\\N{%s}): "
5798 "$^H{charnames} is not defined",SvPVX(sv_name));
5803 if (!RExC_charnames) {
5804 /* make sure our cache is allocated */
5805 RExC_charnames = newHV();
5806 sv_2mortal((SV*)RExC_charnames);
5808 /* see if we have looked this one up before */
5809 he_str = hv_fetch_ent( RExC_charnames, sv_name, 0, 0 );
5811 sv_str = HeVAL(he_str);
5824 count= call_sv(cv, G_SCALAR);
5826 if (count == 1) { /* XXXX is this right? dmq */
5828 SvREFCNT_inc_simple_void(sv_str);
5836 if ( !sv_str || !SvOK(sv_str) ) {
5837 vFAIL2("Constant(\\N{%s}): Call to &{$^H{charnames}} "
5838 "did not return a defined value",SvPVX(sv_name));
5840 if (hv_store_ent( RExC_charnames, sv_name, sv_str, 0))
5845 char *p = SvPV(sv_str, len);
5848 if ( SvUTF8(sv_str) ) {
5849 *valuep = utf8_to_uvchr((U8*)p, &numlen);
5853 We have to turn on utf8 for high bit chars otherwise
5854 we get failures with
5856 "ss" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
5857 "SS" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
5859 This is different from what \x{} would do with the same
5860 codepoint, where the condition is > 0xFF.
5867 /* warn if we havent used the whole string? */
5869 if (numlen<len && SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5871 "Ignoring excess chars from \\N{%s} in character class",
5875 } else if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5877 "Ignoring zero length \\N{%s} in character class",
5882 SvREFCNT_dec(sv_name);
5884 SvREFCNT_dec(sv_str);
5885 return len ? NULL : (regnode *)&len;
5886 } else if(SvCUR(sv_str)) {
5891 char * parse_start = name-3; /* needed for the offsets */
5892 GET_RE_DEBUG_FLAGS_DECL; /* needed for the offsets */
5894 ret = reg_node(pRExC_state,
5895 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
5898 if ( RExC_utf8 && !SvUTF8(sv_str) ) {
5899 sv_utf8_upgrade(sv_str);
5900 } else if ( !RExC_utf8 && SvUTF8(sv_str) ) {
5904 p = SvPV(sv_str, len);
5906 /* len is the length written, charlen is the size the char read */
5907 for ( len = 0; p < pend; p += charlen ) {
5909 UV uvc = utf8_to_uvchr((U8*)p, &charlen);
5911 STRLEN foldlen,numlen;
5912 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
5913 uvc = toFOLD_uni(uvc, tmpbuf, &foldlen);
5914 /* Emit all the Unicode characters. */
5916 for (foldbuf = tmpbuf;
5920 uvc = utf8_to_uvchr(foldbuf, &numlen);
5922 const STRLEN unilen = reguni(pRExC_state, uvc, s);
5925 /* In EBCDIC the numlen
5926 * and unilen can differ. */
5928 if (numlen >= foldlen)
5932 break; /* "Can't happen." */
5935 const STRLEN unilen = reguni(pRExC_state, uvc, s);
5947 RExC_size += STR_SZ(len);
5950 RExC_emit += STR_SZ(len);
5952 Set_Node_Cur_Length(ret); /* MJD */
5954 nextchar(pRExC_state);
5956 ret = reg_node(pRExC_state,NOTHING);
5959 SvREFCNT_dec(sv_str);
5962 SvREFCNT_dec(sv_name);
5972 * It returns the code point in utf8 for the value in *encp.
5973 * value: a code value in the source encoding
5974 * encp: a pointer to an Encode object
5976 * If the result from Encode is not a single character,
5977 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
5980 S_reg_recode(pTHX_ const char value, SV **encp)
5983 SV * const sv = sv_2mortal(newSVpvn(&value, numlen));
5984 const char * const s = encp && *encp ? sv_recode_to_utf8(sv, *encp)
5986 const STRLEN newlen = SvCUR(sv);
5987 UV uv = UNICODE_REPLACEMENT;
5991 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
5994 if (!newlen || numlen != newlen) {
5995 uv = UNICODE_REPLACEMENT;
6004 - regatom - the lowest level
6006 * Optimization: gobbles an entire sequence of ordinary characters so that
6007 * it can turn them into a single node, which is smaller to store and
6008 * faster to run. Backslashed characters are exceptions, each becoming a
6009 * separate node; the code is simpler that way and it's not worth fixing.
6011 * [Yes, it is worth fixing, some scripts can run twice the speed.]
6012 * [It looks like its ok, as in S_study_chunk we merge adjacent EXACT nodes]
6015 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6018 register regnode *ret = NULL;
6020 char *parse_start = RExC_parse;
6021 GET_RE_DEBUG_FLAGS_DECL;
6022 DEBUG_PARSE("atom");
6023 *flagp = WORST; /* Tentatively. */
6026 switch (*RExC_parse) {
6028 RExC_seen_zerolen++;
6029 nextchar(pRExC_state);
6030 if (RExC_flags & PMf_MULTILINE)
6031 ret = reg_node(pRExC_state, MBOL);
6032 else if (RExC_flags & PMf_SINGLELINE)
6033 ret = reg_node(pRExC_state, SBOL);
6035 ret = reg_node(pRExC_state, BOL);
6036 Set_Node_Length(ret, 1); /* MJD */
6039 nextchar(pRExC_state);
6041 RExC_seen_zerolen++;
6042 if (RExC_flags & PMf_MULTILINE)
6043 ret = reg_node(pRExC_state, MEOL);
6044 else if (RExC_flags & PMf_SINGLELINE)
6045 ret = reg_node(pRExC_state, SEOL);
6047 ret = reg_node(pRExC_state, EOL);
6048 Set_Node_Length(ret, 1); /* MJD */
6051 nextchar(pRExC_state);
6052 if (RExC_flags & PMf_SINGLELINE)
6053 ret = reg_node(pRExC_state, SANY);
6055 ret = reg_node(pRExC_state, REG_ANY);
6056 *flagp |= HASWIDTH|SIMPLE;
6058 Set_Node_Length(ret, 1); /* MJD */
6062 char * const oregcomp_parse = ++RExC_parse;
6063 ret = regclass(pRExC_state,depth+1);
6064 if (*RExC_parse != ']') {
6065 RExC_parse = oregcomp_parse;
6066 vFAIL("Unmatched [");
6068 nextchar(pRExC_state);
6069 *flagp |= HASWIDTH|SIMPLE;
6070 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
6074 nextchar(pRExC_state);
6075 ret = reg(pRExC_state, 1, &flags,depth+1);
6077 if (flags & TRYAGAIN) {
6078 if (RExC_parse == RExC_end) {
6079 /* Make parent create an empty node if needed. */
6087 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
6091 if (flags & TRYAGAIN) {
6095 vFAIL("Internal urp");
6096 /* Supposed to be caught earlier. */
6099 if (!regcurly(RExC_parse)) {
6108 vFAIL("Quantifier follows nothing");
6111 switch (*++RExC_parse) {
6113 RExC_seen_zerolen++;
6114 ret = reg_node(pRExC_state, SBOL);
6116 nextchar(pRExC_state);
6117 Set_Node_Length(ret, 2); /* MJD */
6120 ret = reg_node(pRExC_state, GPOS);
6121 RExC_seen |= REG_SEEN_GPOS;
6123 nextchar(pRExC_state);
6124 Set_Node_Length(ret, 2); /* MJD */
6127 ret = reg_node(pRExC_state, SEOL);
6129 RExC_seen_zerolen++; /* Do not optimize RE away */
6130 nextchar(pRExC_state);
6133 ret = reg_node(pRExC_state, EOS);
6135 RExC_seen_zerolen++; /* Do not optimize RE away */
6136 nextchar(pRExC_state);
6137 Set_Node_Length(ret, 2); /* MJD */
6140 ret = reg_node(pRExC_state, CANY);
6141 RExC_seen |= REG_SEEN_CANY;
6142 *flagp |= HASWIDTH|SIMPLE;
6143 nextchar(pRExC_state);
6144 Set_Node_Length(ret, 2); /* MJD */
6147 ret = reg_node(pRExC_state, CLUMP);
6149 nextchar(pRExC_state);
6150 Set_Node_Length(ret, 2); /* MJD */
6153 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
6154 *flagp |= HASWIDTH|SIMPLE;
6155 nextchar(pRExC_state);
6156 Set_Node_Length(ret, 2); /* MJD */
6159 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
6160 *flagp |= HASWIDTH|SIMPLE;
6161 nextchar(pRExC_state);
6162 Set_Node_Length(ret, 2); /* MJD */
6165 RExC_seen_zerolen++;
6166 RExC_seen |= REG_SEEN_LOOKBEHIND;
6167 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
6169 nextchar(pRExC_state);
6170 Set_Node_Length(ret, 2); /* MJD */
6173 RExC_seen_zerolen++;
6174 RExC_seen |= REG_SEEN_LOOKBEHIND;
6175 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
6177 nextchar(pRExC_state);
6178 Set_Node_Length(ret, 2); /* MJD */
6181 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
6182 *flagp |= HASWIDTH|SIMPLE;
6183 nextchar(pRExC_state);
6184 Set_Node_Length(ret, 2); /* MJD */
6187 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
6188 *flagp |= HASWIDTH|SIMPLE;
6189 nextchar(pRExC_state);
6190 Set_Node_Length(ret, 2); /* MJD */
6193 ret = reg_node(pRExC_state, DIGIT);
6194 *flagp |= HASWIDTH|SIMPLE;
6195 nextchar(pRExC_state);
6196 Set_Node_Length(ret, 2); /* MJD */
6199 ret = reg_node(pRExC_state, NDIGIT);
6200 *flagp |= HASWIDTH|SIMPLE;
6201 nextchar(pRExC_state);
6202 Set_Node_Length(ret, 2); /* MJD */
6207 char* const oldregxend = RExC_end;
6208 char* parse_start = RExC_parse - 2;
6210 if (RExC_parse[1] == '{') {
6211 /* a lovely hack--pretend we saw [\pX] instead */
6212 RExC_end = strchr(RExC_parse, '}');
6214 const U8 c = (U8)*RExC_parse;
6216 RExC_end = oldregxend;
6217 vFAIL2("Missing right brace on \\%c{}", c);
6222 RExC_end = RExC_parse + 2;
6223 if (RExC_end > oldregxend)
6224 RExC_end = oldregxend;
6228 ret = regclass(pRExC_state,depth+1);
6230 RExC_end = oldregxend;
6233 Set_Node_Offset(ret, parse_start + 2);
6234 Set_Node_Cur_Length(ret);
6235 nextchar(pRExC_state);
6236 *flagp |= HASWIDTH|SIMPLE;
6240 /* Handle \N{NAME} here and not below because it can be
6241 multicharacter. join_exact() will join them up later on.
6242 Also this makes sure that things like /\N{BLAH}+/ and
6243 \N{BLAH} being multi char Just Happen. dmq*/
6245 ret= reg_namedseq(pRExC_state, NULL);
6247 case 'k': /* Handle \k<NAME> and \k'NAME' */
6249 char ch= RExC_parse[1];
6250 if (ch != '<' && ch != '\'') {
6252 vWARN( RExC_parse + 1,
6253 "Possible broken named back reference treated as literal k");
6257 char* name_start = (RExC_parse += 2);
6259 SV *sv_dat = reg_scan_name(pRExC_state,
6260 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6261 ch= (ch == '<') ? '>' : '\'';
6263 if (RExC_parse == name_start || *RExC_parse != ch)
6264 vFAIL2("Sequence \\k%c... not terminated",
6265 (ch == '>' ? '<' : ch));
6268 ret = reganode(pRExC_state,
6269 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
6275 num = add_data( pRExC_state, 1, "S" );
6277 RExC_rx->data->data[num]=(void*)sv_dat;
6278 SvREFCNT_inc(sv_dat);
6280 /* override incorrect value set in reganode MJD */
6281 Set_Node_Offset(ret, parse_start+1);
6282 Set_Node_Cur_Length(ret); /* MJD */
6283 nextchar(pRExC_state);
6299 case '1': case '2': case '3': case '4':
6300 case '5': case '6': case '7': case '8': case '9':
6303 bool isrel=(*RExC_parse=='R');
6306 num = atoi(RExC_parse);
6308 num = RExC_cpar - num;
6310 vFAIL("Reference to nonexistent or unclosed group");
6312 if (num > 9 && num >= RExC_npar)
6315 char * const parse_start = RExC_parse - 1; /* MJD */
6316 while (isDIGIT(*RExC_parse))
6320 if (num > (I32)RExC_rx->nparens)
6321 vFAIL("Reference to nonexistent group");
6322 /* People make this error all the time apparently.
6323 So we cant fail on it, even though we should
6325 else if (num >= RExC_cpar)
6326 vFAIL("Reference to unclosed group will always match");
6330 ret = reganode(pRExC_state,
6331 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
6335 /* override incorrect value set in reganode MJD */
6336 Set_Node_Offset(ret, parse_start+1);
6337 Set_Node_Cur_Length(ret); /* MJD */
6339 nextchar(pRExC_state);
6344 if (RExC_parse >= RExC_end)
6345 FAIL("Trailing \\");
6348 /* Do not generate "unrecognized" warnings here, we fall
6349 back into the quick-grab loop below */
6356 if (RExC_flags & PMf_EXTENDED) {
6357 while (RExC_parse < RExC_end && *RExC_parse != '\n')
6359 if (RExC_parse < RExC_end)
6365 register STRLEN len;
6370 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
6372 parse_start = RExC_parse - 1;
6378 ret = reg_node(pRExC_state,
6379 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
6381 for (len = 0, p = RExC_parse - 1;
6382 len < 127 && p < RExC_end;
6385 char * const oldp = p;
6387 if (RExC_flags & PMf_EXTENDED)
6388 p = regwhite(p, RExC_end);
6437 ender = ASCII_TO_NATIVE('\033');
6441 ender = ASCII_TO_NATIVE('\007');
6446 char* const e = strchr(p, '}');
6450 vFAIL("Missing right brace on \\x{}");
6453 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6454 | PERL_SCAN_DISALLOW_PREFIX;
6455 STRLEN numlen = e - p - 1;
6456 ender = grok_hex(p + 1, &numlen, &flags, NULL);
6463 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
6465 ender = grok_hex(p, &numlen, &flags, NULL);
6468 if (PL_encoding && ender < 0x100)
6469 goto recode_encoding;
6473 ender = UCHARAT(p++);
6474 ender = toCTRL(ender);
6476 case '0': case '1': case '2': case '3':case '4':
6477 case '5': case '6': case '7': case '8':case '9':
6479 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
6482 ender = grok_oct(p, &numlen, &flags, NULL);
6489 if (PL_encoding && ender < 0x100)
6490 goto recode_encoding;
6494 SV* enc = PL_encoding;
6495 ender = reg_recode((const char)(U8)ender, &enc);
6496 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
6497 vWARN(p, "Invalid escape in the specified encoding");
6503 FAIL("Trailing \\");
6506 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
6507 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
6508 goto normal_default;
6513 if (UTF8_IS_START(*p) && UTF) {
6515 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
6516 &numlen, UTF8_ALLOW_DEFAULT);
6523 if (RExC_flags & PMf_EXTENDED)
6524 p = regwhite(p, RExC_end);
6526 /* Prime the casefolded buffer. */
6527 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
6529 if (ISMULT2(p)) { /* Back off on ?+*. */
6534 /* Emit all the Unicode characters. */
6536 for (foldbuf = tmpbuf;
6538 foldlen -= numlen) {
6539 ender = utf8_to_uvchr(foldbuf, &numlen);
6541 const STRLEN unilen = reguni(pRExC_state, ender, s);
6544 /* In EBCDIC the numlen
6545 * and unilen can differ. */
6547 if (numlen >= foldlen)
6551 break; /* "Can't happen." */
6555 const STRLEN unilen = reguni(pRExC_state, ender, s);
6564 REGC((char)ender, s++);
6570 /* Emit all the Unicode characters. */
6572 for (foldbuf = tmpbuf;
6574 foldlen -= numlen) {
6575 ender = utf8_to_uvchr(foldbuf, &numlen);
6577 const STRLEN unilen = reguni(pRExC_state, ender, s);
6580 /* In EBCDIC the numlen
6581 * and unilen can differ. */
6583 if (numlen >= foldlen)
6591 const STRLEN unilen = reguni(pRExC_state, ender, s);
6600 REGC((char)ender, s++);
6604 Set_Node_Cur_Length(ret); /* MJD */
6605 nextchar(pRExC_state);
6607 /* len is STRLEN which is unsigned, need to copy to signed */
6610 vFAIL("Internal disaster");
6614 if (len == 1 && UNI_IS_INVARIANT(ender))
6618 RExC_size += STR_SZ(len);
6621 RExC_emit += STR_SZ(len);
6631 S_regwhite(char *p, const char *e)
6636 else if (*p == '#') {
6639 } while (p < e && *p != '\n');
6647 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
6648 Character classes ([:foo:]) can also be negated ([:^foo:]).
6649 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
6650 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
6651 but trigger failures because they are currently unimplemented. */
6653 #define POSIXCC_DONE(c) ((c) == ':')
6654 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
6655 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
6658 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
6661 I32 namedclass = OOB_NAMEDCLASS;
6663 if (value == '[' && RExC_parse + 1 < RExC_end &&
6664 /* I smell either [: or [= or [. -- POSIX has been here, right? */
6665 POSIXCC(UCHARAT(RExC_parse))) {
6666 const char c = UCHARAT(RExC_parse);
6667 char* const s = RExC_parse++;
6669 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
6671 if (RExC_parse == RExC_end)
6672 /* Grandfather lone [:, [=, [. */
6675 const char* const t = RExC_parse++; /* skip over the c */
6678 if (UCHARAT(RExC_parse) == ']') {
6679 const char *posixcc = s + 1;
6680 RExC_parse++; /* skip over the ending ] */
6683 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
6684 const I32 skip = t - posixcc;
6686 /* Initially switch on the length of the name. */
6689 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
6690 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
6693 /* Names all of length 5. */
6694 /* alnum alpha ascii blank cntrl digit graph lower
6695 print punct space upper */
6696 /* Offset 4 gives the best switch position. */
6697 switch (posixcc[4]) {
6699 if (memEQ(posixcc, "alph", 4)) /* alpha */
6700 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
6703 if (memEQ(posixcc, "spac", 4)) /* space */
6704 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
6707 if (memEQ(posixcc, "grap", 4)) /* graph */
6708 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
6711 if (memEQ(posixcc, "asci", 4)) /* ascii */
6712 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
6715 if (memEQ(posixcc, "blan", 4)) /* blank */
6716 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
6719 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
6720 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
6723 if (memEQ(posixcc, "alnu", 4)) /* alnum */
6724 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
6727 if (memEQ(posixcc, "lowe", 4)) /* lower */
6728 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
6729 else if (memEQ(posixcc, "uppe", 4)) /* upper */
6730 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
6733 if (memEQ(posixcc, "digi", 4)) /* digit */
6734 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
6735 else if (memEQ(posixcc, "prin", 4)) /* print */
6736 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
6737 else if (memEQ(posixcc, "punc", 4)) /* punct */
6738 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
6743 if (memEQ(posixcc, "xdigit", 6))
6744 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
6748 if (namedclass == OOB_NAMEDCLASS)
6749 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
6751 assert (posixcc[skip] == ':');
6752 assert (posixcc[skip+1] == ']');
6753 } else if (!SIZE_ONLY) {
6754 /* [[=foo=]] and [[.foo.]] are still future. */
6756 /* adjust RExC_parse so the warning shows after
6758 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
6760 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
6763 /* Maternal grandfather:
6764 * "[:" ending in ":" but not in ":]" */
6774 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
6777 if (POSIXCC(UCHARAT(RExC_parse))) {
6778 const char *s = RExC_parse;
6779 const char c = *s++;
6783 if (*s && c == *s && s[1] == ']') {
6784 if (ckWARN(WARN_REGEXP))
6786 "POSIX syntax [%c %c] belongs inside character classes",
6789 /* [[=foo=]] and [[.foo.]] are still future. */
6790 if (POSIXCC_NOTYET(c)) {
6791 /* adjust RExC_parse so the error shows after
6793 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
6795 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
6803 parse a class specification and produce either an ANYOF node that
6804 matches the pattern. If the pattern matches a single char only and
6805 that char is < 256 then we produce an EXACT node instead.
6808 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
6811 register UV value = 0;
6812 register UV nextvalue;
6813 register IV prevvalue = OOB_UNICODE;
6814 register IV range = 0;
6815 register regnode *ret;
6818 char *rangebegin = NULL;
6819 bool need_class = 0;
6822 bool optimize_invert = TRUE;
6823 AV* unicode_alternate = NULL;
6825 UV literal_endpoint = 0;
6827 UV stored = 0; /* number of chars stored in the class */
6829 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
6830 case we need to change the emitted regop to an EXACT. */
6831 const char * orig_parse = RExC_parse;
6832 GET_RE_DEBUG_FLAGS_DECL;
6834 PERL_UNUSED_ARG(depth);
6837 DEBUG_PARSE("clas");
6839 /* Assume we are going to generate an ANYOF node. */
6840 ret = reganode(pRExC_state, ANYOF, 0);
6843 ANYOF_FLAGS(ret) = 0;
6845 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
6849 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
6853 RExC_size += ANYOF_SKIP;
6854 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
6857 RExC_emit += ANYOF_SKIP;
6859 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
6861 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
6862 ANYOF_BITMAP_ZERO(ret);
6863 listsv = newSVpvs("# comment\n");
6866 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
6868 if (!SIZE_ONLY && POSIXCC(nextvalue))
6869 checkposixcc(pRExC_state);
6871 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
6872 if (UCHARAT(RExC_parse) == ']')
6876 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
6880 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
6883 rangebegin = RExC_parse;
6885 value = utf8n_to_uvchr((U8*)RExC_parse,
6886 RExC_end - RExC_parse,
6887 &numlen, UTF8_ALLOW_DEFAULT);
6888 RExC_parse += numlen;
6891 value = UCHARAT(RExC_parse++);
6893 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
6894 if (value == '[' && POSIXCC(nextvalue))
6895 namedclass = regpposixcc(pRExC_state, value);
6896 else if (value == '\\') {
6898 value = utf8n_to_uvchr((U8*)RExC_parse,
6899 RExC_end - RExC_parse,
6900 &numlen, UTF8_ALLOW_DEFAULT);
6901 RExC_parse += numlen;
6904 value = UCHARAT(RExC_parse++);
6905 /* Some compilers cannot handle switching on 64-bit integer
6906 * values, therefore value cannot be an UV. Yes, this will
6907 * be a problem later if we want switch on Unicode.
6908 * A similar issue a little bit later when switching on
6909 * namedclass. --jhi */
6910 switch ((I32)value) {
6911 case 'w': namedclass = ANYOF_ALNUM; break;
6912 case 'W': namedclass = ANYOF_NALNUM; break;
6913 case 's': namedclass = ANYOF_SPACE; break;
6914 case 'S': namedclass = ANYOF_NSPACE; break;
6915 case 'd': namedclass = ANYOF_DIGIT; break;
6916 case 'D': namedclass = ANYOF_NDIGIT; break;
6917 case 'N': /* Handle \N{NAME} in class */
6919 /* We only pay attention to the first char of
6920 multichar strings being returned. I kinda wonder
6921 if this makes sense as it does change the behaviour
6922 from earlier versions, OTOH that behaviour was broken
6924 UV v; /* value is register so we cant & it /grrr */
6925 if (reg_namedseq(pRExC_state, &v)) {
6935 if (RExC_parse >= RExC_end)
6936 vFAIL2("Empty \\%c{}", (U8)value);
6937 if (*RExC_parse == '{') {
6938 const U8 c = (U8)value;
6939 e = strchr(RExC_parse++, '}');
6941 vFAIL2("Missing right brace on \\%c{}", c);
6942 while (isSPACE(UCHARAT(RExC_parse)))
6944 if (e == RExC_parse)
6945 vFAIL2("Empty \\%c{}", c);
6947 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
6955 if (UCHARAT(RExC_parse) == '^') {
6958 value = value == 'p' ? 'P' : 'p'; /* toggle */
6959 while (isSPACE(UCHARAT(RExC_parse))) {
6964 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
6965 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
6968 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
6969 namedclass = ANYOF_MAX; /* no official name, but it's named */
6972 case 'n': value = '\n'; break;
6973 case 'r': value = '\r'; break;
6974 case 't': value = '\t'; break;
6975 case 'f': value = '\f'; break;
6976 case 'b': value = '\b'; break;
6977 case 'e': value = ASCII_TO_NATIVE('\033');break;
6978 case 'a': value = ASCII_TO_NATIVE('\007');break;
6980 if (*RExC_parse == '{') {
6981 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6982 | PERL_SCAN_DISALLOW_PREFIX;
6983 char * const e = strchr(RExC_parse++, '}');
6985 vFAIL("Missing right brace on \\x{}");
6987 numlen = e - RExC_parse;
6988 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
6992 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
6994 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
6995 RExC_parse += numlen;
6997 if (PL_encoding && value < 0x100)
6998 goto recode_encoding;
7001 value = UCHARAT(RExC_parse++);
7002 value = toCTRL(value);
7004 case '0': case '1': case '2': case '3': case '4':
7005 case '5': case '6': case '7': case '8': case '9':
7009 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
7010 RExC_parse += numlen;
7011 if (PL_encoding && value < 0x100)
7012 goto recode_encoding;
7017 SV* enc = PL_encoding;
7018 value = reg_recode((const char)(U8)value, &enc);
7019 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
7021 "Invalid escape in the specified encoding");
7025 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
7027 "Unrecognized escape \\%c in character class passed through",
7031 } /* end of \blah */
7037 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
7039 if (!SIZE_ONLY && !need_class)
7040 ANYOF_CLASS_ZERO(ret);
7044 /* a bad range like a-\d, a-[:digit:] ? */
7047 if (ckWARN(WARN_REGEXP)) {
7049 RExC_parse >= rangebegin ?
7050 RExC_parse - rangebegin : 0;
7052 "False [] range \"%*.*s\"",
7055 if (prevvalue < 256) {
7056 ANYOF_BITMAP_SET(ret, prevvalue);
7057 ANYOF_BITMAP_SET(ret, '-');
7060 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
7061 Perl_sv_catpvf(aTHX_ listsv,
7062 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
7066 range = 0; /* this was not a true range */
7070 const char *what = NULL;
7073 if (namedclass > OOB_NAMEDCLASS)
7074 optimize_invert = FALSE;
7075 /* Possible truncation here but in some 64-bit environments
7076 * the compiler gets heartburn about switch on 64-bit values.
7077 * A similar issue a little earlier when switching on value.
7079 switch ((I32)namedclass) {
7082 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
7084 for (value = 0; value < 256; value++)
7086 ANYOF_BITMAP_SET(ret, value);
7093 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
7095 for (value = 0; value < 256; value++)
7096 if (!isALNUM(value))
7097 ANYOF_BITMAP_SET(ret, value);
7104 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
7106 for (value = 0; value < 256; value++)
7107 if (isALNUMC(value))
7108 ANYOF_BITMAP_SET(ret, value);
7115 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
7117 for (value = 0; value < 256; value++)
7118 if (!isALNUMC(value))
7119 ANYOF_BITMAP_SET(ret, value);
7126 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
7128 for (value = 0; value < 256; value++)
7130 ANYOF_BITMAP_SET(ret, value);
7137 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
7139 for (value = 0; value < 256; value++)
7140 if (!isALPHA(value))
7141 ANYOF_BITMAP_SET(ret, value);
7148 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
7151 for (value = 0; value < 128; value++)
7152 ANYOF_BITMAP_SET(ret, value);
7154 for (value = 0; value < 256; value++) {
7156 ANYOF_BITMAP_SET(ret, value);
7165 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
7168 for (value = 128; value < 256; value++)
7169 ANYOF_BITMAP_SET(ret, value);
7171 for (value = 0; value < 256; value++) {
7172 if (!isASCII(value))
7173 ANYOF_BITMAP_SET(ret, value);
7182 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
7184 for (value = 0; value < 256; value++)
7186 ANYOF_BITMAP_SET(ret, value);
7193 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
7195 for (value = 0; value < 256; value++)
7196 if (!isBLANK(value))
7197 ANYOF_BITMAP_SET(ret, value);
7204 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
7206 for (value = 0; value < 256; value++)
7208 ANYOF_BITMAP_SET(ret, value);
7215 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
7217 for (value = 0; value < 256; value++)
7218 if (!isCNTRL(value))
7219 ANYOF_BITMAP_SET(ret, value);
7226 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
7228 /* consecutive digits assumed */
7229 for (value = '0'; value <= '9'; value++)
7230 ANYOF_BITMAP_SET(ret, value);
7237 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
7239 /* consecutive digits assumed */
7240 for (value = 0; value < '0'; value++)
7241 ANYOF_BITMAP_SET(ret, value);
7242 for (value = '9' + 1; value < 256; value++)
7243 ANYOF_BITMAP_SET(ret, value);
7250 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
7252 for (value = 0; value < 256; value++)
7254 ANYOF_BITMAP_SET(ret, value);
7261 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
7263 for (value = 0; value < 256; value++)
7264 if (!isGRAPH(value))
7265 ANYOF_BITMAP_SET(ret, value);
7272 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
7274 for (value = 0; value < 256; value++)
7276 ANYOF_BITMAP_SET(ret, value);
7283 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
7285 for (value = 0; value < 256; value++)
7286 if (!isLOWER(value))
7287 ANYOF_BITMAP_SET(ret, value);
7294 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
7296 for (value = 0; value < 256; value++)
7298 ANYOF_BITMAP_SET(ret, value);
7305 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
7307 for (value = 0; value < 256; value++)
7308 if (!isPRINT(value))
7309 ANYOF_BITMAP_SET(ret, value);
7316 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
7318 for (value = 0; value < 256; value++)
7319 if (isPSXSPC(value))
7320 ANYOF_BITMAP_SET(ret, value);
7327 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
7329 for (value = 0; value < 256; value++)
7330 if (!isPSXSPC(value))
7331 ANYOF_BITMAP_SET(ret, value);
7338 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
7340 for (value = 0; value < 256; value++)
7342 ANYOF_BITMAP_SET(ret, value);
7349 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
7351 for (value = 0; value < 256; value++)
7352 if (!isPUNCT(value))
7353 ANYOF_BITMAP_SET(ret, value);
7360 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
7362 for (value = 0; value < 256; value++)
7364 ANYOF_BITMAP_SET(ret, value);
7371 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
7373 for (value = 0; value < 256; value++)
7374 if (!isSPACE(value))
7375 ANYOF_BITMAP_SET(ret, value);
7382 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
7384 for (value = 0; value < 256; value++)
7386 ANYOF_BITMAP_SET(ret, value);
7393 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
7395 for (value = 0; value < 256; value++)
7396 if (!isUPPER(value))
7397 ANYOF_BITMAP_SET(ret, value);
7404 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
7406 for (value = 0; value < 256; value++)
7407 if (isXDIGIT(value))
7408 ANYOF_BITMAP_SET(ret, value);
7415 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
7417 for (value = 0; value < 256; value++)
7418 if (!isXDIGIT(value))
7419 ANYOF_BITMAP_SET(ret, value);
7425 /* this is to handle \p and \P */
7428 vFAIL("Invalid [::] class");
7432 /* Strings such as "+utf8::isWord\n" */
7433 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
7436 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
7439 } /* end of namedclass \blah */
7442 if (prevvalue > (IV)value) /* b-a */ {
7443 const int w = RExC_parse - rangebegin;
7444 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
7445 range = 0; /* not a valid range */
7449 prevvalue = value; /* save the beginning of the range */
7450 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
7451 RExC_parse[1] != ']') {
7454 /* a bad range like \w-, [:word:]- ? */
7455 if (namedclass > OOB_NAMEDCLASS) {
7456 if (ckWARN(WARN_REGEXP)) {
7458 RExC_parse >= rangebegin ?
7459 RExC_parse - rangebegin : 0;
7461 "False [] range \"%*.*s\"",
7465 ANYOF_BITMAP_SET(ret, '-');
7467 range = 1; /* yeah, it's a range! */
7468 continue; /* but do it the next time */
7472 /* now is the next time */
7473 /*stored += (value - prevvalue + 1);*/
7475 if (prevvalue < 256) {
7476 const IV ceilvalue = value < 256 ? value : 255;
7479 /* In EBCDIC [\x89-\x91] should include
7480 * the \x8e but [i-j] should not. */
7481 if (literal_endpoint == 2 &&
7482 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
7483 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
7485 if (isLOWER(prevvalue)) {
7486 for (i = prevvalue; i <= ceilvalue; i++)
7488 ANYOF_BITMAP_SET(ret, i);
7490 for (i = prevvalue; i <= ceilvalue; i++)
7492 ANYOF_BITMAP_SET(ret, i);
7497 for (i = prevvalue; i <= ceilvalue; i++) {
7498 if (!ANYOF_BITMAP_TEST(ret,i)) {
7500 ANYOF_BITMAP_SET(ret, i);
7504 if (value > 255 || UTF) {
7505 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
7506 const UV natvalue = NATIVE_TO_UNI(value);
7507 stored+=2; /* can't optimize this class */
7508 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
7509 if (prevnatvalue < natvalue) { /* what about > ? */
7510 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
7511 prevnatvalue, natvalue);
7513 else if (prevnatvalue == natvalue) {
7514 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
7516 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
7518 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
7520 #ifdef EBCDIC /* RD t/uni/fold ff and 6b */
7521 if (RExC_precomp[0] == ':' &&
7522 RExC_precomp[1] == '[' &&
7523 (f == 0xDF || f == 0x92)) {
7524 f = NATIVE_TO_UNI(f);
7527 /* If folding and foldable and a single
7528 * character, insert also the folded version
7529 * to the charclass. */
7531 #ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
7532 if ((RExC_precomp[0] == ':' &&
7533 RExC_precomp[1] == '[' &&
7535 (value == 0xFB05 || value == 0xFB06))) ?
7536 foldlen == ((STRLEN)UNISKIP(f) - 1) :
7537 foldlen == (STRLEN)UNISKIP(f) )
7539 if (foldlen == (STRLEN)UNISKIP(f))
7541 Perl_sv_catpvf(aTHX_ listsv,
7544 /* Any multicharacter foldings
7545 * require the following transform:
7546 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
7547 * where E folds into "pq" and F folds
7548 * into "rst", all other characters
7549 * fold to single characters. We save
7550 * away these multicharacter foldings,
7551 * to be later saved as part of the
7552 * additional "s" data. */
7555 if (!unicode_alternate)
7556 unicode_alternate = newAV();
7557 sv = newSVpvn((char*)foldbuf, foldlen);
7559 av_push(unicode_alternate, sv);
7563 /* If folding and the value is one of the Greek
7564 * sigmas insert a few more sigmas to make the
7565 * folding rules of the sigmas to work right.
7566 * Note that not all the possible combinations
7567 * are handled here: some of them are handled
7568 * by the standard folding rules, and some of
7569 * them (literal or EXACTF cases) are handled
7570 * during runtime in regexec.c:S_find_byclass(). */
7571 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
7572 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
7573 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
7574 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
7575 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
7577 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
7578 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
7579 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
7584 literal_endpoint = 0;
7588 range = 0; /* this range (if it was one) is done now */
7592 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
7594 RExC_size += ANYOF_CLASS_ADD_SKIP;
7596 RExC_emit += ANYOF_CLASS_ADD_SKIP;
7602 /****** !SIZE_ONLY AFTER HERE *********/
7604 if( stored == 1 && value < 256
7605 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
7607 /* optimize single char class to an EXACT node
7608 but *only* when its not a UTF/high char */
7609 const char * cur_parse= RExC_parse;
7610 RExC_emit = (regnode *)orig_emit;
7611 RExC_parse = (char *)orig_parse;
7612 ret = reg_node(pRExC_state,
7613 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
7614 RExC_parse = (char *)cur_parse;
7615 *STRING(ret)= (char)value;
7617 RExC_emit += STR_SZ(1);
7620 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
7621 if ( /* If the only flag is folding (plus possibly inversion). */
7622 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
7624 for (value = 0; value < 256; ++value) {
7625 if (ANYOF_BITMAP_TEST(ret, value)) {
7626 UV fold = PL_fold[value];
7629 ANYOF_BITMAP_SET(ret, fold);
7632 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
7635 /* optimize inverted simple patterns (e.g. [^a-z]) */
7636 if (optimize_invert &&
7637 /* If the only flag is inversion. */
7638 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
7639 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
7640 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
7641 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
7644 AV * const av = newAV();
7646 /* The 0th element stores the character class description
7647 * in its textual form: used later (regexec.c:Perl_regclass_swash())
7648 * to initialize the appropriate swash (which gets stored in
7649 * the 1st element), and also useful for dumping the regnode.
7650 * The 2nd element stores the multicharacter foldings,
7651 * used later (regexec.c:S_reginclass()). */
7652 av_store(av, 0, listsv);
7653 av_store(av, 1, NULL);
7654 av_store(av, 2, (SV*)unicode_alternate);
7655 rv = newRV_noinc((SV*)av);
7656 n = add_data(pRExC_state, 1, "s");
7657 RExC_rx->data->data[n] = (void*)rv;
7664 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
7666 char* const retval = RExC_parse++;
7669 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
7670 RExC_parse[2] == '#') {
7671 while (*RExC_parse != ')') {
7672 if (RExC_parse == RExC_end)
7673 FAIL("Sequence (?#... not terminated");
7679 if (RExC_flags & PMf_EXTENDED) {
7680 if (isSPACE(*RExC_parse)) {
7684 else if (*RExC_parse == '#') {
7685 while (RExC_parse < RExC_end)
7686 if (*RExC_parse++ == '\n') break;
7695 - reg_node - emit a node
7697 STATIC regnode * /* Location. */
7698 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
7701 register regnode *ptr;
7702 regnode * const ret = RExC_emit;
7703 GET_RE_DEBUG_FLAGS_DECL;
7706 SIZE_ALIGN(RExC_size);
7711 if (OP(RExC_emit) == 255)
7712 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %s: %d ",
7713 reg_name[op], OP(RExC_emit));
7715 NODE_ALIGN_FILL(ret);
7717 FILL_ADVANCE_NODE(ptr, op);
7718 if (RExC_offsets) { /* MJD */
7719 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
7720 "reg_node", __LINE__,
7722 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
7723 ? "Overwriting end of array!\n" : "OK",
7724 (UV)(RExC_emit - RExC_emit_start),
7725 (UV)(RExC_parse - RExC_start),
7726 (UV)RExC_offsets[0]));
7727 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
7735 - reganode - emit a node with an argument
7737 STATIC regnode * /* Location. */
7738 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
7741 register regnode *ptr;
7742 regnode * const ret = RExC_emit;
7743 GET_RE_DEBUG_FLAGS_DECL;
7746 SIZE_ALIGN(RExC_size);
7751 assert(2==regarglen[op]+1);
7753 Anything larger than this has to allocate the extra amount.
7754 If we changed this to be:
7756 RExC_size += (1 + regarglen[op]);
7758 then it wouldn't matter. Its not clear what side effect
7759 might come from that so its not done so far.
7765 if (OP(RExC_emit) == 255)
7766 Perl_croak(aTHX_ "panic: reganode overwriting end of allocated program space");
7768 NODE_ALIGN_FILL(ret);
7770 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
7771 if (RExC_offsets) { /* MJD */
7772 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
7776 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
7777 "Overwriting end of array!\n" : "OK",
7778 (UV)(RExC_emit - RExC_emit_start),
7779 (UV)(RExC_parse - RExC_start),
7780 (UV)RExC_offsets[0]));
7781 Set_Cur_Node_Offset;
7789 - reguni - emit (if appropriate) a Unicode character
7792 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
7795 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
7799 - reginsert - insert an operator in front of already-emitted operand
7801 * Means relocating the operand.
7804 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
7807 register regnode *src;
7808 register regnode *dst;
7809 register regnode *place;
7810 const int offset = regarglen[(U8)op];
7811 const int size = NODE_STEP_REGNODE + offset;
7812 GET_RE_DEBUG_FLAGS_DECL;
7813 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
7814 DEBUG_PARSE_FMT("inst"," - %s",reg_name[op]);
7823 if (RExC_open_parens) {
7825 DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);
7826 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
7827 if ( RExC_open_parens[paren] >= opnd ) {
7828 DEBUG_PARSE_FMT("open"," - %d",size);
7829 RExC_open_parens[paren] += size;
7831 DEBUG_PARSE_FMT("open"," - %s","ok");
7833 if ( RExC_close_parens[paren] >= opnd ) {
7834 DEBUG_PARSE_FMT("close"," - %d",size);
7835 RExC_close_parens[paren] += size;
7837 DEBUG_PARSE_FMT("close"," - %s","ok");
7842 while (src > opnd) {
7843 StructCopy(--src, --dst, regnode);
7844 if (RExC_offsets) { /* MJD 20010112 */
7845 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
7849 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
7850 ? "Overwriting end of array!\n" : "OK",
7851 (UV)(src - RExC_emit_start),
7852 (UV)(dst - RExC_emit_start),
7853 (UV)RExC_offsets[0]));
7854 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
7855 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
7860 place = opnd; /* Op node, where operand used to be. */
7861 if (RExC_offsets) { /* MJD */
7862 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
7866 (UV)(place - RExC_emit_start) > RExC_offsets[0]
7867 ? "Overwriting end of array!\n" : "OK",
7868 (UV)(place - RExC_emit_start),
7869 (UV)(RExC_parse - RExC_start),
7870 (UV)RExC_offsets[0]));
7871 Set_Node_Offset(place, RExC_parse);
7872 Set_Node_Length(place, 1);
7874 src = NEXTOPER(place);
7875 FILL_ADVANCE_NODE(place, op);
7876 Zero(src, offset, regnode);
7880 - regtail - set the next-pointer at the end of a node chain of p to val.
7881 - SEE ALSO: regtail_study
7883 /* TODO: All three parms should be const */
7885 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
7888 register regnode *scan;
7889 GET_RE_DEBUG_FLAGS_DECL;
7891 PERL_UNUSED_ARG(depth);
7897 /* Find last node. */
7900 regnode * const temp = regnext(scan);
7902 SV * const mysv=sv_newmortal();
7903 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
7904 regprop(RExC_rx, mysv, scan);
7905 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
7906 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
7907 (temp == NULL ? "->" : ""),
7908 (temp == NULL ? reg_name[OP(val)] : "")
7916 if (reg_off_by_arg[OP(scan)]) {
7917 ARG_SET(scan, val - scan);
7920 NEXT_OFF(scan) = val - scan;
7926 - regtail_study - set the next-pointer at the end of a node chain of p to val.
7927 - Look for optimizable sequences at the same time.
7928 - currently only looks for EXACT chains.
7930 This is expermental code. The idea is to use this routine to perform
7931 in place optimizations on branches and groups as they are constructed,
7932 with the long term intention of removing optimization from study_chunk so
7933 that it is purely analytical.
7935 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
7936 to control which is which.
7939 /* TODO: All four parms should be const */
7942 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
7945 register regnode *scan;
7947 #ifdef EXPERIMENTAL_INPLACESCAN
7951 GET_RE_DEBUG_FLAGS_DECL;
7957 /* Find last node. */
7961 regnode * const temp = regnext(scan);
7962 #ifdef EXPERIMENTAL_INPLACESCAN
7963 if (PL_regkind[OP(scan)] == EXACT)
7964 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
7972 if( exact == PSEUDO )
7974 else if ( exact != OP(scan) )
7983 SV * const mysv=sv_newmortal();
7984 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
7985 regprop(RExC_rx, mysv, scan);
7986 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
7987 SvPV_nolen_const(mysv),
7996 SV * const mysv_val=sv_newmortal();
7997 DEBUG_PARSE_MSG("");
7998 regprop(RExC_rx, mysv_val, val);
7999 PerlIO_printf(Perl_debug_log, "~ attach to %s (%d) offset to %d\n",
8000 SvPV_nolen_const(mysv_val),
8005 if (reg_off_by_arg[OP(scan)]) {
8006 ARG_SET(scan, val - scan);
8009 NEXT_OFF(scan) = val - scan;
8017 - regcurly - a little FSA that accepts {\d+,?\d*}
8020 S_regcurly(register const char *s)
8039 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
8042 Perl_regdump(pTHX_ const regexp *r)
8046 SV * const sv = sv_newmortal();
8047 SV *dsv= sv_newmortal();
8049 (void)dumpuntil(r, r->program, r->program + 1, NULL, NULL, sv, 0, 0);
8051 /* Header fields of interest. */
8052 if (r->anchored_substr) {
8053 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
8054 RE_SV_DUMPLEN(r->anchored_substr), 30);
8055 PerlIO_printf(Perl_debug_log,
8056 "anchored %s%s at %"IVdf" ",
8057 s, RE_SV_TAIL(r->anchored_substr),
8058 (IV)r->anchored_offset);
8059 } else if (r->anchored_utf8) {
8060 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
8061 RE_SV_DUMPLEN(r->anchored_utf8), 30);
8062 PerlIO_printf(Perl_debug_log,
8063 "anchored utf8 %s%s at %"IVdf" ",
8064 s, RE_SV_TAIL(r->anchored_utf8),
8065 (IV)r->anchored_offset);
8067 if (r->float_substr) {
8068 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
8069 RE_SV_DUMPLEN(r->float_substr), 30);
8070 PerlIO_printf(Perl_debug_log,
8071 "floating %s%s at %"IVdf"..%"UVuf" ",
8072 s, RE_SV_TAIL(r->float_substr),
8073 (IV)r->float_min_offset, (UV)r->float_max_offset);
8074 } else if (r->float_utf8) {
8075 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
8076 RE_SV_DUMPLEN(r->float_utf8), 30);
8077 PerlIO_printf(Perl_debug_log,
8078 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
8079 s, RE_SV_TAIL(r->float_utf8),
8080 (IV)r->float_min_offset, (UV)r->float_max_offset);
8082 if (r->check_substr || r->check_utf8)
8083 PerlIO_printf(Perl_debug_log,
8085 (r->check_substr == r->float_substr
8086 && r->check_utf8 == r->float_utf8
8087 ? "(checking floating" : "(checking anchored"));
8088 if (r->reganch & ROPT_NOSCAN)
8089 PerlIO_printf(Perl_debug_log, " noscan");
8090 if (r->reganch & ROPT_CHECK_ALL)
8091 PerlIO_printf(Perl_debug_log, " isall");
8092 if (r->check_substr || r->check_utf8)
8093 PerlIO_printf(Perl_debug_log, ") ");
8095 if (r->regstclass) {
8096 regprop(r, sv, r->regstclass);
8097 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
8099 if (r->reganch & ROPT_ANCH) {
8100 PerlIO_printf(Perl_debug_log, "anchored");
8101 if (r->reganch & ROPT_ANCH_BOL)
8102 PerlIO_printf(Perl_debug_log, "(BOL)");
8103 if (r->reganch & ROPT_ANCH_MBOL)
8104 PerlIO_printf(Perl_debug_log, "(MBOL)");
8105 if (r->reganch & ROPT_ANCH_SBOL)
8106 PerlIO_printf(Perl_debug_log, "(SBOL)");
8107 if (r->reganch & ROPT_ANCH_GPOS)
8108 PerlIO_printf(Perl_debug_log, "(GPOS)");
8109 PerlIO_putc(Perl_debug_log, ' ');
8111 if (r->reganch & ROPT_GPOS_SEEN)
8112 PerlIO_printf(Perl_debug_log, "GPOS ");
8113 if (r->reganch & ROPT_SKIP)
8114 PerlIO_printf(Perl_debug_log, "plus ");
8115 if (r->reganch & ROPT_IMPLICIT)
8116 PerlIO_printf(Perl_debug_log, "implicit ");
8117 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
8118 if (r->reganch & ROPT_EVAL_SEEN)
8119 PerlIO_printf(Perl_debug_log, "with eval ");
8120 PerlIO_printf(Perl_debug_log, "\n");
8122 PERL_UNUSED_CONTEXT;
8124 #endif /* DEBUGGING */
8128 - regprop - printable representation of opcode
8131 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
8136 GET_RE_DEBUG_FLAGS_DECL;
8138 sv_setpvn(sv, "", 0);
8140 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
8141 /* It would be nice to FAIL() here, but this may be called from
8142 regexec.c, and it would be hard to supply pRExC_state. */
8143 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
8144 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
8146 k = PL_regkind[OP(o)];
8149 SV * const dsv = sv_2mortal(newSVpvs(""));
8150 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
8151 * is a crude hack but it may be the best for now since
8152 * we have no flag "this EXACTish node was UTF-8"
8154 const char * const s =
8155 pv_pretty(dsv, STRING(o), STR_LEN(o), 60,
8156 PL_colors[0], PL_colors[1],
8157 PERL_PV_ESCAPE_UNI_DETECT |
8158 PERL_PV_PRETTY_ELIPSES |
8161 Perl_sv_catpvf(aTHX_ sv, " %s", s );
8162 } else if (k == TRIE) {
8163 /* print the details of the trie in dumpuntil instead, as
8164 * prog->data isn't available here */
8165 const char op = OP(o);
8166 const I32 n = ARG(o);
8167 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
8168 (reg_ac_data *)prog->data->data[n] :
8170 const reg_trie_data * const trie = !IS_TRIE_AC(op) ?
8171 (reg_trie_data*)prog->data->data[n] :
8174 Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]);
8175 DEBUG_TRIE_COMPILE_r(
8176 Perl_sv_catpvf(aTHX_ sv,
8177 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
8178 (UV)trie->startstate,
8179 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
8180 (UV)trie->wordcount,
8183 (UV)TRIE_CHARCOUNT(trie),
8184 (UV)trie->uniquecharcount
8187 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
8189 int rangestart = -1;
8190 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
8191 Perl_sv_catpvf(aTHX_ sv, "[");
8192 for (i = 0; i <= 256; i++) {
8193 if (i < 256 && BITMAP_TEST(bitmap,i)) {
8194 if (rangestart == -1)
8196 } else if (rangestart != -1) {
8197 if (i <= rangestart + 3)
8198 for (; rangestart < i; rangestart++)
8199 put_byte(sv, rangestart);
8201 put_byte(sv, rangestart);
8203 put_byte(sv, i - 1);
8208 Perl_sv_catpvf(aTHX_ sv, "]");
8211 } else if (k == CURLY) {
8212 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
8213 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
8214 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
8216 else if (k == WHILEM && o->flags) /* Ordinal/of */
8217 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
8218 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT)
8219 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
8220 else if (k == GOSUB)
8221 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
8222 else if (k == VERB) {
8224 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
8225 (SV*)prog->data->data[ ARG( o ) ]);
8226 } else if (k == LOGICAL)
8227 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
8228 else if (k == ANYOF) {
8229 int i, rangestart = -1;
8230 const U8 flags = ANYOF_FLAGS(o);
8232 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
8233 static const char * const anyofs[] = {
8266 if (flags & ANYOF_LOCALE)
8267 sv_catpvs(sv, "{loc}");
8268 if (flags & ANYOF_FOLD)
8269 sv_catpvs(sv, "{i}");
8270 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
8271 if (flags & ANYOF_INVERT)
8273 for (i = 0; i <= 256; i++) {
8274 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
8275 if (rangestart == -1)
8277 } else if (rangestart != -1) {
8278 if (i <= rangestart + 3)
8279 for (; rangestart < i; rangestart++)
8280 put_byte(sv, rangestart);
8282 put_byte(sv, rangestart);
8284 put_byte(sv, i - 1);
8290 if (o->flags & ANYOF_CLASS)
8291 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
8292 if (ANYOF_CLASS_TEST(o,i))
8293 sv_catpv(sv, anyofs[i]);
8295 if (flags & ANYOF_UNICODE)
8296 sv_catpvs(sv, "{unicode}");
8297 else if (flags & ANYOF_UNICODE_ALL)
8298 sv_catpvs(sv, "{unicode_all}");
8302 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
8306 U8 s[UTF8_MAXBYTES_CASE+1];
8308 for (i = 0; i <= 256; i++) { /* just the first 256 */
8309 uvchr_to_utf8(s, i);
8311 if (i < 256 && swash_fetch(sw, s, TRUE)) {
8312 if (rangestart == -1)
8314 } else if (rangestart != -1) {
8315 if (i <= rangestart + 3)
8316 for (; rangestart < i; rangestart++) {
8317 const U8 * const e = uvchr_to_utf8(s,rangestart);
8319 for(p = s; p < e; p++)
8323 const U8 *e = uvchr_to_utf8(s,rangestart);
8325 for (p = s; p < e; p++)
8328 e = uvchr_to_utf8(s, i-1);
8329 for (p = s; p < e; p++)
8336 sv_catpvs(sv, "..."); /* et cetera */
8340 char *s = savesvpv(lv);
8341 char * const origs = s;
8343 while (*s && *s != '\n')
8347 const char * const t = ++s;
8365 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
8367 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
8368 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
8370 PERL_UNUSED_CONTEXT;
8371 PERL_UNUSED_ARG(sv);
8373 PERL_UNUSED_ARG(prog);
8374 #endif /* DEBUGGING */
8378 Perl_re_intuit_string(pTHX_ regexp *prog)
8379 { /* Assume that RE_INTUIT is set */
8381 GET_RE_DEBUG_FLAGS_DECL;
8382 PERL_UNUSED_CONTEXT;
8386 const char * const s = SvPV_nolen_const(prog->check_substr
8387 ? prog->check_substr : prog->check_utf8);
8389 if (!PL_colorset) reginitcolors();
8390 PerlIO_printf(Perl_debug_log,
8391 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
8393 prog->check_substr ? "" : "utf8 ",
8394 PL_colors[5],PL_colors[0],
8397 (strlen(s) > 60 ? "..." : ""));
8400 return prog->check_substr ? prog->check_substr : prog->check_utf8;
8404 pregfree - free a regexp
8406 See regdupe below if you change anything here.
8410 Perl_pregfree(pTHX_ struct regexp *r)
8414 GET_RE_DEBUG_FLAGS_DECL;
8416 if (!r || (--r->refcnt > 0))
8422 SV *dsv= sv_newmortal();
8423 RE_PV_QUOTED_DECL(s, (r->reganch & ROPT_UTF8),
8424 dsv, r->precomp, r->prelen, 60);
8425 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
8426 PL_colors[4],PL_colors[5],s);
8430 /* gcov results gave these as non-null 100% of the time, so there's no
8431 optimisation in checking them before calling Safefree */
8432 Safefree(r->precomp);
8433 Safefree(r->offsets); /* 20010421 MJD */
8434 RX_MATCH_COPY_FREE(r);
8435 #ifdef PERL_OLD_COPY_ON_WRITE
8437 SvREFCNT_dec(r->saved_copy);
8440 if (r->anchored_substr)
8441 SvREFCNT_dec(r->anchored_substr);
8442 if (r->anchored_utf8)
8443 SvREFCNT_dec(r->anchored_utf8);
8444 if (r->float_substr)
8445 SvREFCNT_dec(r->float_substr);
8447 SvREFCNT_dec(r->float_utf8);
8448 Safefree(r->substrs);
8451 SvREFCNT_dec(r->paren_names);
8453 int n = r->data->count;
8454 PAD* new_comppad = NULL;
8459 /* If you add a ->what type here, update the comment in regcomp.h */
8460 switch (r->data->what[n]) {
8463 SvREFCNT_dec((SV*)r->data->data[n]);
8466 Safefree(r->data->data[n]);
8469 new_comppad = (AV*)r->data->data[n];
8472 if (new_comppad == NULL)
8473 Perl_croak(aTHX_ "panic: pregfree comppad");
8474 PAD_SAVE_LOCAL(old_comppad,
8475 /* Watch out for global destruction's random ordering. */
8476 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
8479 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
8482 op_free((OP_4tree*)r->data->data[n]);
8484 PAD_RESTORE_LOCAL(old_comppad);
8485 SvREFCNT_dec((SV*)new_comppad);
8491 { /* Aho Corasick add-on structure for a trie node.
8492 Used in stclass optimization only */
8494 reg_ac_data *aho=(reg_ac_data*)r->data->data[n];
8496 refcount = --aho->refcount;
8499 Safefree(aho->states);
8500 Safefree(aho->fail);
8501 aho->trie=NULL; /* not necessary to free this as it is
8502 handled by the 't' case */
8503 Safefree(r->data->data[n]); /* do this last!!!! */
8504 Safefree(r->regstclass);
8510 /* trie structure. */
8512 reg_trie_data *trie=(reg_trie_data*)r->data->data[n];
8514 refcount = --trie->refcount;
8517 Safefree(trie->charmap);
8518 if (trie->widecharmap)
8519 SvREFCNT_dec((SV*)trie->widecharmap);
8520 Safefree(trie->states);
8521 Safefree(trie->trans);
8523 Safefree(trie->bitmap);
8525 Safefree(trie->wordlen);
8527 Safefree(trie->jump);
8529 Safefree(trie->nextword);
8532 SvREFCNT_dec((SV*)trie->words);
8533 if (trie->revcharmap)
8534 SvREFCNT_dec((SV*)trie->revcharmap);
8536 Safefree(r->data->data[n]); /* do this last!!!! */
8541 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
8544 Safefree(r->data->what);
8547 Safefree(r->startp);
8550 Safefree(r->swap->startp);
8551 Safefree(r->swap->endp);
8557 #define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t))
8558 #define av_dup_inc(s,t) (AV*)SvREFCNT_inc(sv_dup((SV*)s,t))
8559 #define hv_dup_inc(s,t) (HV*)SvREFCNT_inc(sv_dup((SV*)s,t))
8560 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
8563 regdupe - duplicate a regexp.
8565 This routine is called by sv.c's re_dup and is expected to clone a
8566 given regexp structure. It is a no-op when not under USE_ITHREADS.
8567 (Originally this *was* re_dup() for change history see sv.c)
8569 See pregfree() above if you change anything here.
8571 #if defined(USE_ITHREADS)
8573 Perl_regdupe(pTHX_ const regexp *r, CLONE_PARAMS *param)
8578 struct reg_substr_datum *s;
8581 return (REGEXP *)NULL;
8583 if ((ret = (REGEXP *)ptr_table_fetch(PL_ptr_table, r)))
8586 len = r->offsets[0];
8587 npar = r->nparens+1;
8589 Newxc(ret, sizeof(regexp) + (len+1)*sizeof(regnode), char, regexp);
8590 Copy(r->program, ret->program, len+1, regnode);
8592 Newx(ret->startp, npar, I32);
8593 Copy(r->startp, ret->startp, npar, I32);
8594 Newx(ret->endp, npar, I32);
8595 Copy(r->startp, ret->startp, npar, I32);
8597 Newx(ret->swap, 1, regexp_paren_ofs);
8598 /* no need to copy these */
8599 Newx(ret->swap->startp, npar, I32);
8600 Newx(ret->swap->endp, npar, I32);
8605 Newx(ret->substrs, 1, struct reg_substr_data);
8606 for (s = ret->substrs->data, i = 0; i < 3; i++, s++) {
8607 s->min_offset = r->substrs->data[i].min_offset;
8608 s->max_offset = r->substrs->data[i].max_offset;
8609 s->end_shift = r->substrs->data[i].end_shift;
8610 s->substr = sv_dup_inc(r->substrs->data[i].substr, param);
8611 s->utf8_substr = sv_dup_inc(r->substrs->data[i].utf8_substr, param);
8614 ret->regstclass = NULL;
8617 const int count = r->data->count;
8620 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
8621 char, struct reg_data);
8622 Newx(d->what, count, U8);
8625 for (i = 0; i < count; i++) {
8626 d->what[i] = r->data->what[i];
8627 switch (d->what[i]) {
8628 /* legal options are one of: sSfpont
8629 see also regcomp.h and pregfree() */
8632 d->data[i] = sv_dup_inc((SV *)r->data->data[i], param);
8635 d->data[i] = av_dup_inc((AV *)r->data->data[i], param);
8638 /* This is cheating. */
8639 Newx(d->data[i], 1, struct regnode_charclass_class);
8640 StructCopy(r->data->data[i], d->data[i],
8641 struct regnode_charclass_class);
8642 ret->regstclass = (regnode*)d->data[i];
8645 /* Compiled op trees are readonly, and can thus be
8646 shared without duplication. */
8648 d->data[i] = (void*)OpREFCNT_inc((OP*)r->data->data[i]);
8652 d->data[i] = r->data->data[i];
8655 d->data[i] = r->data->data[i];
8657 ((reg_trie_data*)d->data[i])->refcount++;
8661 d->data[i] = r->data->data[i];
8663 ((reg_ac_data*)d->data[i])->refcount++;
8665 /* Trie stclasses are readonly and can thus be shared
8666 * without duplication. We free the stclass in pregfree
8667 * when the corresponding reg_ac_data struct is freed.
8669 ret->regstclass= r->regstclass;
8672 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", r->data->what[i]);
8681 Newx(ret->offsets, 2*len+1, U32);
8682 Copy(r->offsets, ret->offsets, 2*len+1, U32);
8684 ret->precomp = SAVEPVN(r->precomp, r->prelen);
8685 ret->refcnt = r->refcnt;
8686 ret->minlen = r->minlen;
8687 ret->minlenret = r->minlenret;
8688 ret->prelen = r->prelen;
8689 ret->nparens = r->nparens;
8690 ret->lastparen = r->lastparen;
8691 ret->lastcloseparen = r->lastcloseparen;
8692 ret->reganch = r->reganch;
8694 ret->sublen = r->sublen;
8696 ret->engine = r->engine;
8698 ret->paren_names = hv_dup_inc(r->paren_names, param);
8700 if (RX_MATCH_COPIED(ret))
8701 ret->subbeg = SAVEPVN(r->subbeg, r->sublen);
8704 #ifdef PERL_OLD_COPY_ON_WRITE
8705 ret->saved_copy = NULL;
8708 ptr_table_store(PL_ptr_table, r, ret);
8716 converts a regexp embedded in a MAGIC struct to its stringified form,
8717 caching the converted form in the struct and returns the cached
8720 If lp is nonnull then it is used to return the length of the
8723 If flags is nonnull and the returned string contains UTF8 then
8724 (flags & 1) will be true.
8726 If haseval is nonnull then it is used to return whether the pattern
8729 Normally called via macro:
8731 CALLREG_STRINGIFY(mg,0,0);
8735 CALLREG_AS_STR(mg,lp,flags,haseval)
8737 See sv_2pv_flags() in sv.c for an example of internal usage.
8742 Perl_reg_stringify(pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval ) {
8744 const regexp * const re = (regexp *)mg->mg_obj;
8747 const char *fptr = "msix";
8752 bool need_newline = 0;
8753 U16 reganch = (U16)((re->reganch & PMf_COMPILETIME) >> 12);
8755 while((ch = *fptr++)) {
8757 reflags[left++] = ch;
8760 reflags[right--] = ch;
8765 reflags[left] = '-';
8769 mg->mg_len = re->prelen + 4 + left;
8771 * If /x was used, we have to worry about a regex ending with a
8772 * comment later being embedded within another regex. If so, we don't
8773 * want this regex's "commentization" to leak out to the right part of
8774 * the enclosing regex, we must cap it with a newline.
8776 * So, if /x was used, we scan backwards from the end of the regex. If
8777 * we find a '#' before we find a newline, we need to add a newline
8778 * ourself. If we find a '\n' first (or if we don't find '#' or '\n'),
8779 * we don't need to add anything. -jfriedl
8781 if (PMf_EXTENDED & re->reganch) {
8782 const char *endptr = re->precomp + re->prelen;
8783 while (endptr >= re->precomp) {
8784 const char c = *(endptr--);
8786 break; /* don't need another */
8788 /* we end while in a comment, so we need a newline */
8789 mg->mg_len++; /* save space for it */
8790 need_newline = 1; /* note to add it */
8796 Newx(mg->mg_ptr, mg->mg_len + 1 + left, char);
8797 mg->mg_ptr[0] = '(';
8798 mg->mg_ptr[1] = '?';
8799 Copy(reflags, mg->mg_ptr+2, left, char);
8800 *(mg->mg_ptr+left+2) = ':';
8801 Copy(re->precomp, mg->mg_ptr+3+left, re->prelen, char);
8803 mg->mg_ptr[mg->mg_len - 2] = '\n';
8804 mg->mg_ptr[mg->mg_len - 1] = ')';
8805 mg->mg_ptr[mg->mg_len] = 0;
8808 *haseval = re->program[0].next_off;
8810 *flags = ((re->reganch & ROPT_UTF8) ? 1 : 0);
8818 #ifndef PERL_IN_XSUB_RE
8820 - regnext - dig the "next" pointer out of a node
8823 Perl_regnext(pTHX_ register regnode *p)
8826 register I32 offset;
8828 if (p == &PL_regdummy)
8831 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
8840 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
8843 STRLEN l1 = strlen(pat1);
8844 STRLEN l2 = strlen(pat2);
8847 const char *message;
8853 Copy(pat1, buf, l1 , char);
8854 Copy(pat2, buf + l1, l2 , char);
8855 buf[l1 + l2] = '\n';
8856 buf[l1 + l2 + 1] = '\0';
8858 /* ANSI variant takes additional second argument */
8859 va_start(args, pat2);
8863 msv = vmess(buf, &args);
8865 message = SvPV_const(msv,l1);
8868 Copy(message, buf, l1 , char);
8869 buf[l1-1] = '\0'; /* Overwrite \n */
8870 Perl_croak(aTHX_ "%s", buf);
8873 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
8875 #ifndef PERL_IN_XSUB_RE
8877 Perl_save_re_context(pTHX)
8881 struct re_save_state *state;
8883 SAVEVPTR(PL_curcop);
8884 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
8886 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
8887 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
8888 SSPUSHINT(SAVEt_RE_STATE);
8890 Copy(&PL_reg_state, state, 1, struct re_save_state);
8892 PL_reg_start_tmp = 0;
8893 PL_reg_start_tmpl = 0;
8894 PL_reg_oldsaved = NULL;
8895 PL_reg_oldsavedlen = 0;
8897 PL_reg_leftiter = 0;
8898 PL_reg_poscache = NULL;
8899 PL_reg_poscache_size = 0;
8900 #ifdef PERL_OLD_COPY_ON_WRITE
8904 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
8906 const REGEXP * const rx = PM_GETRE(PL_curpm);
8909 for (i = 1; i <= rx->nparens; i++) {
8910 char digits[TYPE_CHARS(long)];
8911 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
8912 GV *const *const gvp
8913 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
8916 GV * const gv = *gvp;
8917 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
8927 clear_re(pTHX_ void *r)
8930 ReREFCNT_dec((regexp *)r);
8936 S_put_byte(pTHX_ SV *sv, int c)
8938 if (isCNTRL(c) || c == 255 || !isPRINT(c))
8939 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
8940 else if (c == '-' || c == ']' || c == '\\' || c == '^')
8941 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
8943 Perl_sv_catpvf(aTHX_ sv, "%c", c);
8947 #define CLEAR_OPTSTART \
8948 if (optstart) STMT_START { \
8949 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%d nodes)\n", node - optstart)); \
8953 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
8955 STATIC const regnode *
8956 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
8957 const regnode *last, const regnode *plast,
8958 SV* sv, I32 indent, U32 depth)
8961 register U8 op = PSEUDO; /* Arbitrary non-END op. */
8962 register const regnode *next;
8963 const regnode *optstart= NULL;
8964 GET_RE_DEBUG_FLAGS_DECL;
8966 #ifdef DEBUG_DUMPUNTIL
8967 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
8968 last ? last-start : 0,plast ? plast-start : 0);
8971 if (plast && plast < last)
8974 while (PL_regkind[op] != END && (!last || node < last)) {
8975 /* While that wasn't END last time... */
8981 next = regnext((regnode *)node);
8984 if (OP(node) == OPTIMIZED) {
8985 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
8992 regprop(r, sv, node);
8993 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
8994 (int)(2*indent + 1), "", SvPVX_const(sv));
8996 if (OP(node) != OPTIMIZED) {
8997 if (next == NULL) /* Next ptr. */
8998 PerlIO_printf(Perl_debug_log, "(0)");
8999 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
9000 PerlIO_printf(Perl_debug_log, "(FAIL)");
9002 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
9004 /*if (PL_regkind[(U8)op] != TRIE)*/
9005 (void)PerlIO_putc(Perl_debug_log, '\n');
9009 if (PL_regkind[(U8)op] == BRANCHJ) {
9012 register const regnode *nnode = (OP(next) == LONGJMP
9013 ? regnext((regnode *)next)
9015 if (last && nnode > last)
9017 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
9020 else if (PL_regkind[(U8)op] == BRANCH) {
9022 DUMPUNTIL(NEXTOPER(node), next);
9024 else if ( PL_regkind[(U8)op] == TRIE ) {
9025 const regnode *this_trie = node;
9026 const char op = OP(node);
9027 const I32 n = ARG(node);
9028 const reg_ac_data * const ac = op>=AHOCORASICK ?
9029 (reg_ac_data *)r->data->data[n] :
9031 const reg_trie_data * const trie = op<AHOCORASICK ?
9032 (reg_trie_data*)r->data->data[n] :
9034 const regnode *nextbranch= NULL;
9036 sv_setpvn(sv, "", 0);
9037 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
9038 SV ** const elem_ptr = av_fetch(trie->words,word_idx,0);
9040 PerlIO_printf(Perl_debug_log, "%*s%s ",
9041 (int)(2*(indent+3)), "",
9042 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
9043 PL_colors[0], PL_colors[1],
9044 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
9045 PERL_PV_PRETTY_ELIPSES |
9051 U16 dist= trie->jump[word_idx+1];
9052 PerlIO_printf(Perl_debug_log, "(%u)\n",
9053 (dist ? this_trie + dist : next) - start);
9056 nextbranch= this_trie + trie->jump[0];
9057 DUMPUNTIL(this_trie + dist, nextbranch);
9059 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
9060 nextbranch= regnext((regnode *)nextbranch);
9062 PerlIO_printf(Perl_debug_log, "\n");
9065 if (last && next > last)
9070 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
9071 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
9072 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
9074 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
9076 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
9078 else if ( op == PLUS || op == STAR) {
9079 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
9081 else if (op == ANYOF) {
9082 /* arglen 1 + class block */
9083 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
9084 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
9085 node = NEXTOPER(node);
9087 else if (PL_regkind[(U8)op] == EXACT) {
9088 /* Literal string, where present. */
9089 node += NODE_SZ_STR(node) - 1;
9090 node = NEXTOPER(node);
9093 node = NEXTOPER(node);
9094 node += regarglen[(U8)op];
9096 if (op == CURLYX || op == OPEN)
9098 else if (op == WHILEM)
9102 #ifdef DEBUG_DUMPUNTIL
9103 PerlIO_printf(Perl_debug_log, "--- %d\n",indent);
9108 #endif /* DEBUGGING */
9112 * c-indentation-style: bsd
9114 * indent-tabs-mode: t
9117 * ex: set ts=8 sts=4 sw=4 noet: