5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
8 /* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
37 * pregcomp and pregexec -- regsub and regerror are not used in perl
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
57 **** Alterations to Henry's code are...
59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
71 #define PERL_IN_REGCOMP_C
74 #ifndef PERL_IN_XSUB_RE
79 #ifdef PERL_IN_XSUB_RE
90 # if defined(BUGGY_MSC6)
91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
92 # pragma optimize("a",off)
93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
94 # pragma optimize("w",on )
95 # endif /* BUGGY_MSC6 */
102 typedef struct RExC_state_t {
103 U32 flags; /* are we folding, multilining? */
104 char *precomp; /* uncompiled string. */
106 char *start; /* Start of input for compile */
107 char *end; /* End of input for compile */
108 char *parse; /* Input-scan pointer. */
109 I32 whilem_seen; /* number of WHILEM in this expr */
110 regnode *emit_start; /* Start of emitted-code area */
111 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
112 I32 naughty; /* How bad is this pattern? */
113 I32 sawback; /* Did we see \1, ...? */
115 I32 size; /* Code size. */
116 I32 npar; /* Capture buffer count, (OPEN). */
117 I32 cpar; /* Capture buffer count, (CLOSE). */
118 I32 nestroot; /* root parens we are in - used by accept */
122 regnode **open_parens; /* pointers to open parens */
123 regnode **close_parens; /* pointers to close parens */
124 regnode *opend; /* END node in program */
126 HV *charnames; /* cache of named sequences */
127 HV *paren_names; /* Paren names */
128 regnode **recurse; /* Recurse regops */
129 I32 recurse_count; /* Number of recurse regops */
131 char *starttry; /* -Dr: where regtry was called. */
132 #define RExC_starttry (pRExC_state->starttry)
135 const char *lastparse;
137 #define RExC_lastparse (pRExC_state->lastparse)
138 #define RExC_lastnum (pRExC_state->lastnum)
142 #define RExC_flags (pRExC_state->flags)
143 #define RExC_precomp (pRExC_state->precomp)
144 #define RExC_rx (pRExC_state->rx)
145 #define RExC_start (pRExC_state->start)
146 #define RExC_end (pRExC_state->end)
147 #define RExC_parse (pRExC_state->parse)
148 #define RExC_whilem_seen (pRExC_state->whilem_seen)
149 #define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
150 #define RExC_emit (pRExC_state->emit)
151 #define RExC_emit_start (pRExC_state->emit_start)
152 #define RExC_naughty (pRExC_state->naughty)
153 #define RExC_sawback (pRExC_state->sawback)
154 #define RExC_seen (pRExC_state->seen)
155 #define RExC_size (pRExC_state->size)
156 #define RExC_npar (pRExC_state->npar)
157 #define RExC_cpar (pRExC_state->cpar)
158 #define RExC_nestroot (pRExC_state->nestroot)
159 #define RExC_extralen (pRExC_state->extralen)
160 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
161 #define RExC_seen_evals (pRExC_state->seen_evals)
162 #define RExC_utf8 (pRExC_state->utf8)
163 #define RExC_charnames (pRExC_state->charnames)
164 #define RExC_open_parens (pRExC_state->open_parens)
165 #define RExC_close_parens (pRExC_state->close_parens)
166 #define RExC_opend (pRExC_state->opend)
167 #define RExC_paren_names (pRExC_state->paren_names)
168 #define RExC_recurse (pRExC_state->recurse)
169 #define RExC_recurse_count (pRExC_state->recurse_count)
171 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
172 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
173 ((*s) == '{' && regcurly(s)))
176 #undef SPSTART /* dratted cpp namespace... */
179 * Flags to be passed up and down.
181 #define WORST 0 /* Worst case. */
182 #define HASWIDTH 0x1 /* Known to match non-null strings. */
183 #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
184 #define SPSTART 0x4 /* Starts with * or +. */
185 #define TRYAGAIN 0x8 /* Weeded out a declaration. */
187 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
189 /* whether trie related optimizations are enabled */
190 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
191 #define TRIE_STUDY_OPT
192 #define FULL_TRIE_STUDY
198 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
199 #define PBITVAL(paren) (1 << ((paren) & 7))
200 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
201 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
202 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
205 /* About scan_data_t.
207 During optimisation we recurse through the regexp program performing
208 various inplace (keyhole style) optimisations. In addition study_chunk
209 and scan_commit populate this data structure with information about
210 what strings MUST appear in the pattern. We look for the longest
211 string that must appear for at a fixed location, and we look for the
212 longest string that may appear at a floating location. So for instance
217 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
218 strings (because they follow a .* construct). study_chunk will identify
219 both FOO and BAR as being the longest fixed and floating strings respectively.
221 The strings can be composites, for instance
225 will result in a composite fixed substring 'foo'.
227 For each string some basic information is maintained:
229 - offset or min_offset
230 This is the position the string must appear at, or not before.
231 It also implicitly (when combined with minlenp) tells us how many
232 character must match before the string we are searching.
233 Likewise when combined with minlenp and the length of the string
234 tells us how many characters must appear after the string we have
238 Only used for floating strings. This is the rightmost point that
239 the string can appear at. Ifset to I32 max it indicates that the
240 string can occur infinitely far to the right.
243 A pointer to the minimum length of the pattern that the string
244 was found inside. This is important as in the case of positive
245 lookahead or positive lookbehind we can have multiple patterns
250 The minimum length of the pattern overall is 3, the minimum length
251 of the lookahead part is 3, but the minimum length of the part that
252 will actually match is 1. So 'FOO's minimum length is 3, but the
253 minimum length for the F is 1. This is important as the minimum length
254 is used to determine offsets in front of and behind the string being
255 looked for. Since strings can be composites this is the length of the
256 pattern at the time it was commited with a scan_commit. Note that
257 the length is calculated by study_chunk, so that the minimum lengths
258 are not known until the full pattern has been compiled, thus the
259 pointer to the value.
263 In the case of lookbehind the string being searched for can be
264 offset past the start point of the final matching string.
265 If this value was just blithely removed from the min_offset it would
266 invalidate some of the calculations for how many chars must match
267 before or after (as they are derived from min_offset and minlen and
268 the length of the string being searched for).
269 When the final pattern is compiled and the data is moved from the
270 scan_data_t structure into the regexp structure the information
271 about lookbehind is factored in, with the information that would
272 have been lost precalculated in the end_shift field for the
275 The fields pos_min and pos_delta are used to store the minimum offset
276 and the delta to the maximum offset at the current point in the pattern.
280 typedef struct scan_data_t {
281 /*I32 len_min; unused */
282 /*I32 len_delta; unused */
286 I32 last_end; /* min value, <0 unless valid. */
289 SV **longest; /* Either &l_fixed, or &l_float. */
290 SV *longest_fixed; /* longest fixed string found in pattern */
291 I32 offset_fixed; /* offset where it starts */
292 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
293 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
294 SV *longest_float; /* longest floating string found in pattern */
295 I32 offset_float_min; /* earliest point in string it can appear */
296 I32 offset_float_max; /* latest point in string it can appear */
297 I32 *minlen_float; /* pointer to the minlen relevent to the string */
298 I32 lookbehind_float; /* is the position of the string modified by LB */
302 struct regnode_charclass_class *start_class;
306 * Forward declarations for pregcomp()'s friends.
309 static const scan_data_t zero_scan_data =
310 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
312 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
313 #define SF_BEFORE_SEOL 0x0001
314 #define SF_BEFORE_MEOL 0x0002
315 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
316 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
319 # define SF_FIX_SHIFT_EOL (0+2)
320 # define SF_FL_SHIFT_EOL (0+4)
322 # define SF_FIX_SHIFT_EOL (+2)
323 # define SF_FL_SHIFT_EOL (+4)
326 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
327 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
329 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
330 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
331 #define SF_IS_INF 0x0040
332 #define SF_HAS_PAR 0x0080
333 #define SF_IN_PAR 0x0100
334 #define SF_HAS_EVAL 0x0200
335 #define SCF_DO_SUBSTR 0x0400
336 #define SCF_DO_STCLASS_AND 0x0800
337 #define SCF_DO_STCLASS_OR 0x1000
338 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
339 #define SCF_WHILEM_VISITED_POS 0x2000
341 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
342 #define SCF_SEEN_ACCEPT 0x8000
344 #define UTF (RExC_utf8 != 0)
345 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
346 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
348 #define OOB_UNICODE 12345678
349 #define OOB_NAMEDCLASS -1
351 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
352 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
355 /* length of regex to show in messages that don't mark a position within */
356 #define RegexLengthToShowInErrorMessages 127
359 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
360 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
361 * op/pragma/warn/regcomp.
363 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
364 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
366 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
369 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
370 * arg. Show regex, up to a maximum length. If it's too long, chop and add
373 #define FAIL(msg) STMT_START { \
374 const char *ellipses = ""; \
375 IV len = RExC_end - RExC_precomp; \
378 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
379 if (len > RegexLengthToShowInErrorMessages) { \
380 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
381 len = RegexLengthToShowInErrorMessages - 10; \
384 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
385 msg, (int)len, RExC_precomp, ellipses); \
389 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
391 #define Simple_vFAIL(m) STMT_START { \
392 const IV offset = RExC_parse - RExC_precomp; \
393 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
394 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
398 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
400 #define vFAIL(m) STMT_START { \
402 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
407 * Like Simple_vFAIL(), but accepts two arguments.
409 #define Simple_vFAIL2(m,a1) STMT_START { \
410 const IV offset = RExC_parse - RExC_precomp; \
411 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
412 (int)offset, RExC_precomp, RExC_precomp + offset); \
416 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
418 #define vFAIL2(m,a1) STMT_START { \
420 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
421 Simple_vFAIL2(m, a1); \
426 * Like Simple_vFAIL(), but accepts three arguments.
428 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
429 const IV offset = RExC_parse - RExC_precomp; \
430 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
431 (int)offset, RExC_precomp, RExC_precomp + offset); \
435 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
437 #define vFAIL3(m,a1,a2) STMT_START { \
439 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
440 Simple_vFAIL3(m, a1, a2); \
444 * Like Simple_vFAIL(), but accepts four arguments.
446 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
447 const IV offset = RExC_parse - RExC_precomp; \
448 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
449 (int)offset, RExC_precomp, RExC_precomp + offset); \
452 #define vWARN(loc,m) STMT_START { \
453 const IV offset = loc - RExC_precomp; \
454 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
455 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
458 #define vWARNdep(loc,m) STMT_START { \
459 const IV offset = loc - RExC_precomp; \
460 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
461 "%s" REPORT_LOCATION, \
462 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
466 #define vWARN2(loc, m, a1) STMT_START { \
467 const IV offset = loc - RExC_precomp; \
468 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
469 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
472 #define vWARN3(loc, m, a1, a2) STMT_START { \
473 const IV offset = loc - RExC_precomp; \
474 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
475 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
478 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
479 const IV offset = loc - RExC_precomp; \
480 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
481 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
484 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
485 const IV offset = loc - RExC_precomp; \
486 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
487 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
491 /* Allow for side effects in s */
492 #define REGC(c,s) STMT_START { \
493 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
496 /* Macros for recording node offsets. 20001227 mjd@plover.com
497 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
498 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
499 * Element 0 holds the number n.
500 * Position is 1 indexed.
503 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
505 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
506 __LINE__, (int)(node), (int)(byte))); \
508 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
510 RExC_offsets[2*(node)-1] = (byte); \
515 #define Set_Node_Offset(node,byte) \
516 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
517 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
519 #define Set_Node_Length_To_R(node,len) STMT_START { \
521 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
522 __LINE__, (int)(node), (int)(len))); \
524 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
526 RExC_offsets[2*(node)] = (len); \
531 #define Set_Node_Length(node,len) \
532 Set_Node_Length_To_R((node)-RExC_emit_start, len)
533 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
534 #define Set_Node_Cur_Length(node) \
535 Set_Node_Length(node, RExC_parse - parse_start)
537 /* Get offsets and lengths */
538 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
539 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
541 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
542 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
543 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
547 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
548 #define EXPERIMENTAL_INPLACESCAN
551 #define DEBUG_STUDYDATA(data,depth) \
552 DEBUG_OPTIMISE_MORE_r(if(data){ \
553 PerlIO_printf(Perl_debug_log, \
554 "%*s"/* Len:%"IVdf"/%"IVdf" */" Pos:%"IVdf"/%"IVdf \
555 " Flags: %"IVdf" Whilem_c: %"IVdf" Lcp: %"IVdf" ", \
556 (int)(depth)*2, "", \
557 (IV)((data)->pos_min), \
558 (IV)((data)->pos_delta), \
559 (IV)((data)->flags), \
560 (IV)((data)->whilem_c), \
561 (IV)((data)->last_closep ? *((data)->last_closep) : -1) \
563 if ((data)->last_found) \
564 PerlIO_printf(Perl_debug_log, \
565 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
566 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
567 SvPVX_const((data)->last_found), \
568 (IV)((data)->last_end), \
569 (IV)((data)->last_start_min), \
570 (IV)((data)->last_start_max), \
571 ((data)->longest && \
572 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
573 SvPVX_const((data)->longest_fixed), \
574 (IV)((data)->offset_fixed), \
575 ((data)->longest && \
576 (data)->longest==&((data)->longest_float)) ? "*" : "", \
577 SvPVX_const((data)->longest_float), \
578 (IV)((data)->offset_float_min), \
579 (IV)((data)->offset_float_max) \
581 PerlIO_printf(Perl_debug_log,"\n"); \
584 static void clear_re(pTHX_ void *r);
586 /* Mark that we cannot extend a found fixed substring at this point.
587 Update the longest found anchored substring and the longest found
588 floating substrings if needed. */
591 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp)
593 const STRLEN l = CHR_SVLEN(data->last_found);
594 const STRLEN old_l = CHR_SVLEN(*data->longest);
595 GET_RE_DEBUG_FLAGS_DECL;
597 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
598 SvSetMagicSV(*data->longest, data->last_found);
599 if (*data->longest == data->longest_fixed) {
600 data->offset_fixed = l ? data->last_start_min : data->pos_min;
601 if (data->flags & SF_BEFORE_EOL)
603 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
605 data->flags &= ~SF_FIX_BEFORE_EOL;
606 data->minlen_fixed=minlenp;
607 data->lookbehind_fixed=0;
610 data->offset_float_min = l ? data->last_start_min : data->pos_min;
611 data->offset_float_max = (l
612 ? data->last_start_max
613 : data->pos_min + data->pos_delta);
614 if ((U32)data->offset_float_max > (U32)I32_MAX)
615 data->offset_float_max = I32_MAX;
616 if (data->flags & SF_BEFORE_EOL)
618 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
620 data->flags &= ~SF_FL_BEFORE_EOL;
621 data->minlen_float=minlenp;
622 data->lookbehind_float=0;
625 SvCUR_set(data->last_found, 0);
627 SV * const sv = data->last_found;
628 if (SvUTF8(sv) && SvMAGICAL(sv)) {
629 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
635 data->flags &= ~SF_BEFORE_EOL;
636 DEBUG_STUDYDATA(data,0);
639 /* Can match anything (initialization) */
641 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
643 ANYOF_CLASS_ZERO(cl);
644 ANYOF_BITMAP_SETALL(cl);
645 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
647 cl->flags |= ANYOF_LOCALE;
650 /* Can match anything (initialization) */
652 S_cl_is_anything(const struct regnode_charclass_class *cl)
656 for (value = 0; value <= ANYOF_MAX; value += 2)
657 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
659 if (!(cl->flags & ANYOF_UNICODE_ALL))
661 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
666 /* Can match anything (initialization) */
668 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
670 Zero(cl, 1, struct regnode_charclass_class);
672 cl_anything(pRExC_state, cl);
676 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
678 Zero(cl, 1, struct regnode_charclass_class);
680 cl_anything(pRExC_state, cl);
682 cl->flags |= ANYOF_LOCALE;
685 /* 'And' a given class with another one. Can create false positives */
686 /* We assume that cl is not inverted */
688 S_cl_and(struct regnode_charclass_class *cl,
689 const struct regnode_charclass_class *and_with)
692 assert(and_with->type == ANYOF);
693 if (!(and_with->flags & ANYOF_CLASS)
694 && !(cl->flags & ANYOF_CLASS)
695 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
696 && !(and_with->flags & ANYOF_FOLD)
697 && !(cl->flags & ANYOF_FOLD)) {
700 if (and_with->flags & ANYOF_INVERT)
701 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
702 cl->bitmap[i] &= ~and_with->bitmap[i];
704 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
705 cl->bitmap[i] &= and_with->bitmap[i];
706 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
707 if (!(and_with->flags & ANYOF_EOS))
708 cl->flags &= ~ANYOF_EOS;
710 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
711 !(and_with->flags & ANYOF_INVERT)) {
712 cl->flags &= ~ANYOF_UNICODE_ALL;
713 cl->flags |= ANYOF_UNICODE;
714 ARG_SET(cl, ARG(and_with));
716 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
717 !(and_with->flags & ANYOF_INVERT))
718 cl->flags &= ~ANYOF_UNICODE_ALL;
719 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
720 !(and_with->flags & ANYOF_INVERT))
721 cl->flags &= ~ANYOF_UNICODE;
724 /* 'OR' a given class with another one. Can create false positives */
725 /* We assume that cl is not inverted */
727 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
729 if (or_with->flags & ANYOF_INVERT) {
731 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
732 * <= (B1 | !B2) | (CL1 | !CL2)
733 * which is wasteful if CL2 is small, but we ignore CL2:
734 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
735 * XXXX Can we handle case-fold? Unclear:
736 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
737 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
739 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
740 && !(or_with->flags & ANYOF_FOLD)
741 && !(cl->flags & ANYOF_FOLD) ) {
744 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
745 cl->bitmap[i] |= ~or_with->bitmap[i];
746 } /* XXXX: logic is complicated otherwise */
748 cl_anything(pRExC_state, cl);
751 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
752 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
753 && (!(or_with->flags & ANYOF_FOLD)
754 || (cl->flags & ANYOF_FOLD)) ) {
757 /* OR char bitmap and class bitmap separately */
758 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
759 cl->bitmap[i] |= or_with->bitmap[i];
760 if (or_with->flags & ANYOF_CLASS) {
761 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
762 cl->classflags[i] |= or_with->classflags[i];
763 cl->flags |= ANYOF_CLASS;
766 else { /* XXXX: logic is complicated, leave it along for a moment. */
767 cl_anything(pRExC_state, cl);
770 if (or_with->flags & ANYOF_EOS)
771 cl->flags |= ANYOF_EOS;
773 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
774 ARG(cl) != ARG(or_with)) {
775 cl->flags |= ANYOF_UNICODE_ALL;
776 cl->flags &= ~ANYOF_UNICODE;
778 if (or_with->flags & ANYOF_UNICODE_ALL) {
779 cl->flags |= ANYOF_UNICODE_ALL;
780 cl->flags &= ~ANYOF_UNICODE;
784 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
785 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
786 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
787 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
793 dump_trie_interim_list(trie,next_alloc)
794 dump_trie_interim_table(trie,next_alloc)
796 These routines dump out a trie in a somewhat readable format.
797 The _interim_ variants are used for debugging the interim
798 tables that are used to generate the final compressed
799 representation which is what dump_trie expects.
801 Part of the reason for their existance is to provide a form
802 of documentation as to how the different representations function.
808 Dumps the final compressed table form of the trie to Perl_debug_log.
809 Used for debugging make_trie().
813 S_dump_trie(pTHX_ const struct _reg_trie_data *trie,U32 depth)
816 SV *sv=sv_newmortal();
817 int colwidth= trie->widecharmap ? 6 : 4;
818 GET_RE_DEBUG_FLAGS_DECL;
821 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
822 (int)depth * 2 + 2,"",
823 "Match","Base","Ofs" );
825 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
826 SV ** const tmp = av_fetch( trie->revcharmap, state, 0);
828 PerlIO_printf( Perl_debug_log, "%*s",
830 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
831 PL_colors[0], PL_colors[1],
832 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
833 PERL_PV_ESCAPE_FIRSTCHAR
838 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
839 (int)depth * 2 + 2,"");
841 for( state = 0 ; state < trie->uniquecharcount ; state++ )
842 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
843 PerlIO_printf( Perl_debug_log, "\n");
845 for( state = 1 ; state < trie->statecount ; state++ ) {
846 const U32 base = trie->states[ state ].trans.base;
848 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
850 if ( trie->states[ state ].wordnum ) {
851 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
853 PerlIO_printf( Perl_debug_log, "%6s", "" );
856 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
861 while( ( base + ofs < trie->uniquecharcount ) ||
862 ( base + ofs - trie->uniquecharcount < trie->lasttrans
863 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
866 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
868 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
869 if ( ( base + ofs >= trie->uniquecharcount ) &&
870 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
871 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
873 PerlIO_printf( Perl_debug_log, "%*"UVXf,
875 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
877 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
881 PerlIO_printf( Perl_debug_log, "]");
884 PerlIO_printf( Perl_debug_log, "\n" );
888 dump_trie_interim_list(trie,next_alloc)
889 Dumps a fully constructed but uncompressed trie in list form.
890 List tries normally only are used for construction when the number of
891 possible chars (trie->uniquecharcount) is very high.
892 Used for debugging make_trie().
895 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc,U32 depth)
898 SV *sv=sv_newmortal();
899 int colwidth= trie->widecharmap ? 6 : 4;
900 GET_RE_DEBUG_FLAGS_DECL;
901 /* print out the table precompression. */
902 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
903 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
904 "------:-----+-----------------\n" );
906 for( state=1 ; state < next_alloc ; state ++ ) {
909 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
910 (int)depth * 2 + 2,"", (UV)state );
911 if ( ! trie->states[ state ].wordnum ) {
912 PerlIO_printf( Perl_debug_log, "%5s| ","");
914 PerlIO_printf( Perl_debug_log, "W%4x| ",
915 trie->states[ state ].wordnum
918 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
919 SV ** const tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
921 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
923 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
924 PL_colors[0], PL_colors[1],
925 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
926 PERL_PV_ESCAPE_FIRSTCHAR
928 TRIE_LIST_ITEM(state,charid).forid,
929 (UV)TRIE_LIST_ITEM(state,charid).newstate
932 PerlIO_printf(Perl_debug_log, "\n%*s| ",
933 (int)((depth * 2) + 14), "");
936 PerlIO_printf( Perl_debug_log, "\n");
941 dump_trie_interim_table(trie,next_alloc)
942 Dumps a fully constructed but uncompressed trie in table form.
943 This is the normal DFA style state transition table, with a few
944 twists to facilitate compression later.
945 Used for debugging make_trie().
948 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc, U32 depth)
952 SV *sv=sv_newmortal();
953 int colwidth= trie->widecharmap ? 6 : 4;
954 GET_RE_DEBUG_FLAGS_DECL;
957 print out the table precompression so that we can do a visual check
958 that they are identical.
961 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
963 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
964 SV ** const tmp = av_fetch( trie->revcharmap, charid, 0);
966 PerlIO_printf( Perl_debug_log, "%*s",
968 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
969 PL_colors[0], PL_colors[1],
970 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
971 PERL_PV_ESCAPE_FIRSTCHAR
977 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
979 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
980 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
983 PerlIO_printf( Perl_debug_log, "\n" );
985 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
987 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
988 (int)depth * 2 + 2,"",
989 (UV)TRIE_NODENUM( state ) );
991 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
992 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
994 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
996 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
998 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
999 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1001 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1002 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1009 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1010 startbranch: the first branch in the whole branch sequence
1011 first : start branch of sequence of branch-exact nodes.
1012 May be the same as startbranch
1013 last : Thing following the last branch.
1014 May be the same as tail.
1015 tail : item following the branch sequence
1016 count : words in the sequence
1017 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1018 depth : indent depth
1020 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1022 A trie is an N'ary tree where the branches are determined by digital
1023 decomposition of the key. IE, at the root node you look up the 1st character and
1024 follow that branch repeat until you find the end of the branches. Nodes can be
1025 marked as "accepting" meaning they represent a complete word. Eg:
1029 would convert into the following structure. Numbers represent states, letters
1030 following numbers represent valid transitions on the letter from that state, if
1031 the number is in square brackets it represents an accepting state, otherwise it
1032 will be in parenthesis.
1034 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1038 (1) +-i->(6)-+-s->[7]
1040 +-s->(3)-+-h->(4)-+-e->[5]
1042 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1044 This shows that when matching against the string 'hers' we will begin at state 1
1045 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1046 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1047 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1048 single traverse. We store a mapping from accepting to state to which word was
1049 matched, and then when we have multiple possibilities we try to complete the
1050 rest of the regex in the order in which they occured in the alternation.
1052 The only prior NFA like behaviour that would be changed by the TRIE support is
1053 the silent ignoring of duplicate alternations which are of the form:
1055 / (DUPE|DUPE) X? (?{ ... }) Y /x
1057 Thus EVAL blocks follwing a trie may be called a different number of times with
1058 and without the optimisation. With the optimisations dupes will be silently
1059 ignored. This inconsistant behaviour of EVAL type nodes is well established as
1060 the following demonstrates:
1062 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1064 which prints out 'word' three times, but
1066 'words'=~/(word|word|word)(?{ print $1 })S/
1068 which doesnt print it out at all. This is due to other optimisations kicking in.
1070 Example of what happens on a structural level:
1072 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1074 1: CURLYM[1] {1,32767}(18)
1085 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1086 and should turn into:
1088 1: CURLYM[1] {1,32767}(18)
1090 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1098 Cases where tail != last would be like /(?foo|bar)baz/:
1108 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1109 and would end up looking like:
1112 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1119 d = uvuni_to_utf8_flags(d, uv, 0);
1121 is the recommended Unicode-aware way of saying
1126 #define TRIE_STORE_REVCHAR \
1128 SV *tmp = newSVpvs(""); \
1129 if (UTF) SvUTF8_on(tmp); \
1130 Perl_sv_catpvf( aTHX_ tmp, "%c", (int)uvc ); \
1131 av_push( TRIE_REVCHARMAP(trie), tmp ); \
1134 #define TRIE_READ_CHAR STMT_START { \
1138 if ( foldlen > 0 ) { \
1139 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1144 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1145 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1146 foldlen -= UNISKIP( uvc ); \
1147 scan = foldbuf + UNISKIP( uvc ); \
1150 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1160 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1161 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1162 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1163 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1165 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1166 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1167 TRIE_LIST_CUR( state )++; \
1170 #define TRIE_LIST_NEW(state) STMT_START { \
1171 Newxz( trie->states[ state ].trans.list, \
1172 4, reg_trie_trans_le ); \
1173 TRIE_LIST_CUR( state ) = 1; \
1174 TRIE_LIST_LEN( state ) = 4; \
1177 #define TRIE_HANDLE_WORD(state) STMT_START { \
1178 U16 dupe= trie->states[ state ].wordnum; \
1179 regnode * const noper_next = regnext( noper ); \
1181 if (trie->wordlen) \
1182 trie->wordlen[ curword ] = wordlen; \
1184 /* store the word for dumping */ \
1186 if (OP(noper) != NOTHING) \
1187 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
1189 tmp = newSVpvn( "", 0 ); \
1190 if ( UTF ) SvUTF8_on( tmp ); \
1191 av_push( trie->words, tmp ); \
1196 if ( noper_next < tail ) { \
1198 Newxz( trie->jump, word_count + 1, U16); \
1199 trie->jump[curword] = (U16)(noper_next - convert); \
1201 jumper = noper_next; \
1203 nextbranch= regnext(cur); \
1207 /* So it's a dupe. This means we need to maintain a */\
1208 /* linked-list from the first to the next. */\
1209 /* we only allocate the nextword buffer when there */\
1210 /* a dupe, so first time we have to do the allocation */\
1211 if (!trie->nextword) \
1212 Newxz( trie->nextword, word_count + 1, U16); \
1213 while ( trie->nextword[dupe] ) \
1214 dupe= trie->nextword[dupe]; \
1215 trie->nextword[dupe]= curword; \
1217 /* we haven't inserted this word yet. */ \
1218 trie->states[ state ].wordnum = curword; \
1223 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1224 ( ( base + charid >= ucharcount \
1225 && base + charid < ubound \
1226 && state == trie->trans[ base - ucharcount + charid ].check \
1227 && trie->trans[ base - ucharcount + charid ].next ) \
1228 ? trie->trans[ base - ucharcount + charid ].next \
1229 : ( state==1 ? special : 0 ) \
1233 #define MADE_JUMP_TRIE 2
1234 #define MADE_EXACT_TRIE 4
1237 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1240 /* first pass, loop through and scan words */
1241 reg_trie_data *trie;
1243 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1248 regnode *jumper = NULL;
1249 regnode *nextbranch = NULL;
1250 regnode *convert = NULL;
1251 /* we just use folder as a flag in utf8 */
1252 const U8 * const folder = ( flags == EXACTF
1254 : ( flags == EXACTFL
1260 const U32 data_slot = add_data( pRExC_state, 1, "t" );
1261 SV *re_trie_maxbuff;
1263 /* these are only used during construction but are useful during
1264 * debugging so we store them in the struct when debugging.
1266 STRLEN trie_charcount=0;
1267 AV *trie_revcharmap;
1269 GET_RE_DEBUG_FLAGS_DECL;
1271 PERL_UNUSED_ARG(depth);
1274 Newxz( trie, 1, reg_trie_data );
1276 trie->startstate = 1;
1277 trie->wordcount = word_count;
1278 RExC_rx->data->data[ data_slot ] = (void*)trie;
1279 Newxz( trie->charmap, 256, U16 );
1280 if (!(UTF && folder))
1281 Newxz( trie->bitmap, ANYOF_BITMAP_SIZE, char );
1283 trie->words = newAV();
1285 TRIE_REVCHARMAP(trie) = newAV();
1287 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1288 if (!SvIOK(re_trie_maxbuff)) {
1289 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1292 PerlIO_printf( Perl_debug_log,
1293 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1294 (int)depth * 2 + 2, "",
1295 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1296 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1300 /* Find the node we are going to overwrite */
1301 if ( first == startbranch && OP( last ) != BRANCH ) {
1302 /* whole branch chain */
1305 /* branch sub-chain */
1306 convert = NEXTOPER( first );
1309 /* -- First loop and Setup --
1311 We first traverse the branches and scan each word to determine if it
1312 contains widechars, and how many unique chars there are, this is
1313 important as we have to build a table with at least as many columns as we
1316 We use an array of integers to represent the character codes 0..255
1317 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1318 native representation of the character value as the key and IV's for the
1321 *TODO* If we keep track of how many times each character is used we can
1322 remap the columns so that the table compression later on is more
1323 efficient in terms of memory by ensuring most common value is in the
1324 middle and the least common are on the outside. IMO this would be better
1325 than a most to least common mapping as theres a decent chance the most
1326 common letter will share a node with the least common, meaning the node
1327 will not be compressable. With a middle is most common approach the worst
1328 case is when we have the least common nodes twice.
1332 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1333 regnode * const noper = NEXTOPER( cur );
1334 const U8 *uc = (U8*)STRING( noper );
1335 const U8 * const e = uc + STR_LEN( noper );
1337 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1338 const U8 *scan = (U8*)NULL;
1339 U32 wordlen = 0; /* required init */
1342 if (OP(noper) == NOTHING) {
1347 TRIE_BITMAP_SET(trie,*uc);
1348 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1350 for ( ; uc < e ; uc += len ) {
1351 TRIE_CHARCOUNT(trie)++;
1355 if ( !trie->charmap[ uvc ] ) {
1356 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1358 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1363 if ( !trie->widecharmap )
1364 trie->widecharmap = newHV();
1366 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1369 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1371 if ( !SvTRUE( *svpp ) ) {
1372 sv_setiv( *svpp, ++trie->uniquecharcount );
1377 if( cur == first ) {
1380 } else if (chars < trie->minlen) {
1382 } else if (chars > trie->maxlen) {
1386 } /* end first pass */
1387 DEBUG_TRIE_COMPILE_r(
1388 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1389 (int)depth * 2 + 2,"",
1390 ( trie->widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1391 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1392 (int)trie->minlen, (int)trie->maxlen )
1394 Newxz( trie->wordlen, word_count, U32 );
1397 We now know what we are dealing with in terms of unique chars and
1398 string sizes so we can calculate how much memory a naive
1399 representation using a flat table will take. If it's over a reasonable
1400 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1401 conservative but potentially much slower representation using an array
1404 At the end we convert both representations into the same compressed
1405 form that will be used in regexec.c for matching with. The latter
1406 is a form that cannot be used to construct with but has memory
1407 properties similar to the list form and access properties similar
1408 to the table form making it both suitable for fast searches and
1409 small enough that its feasable to store for the duration of a program.
1411 See the comment in the code where the compressed table is produced
1412 inplace from the flat tabe representation for an explanation of how
1413 the compression works.
1418 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1420 Second Pass -- Array Of Lists Representation
1422 Each state will be represented by a list of charid:state records
1423 (reg_trie_trans_le) the first such element holds the CUR and LEN
1424 points of the allocated array. (See defines above).
1426 We build the initial structure using the lists, and then convert
1427 it into the compressed table form which allows faster lookups
1428 (but cant be modified once converted).
1431 STRLEN transcount = 1;
1433 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1434 "%*sCompiling trie using list compiler\n",
1435 (int)depth * 2 + 2, ""));
1437 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
1441 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1443 regnode * const noper = NEXTOPER( cur );
1444 U8 *uc = (U8*)STRING( noper );
1445 const U8 * const e = uc + STR_LEN( noper );
1446 U32 state = 1; /* required init */
1447 U16 charid = 0; /* sanity init */
1448 U8 *scan = (U8*)NULL; /* sanity init */
1449 STRLEN foldlen = 0; /* required init */
1450 U32 wordlen = 0; /* required init */
1451 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1453 if (OP(noper) != NOTHING) {
1454 for ( ; uc < e ; uc += len ) {
1459 charid = trie->charmap[ uvc ];
1461 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1465 charid=(U16)SvIV( *svpp );
1468 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1475 if ( !trie->states[ state ].trans.list ) {
1476 TRIE_LIST_NEW( state );
1478 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1479 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1480 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1485 newstate = next_alloc++;
1486 TRIE_LIST_PUSH( state, charid, newstate );
1491 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1495 TRIE_HANDLE_WORD(state);
1497 } /* end second pass */
1499 /* next alloc is the NEXT state to be allocated */
1500 trie->statecount = next_alloc;
1501 Renew( trie->states, next_alloc, reg_trie_state );
1503 /* and now dump it out before we compress it */
1504 DEBUG_TRIE_COMPILE_MORE_r(
1505 dump_trie_interim_list(trie,next_alloc,depth+1)
1508 Newxz( trie->trans, transcount ,reg_trie_trans );
1515 for( state=1 ; state < next_alloc ; state ++ ) {
1519 DEBUG_TRIE_COMPILE_MORE_r(
1520 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1524 if (trie->states[state].trans.list) {
1525 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1529 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1530 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1531 if ( forid < minid ) {
1533 } else if ( forid > maxid ) {
1537 if ( transcount < tp + maxid - minid + 1) {
1539 Renew( trie->trans, transcount, reg_trie_trans );
1540 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1542 base = trie->uniquecharcount + tp - minid;
1543 if ( maxid == minid ) {
1545 for ( ; zp < tp ; zp++ ) {
1546 if ( ! trie->trans[ zp ].next ) {
1547 base = trie->uniquecharcount + zp - minid;
1548 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1549 trie->trans[ zp ].check = state;
1555 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1556 trie->trans[ tp ].check = state;
1561 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1562 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1563 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1564 trie->trans[ tid ].check = state;
1566 tp += ( maxid - minid + 1 );
1568 Safefree(trie->states[ state ].trans.list);
1571 DEBUG_TRIE_COMPILE_MORE_r(
1572 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1575 trie->states[ state ].trans.base=base;
1577 trie->lasttrans = tp + 1;
1581 Second Pass -- Flat Table Representation.
1583 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1584 We know that we will need Charcount+1 trans at most to store the data
1585 (one row per char at worst case) So we preallocate both structures
1586 assuming worst case.
1588 We then construct the trie using only the .next slots of the entry
1591 We use the .check field of the first entry of the node temporarily to
1592 make compression both faster and easier by keeping track of how many non
1593 zero fields are in the node.
1595 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1598 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1599 number representing the first entry of the node, and state as a
1600 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1601 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1602 are 2 entrys per node. eg:
1610 The table is internally in the right hand, idx form. However as we also
1611 have to deal with the states array which is indexed by nodenum we have to
1612 use TRIE_NODENUM() to convert.
1615 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1616 "%*sCompiling trie using table compiler\n",
1617 (int)depth * 2 + 2, ""));
1619 Newxz( trie->trans, ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1,
1621 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
1622 next_alloc = trie->uniquecharcount + 1;
1625 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1627 regnode * const noper = NEXTOPER( cur );
1628 const U8 *uc = (U8*)STRING( noper );
1629 const U8 * const e = uc + STR_LEN( noper );
1631 U32 state = 1; /* required init */
1633 U16 charid = 0; /* sanity init */
1634 U32 accept_state = 0; /* sanity init */
1635 U8 *scan = (U8*)NULL; /* sanity init */
1637 STRLEN foldlen = 0; /* required init */
1638 U32 wordlen = 0; /* required init */
1639 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1641 if ( OP(noper) != NOTHING ) {
1642 for ( ; uc < e ; uc += len ) {
1647 charid = trie->charmap[ uvc ];
1649 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1650 charid = svpp ? (U16)SvIV(*svpp) : 0;
1654 if ( !trie->trans[ state + charid ].next ) {
1655 trie->trans[ state + charid ].next = next_alloc;
1656 trie->trans[ state ].check++;
1657 next_alloc += trie->uniquecharcount;
1659 state = trie->trans[ state + charid ].next;
1661 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1663 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1666 accept_state = TRIE_NODENUM( state );
1667 TRIE_HANDLE_WORD(accept_state);
1669 } /* end second pass */
1671 /* and now dump it out before we compress it */
1672 DEBUG_TRIE_COMPILE_MORE_r(
1673 dump_trie_interim_table(trie,next_alloc,depth+1)
1678 * Inplace compress the table.*
1680 For sparse data sets the table constructed by the trie algorithm will
1681 be mostly 0/FAIL transitions or to put it another way mostly empty.
1682 (Note that leaf nodes will not contain any transitions.)
1684 This algorithm compresses the tables by eliminating most such
1685 transitions, at the cost of a modest bit of extra work during lookup:
1687 - Each states[] entry contains a .base field which indicates the
1688 index in the state[] array wheres its transition data is stored.
1690 - If .base is 0 there are no valid transitions from that node.
1692 - If .base is nonzero then charid is added to it to find an entry in
1695 -If trans[states[state].base+charid].check!=state then the
1696 transition is taken to be a 0/Fail transition. Thus if there are fail
1697 transitions at the front of the node then the .base offset will point
1698 somewhere inside the previous nodes data (or maybe even into a node
1699 even earlier), but the .check field determines if the transition is
1703 The following process inplace converts the table to the compressed
1704 table: We first do not compress the root node 1,and mark its all its
1705 .check pointers as 1 and set its .base pointer as 1 as well. This
1706 allows to do a DFA construction from the compressed table later, and
1707 ensures that any .base pointers we calculate later are greater than
1710 - We set 'pos' to indicate the first entry of the second node.
1712 - We then iterate over the columns of the node, finding the first and
1713 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1714 and set the .check pointers accordingly, and advance pos
1715 appropriately and repreat for the next node. Note that when we copy
1716 the next pointers we have to convert them from the original
1717 NODEIDX form to NODENUM form as the former is not valid post
1720 - If a node has no transitions used we mark its base as 0 and do not
1721 advance the pos pointer.
1723 - If a node only has one transition we use a second pointer into the
1724 structure to fill in allocated fail transitions from other states.
1725 This pointer is independent of the main pointer and scans forward
1726 looking for null transitions that are allocated to a state. When it
1727 finds one it writes the single transition into the "hole". If the
1728 pointer doesnt find one the single transition is appended as normal.
1730 - Once compressed we can Renew/realloc the structures to release the
1733 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1734 specifically Fig 3.47 and the associated pseudocode.
1738 const U32 laststate = TRIE_NODENUM( next_alloc );
1741 trie->statecount = laststate;
1743 for ( state = 1 ; state < laststate ; state++ ) {
1745 const U32 stateidx = TRIE_NODEIDX( state );
1746 const U32 o_used = trie->trans[ stateidx ].check;
1747 U32 used = trie->trans[ stateidx ].check;
1748 trie->trans[ stateidx ].check = 0;
1750 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1751 if ( flag || trie->trans[ stateidx + charid ].next ) {
1752 if ( trie->trans[ stateidx + charid ].next ) {
1754 for ( ; zp < pos ; zp++ ) {
1755 if ( ! trie->trans[ zp ].next ) {
1759 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1760 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1761 trie->trans[ zp ].check = state;
1762 if ( ++zp > pos ) pos = zp;
1769 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1771 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1772 trie->trans[ pos ].check = state;
1777 trie->lasttrans = pos + 1;
1778 Renew( trie->states, laststate, reg_trie_state);
1779 DEBUG_TRIE_COMPILE_MORE_r(
1780 PerlIO_printf( Perl_debug_log,
1781 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1782 (int)depth * 2 + 2,"",
1783 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1786 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1789 } /* end table compress */
1791 DEBUG_TRIE_COMPILE_MORE_r(
1792 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1793 (int)depth * 2 + 2, "",
1794 (UV)trie->statecount,
1795 (UV)trie->lasttrans)
1797 /* resize the trans array to remove unused space */
1798 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
1800 /* and now dump out the compressed format */
1801 DEBUG_TRIE_COMPILE_r(
1802 dump_trie(trie,depth+1)
1805 { /* Modify the program and insert the new TRIE node*/
1806 U8 nodetype =(U8)(flags & 0xFF);
1810 regnode *optimize = NULL;
1812 U32 mjd_nodelen = 0;
1815 This means we convert either the first branch or the first Exact,
1816 depending on whether the thing following (in 'last') is a branch
1817 or not and whther first is the startbranch (ie is it a sub part of
1818 the alternation or is it the whole thing.)
1819 Assuming its a sub part we conver the EXACT otherwise we convert
1820 the whole branch sequence, including the first.
1822 /* Find the node we are going to overwrite */
1823 if ( first != startbranch || OP( last ) == BRANCH ) {
1824 /* branch sub-chain */
1825 NEXT_OFF( first ) = (U16)(last - first);
1827 mjd_offset= Node_Offset((convert));
1828 mjd_nodelen= Node_Length((convert));
1830 /* whole branch chain */
1833 const regnode *nop = NEXTOPER( convert );
1834 mjd_offset= Node_Offset((nop));
1835 mjd_nodelen= Node_Length((nop));
1840 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1841 (int)depth * 2 + 2, "",
1842 (UV)mjd_offset, (UV)mjd_nodelen)
1845 /* But first we check to see if there is a common prefix we can
1846 split out as an EXACT and put in front of the TRIE node. */
1847 trie->startstate= 1;
1848 if ( trie->bitmap && !trie->widecharmap && !trie->jump ) {
1850 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
1854 const U32 base = trie->states[ state ].trans.base;
1856 if ( trie->states[state].wordnum )
1859 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1860 if ( ( base + ofs >= trie->uniquecharcount ) &&
1861 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1862 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1864 if ( ++count > 1 ) {
1865 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), ofs, 0);
1866 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
1867 if ( state == 1 ) break;
1869 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1871 PerlIO_printf(Perl_debug_log,
1872 "%*sNew Start State=%"UVuf" Class: [",
1873 (int)depth * 2 + 2, "",
1876 SV ** const tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1877 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
1879 TRIE_BITMAP_SET(trie,*ch);
1881 TRIE_BITMAP_SET(trie, folder[ *ch ]);
1883 PerlIO_printf(Perl_debug_log, (char*)ch)
1887 TRIE_BITMAP_SET(trie,*ch);
1889 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1890 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1896 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1897 const char *ch = SvPV_nolen_const( *tmp );
1899 PerlIO_printf( Perl_debug_log,
1900 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1901 (int)depth * 2 + 2, "",
1902 (UV)state, (UV)idx, ch)
1905 OP( convert ) = nodetype;
1906 str=STRING(convert);
1915 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
1921 regnode *n = convert+NODE_SZ_STR(convert);
1922 NEXT_OFF(convert) = NODE_SZ_STR(convert);
1923 trie->startstate = state;
1924 trie->minlen -= (state - 1);
1925 trie->maxlen -= (state - 1);
1927 regnode *fix = convert;
1929 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1930 while( ++fix < n ) {
1931 Set_Node_Offset_Length(fix, 0, 0);
1937 NEXT_OFF(convert) = (U16)(tail - convert);
1938 DEBUG_r(optimize= n);
1944 if ( trie->maxlen ) {
1945 NEXT_OFF( convert ) = (U16)(tail - convert);
1946 ARG_SET( convert, data_slot );
1947 /* Store the offset to the first unabsorbed branch in
1948 jump[0], which is otherwise unused by the jump logic.
1949 We use this when dumping a trie and during optimisation. */
1951 trie->jump[0] = (U16)(nextbranch - convert);
1954 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1955 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
1957 OP( convert ) = TRIEC;
1958 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
1959 Safefree(trie->bitmap);
1962 OP( convert ) = TRIE;
1964 /* store the type in the flags */
1965 convert->flags = nodetype;
1969 + regarglen[ OP( convert ) ];
1971 /* XXX We really should free up the resource in trie now,
1972 as we won't use them - (which resources?) dmq */
1974 /* needed for dumping*/
1975 DEBUG_r(if (optimize) {
1976 regnode *opt = convert;
1977 while ( ++opt < optimize) {
1978 Set_Node_Offset_Length(opt,0,0);
1981 Try to clean up some of the debris left after the
1984 while( optimize < jumper ) {
1985 mjd_nodelen += Node_Length((optimize));
1986 OP( optimize ) = OPTIMIZED;
1987 Set_Node_Offset_Length(optimize,0,0);
1990 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
1992 } /* end node insert */
1994 SvREFCNT_dec(TRIE_REVCHARMAP(trie));
1998 : trie->startstate>1
2004 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2006 /* The Trie is constructed and compressed now so we can build a fail array now if its needed
2008 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2009 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2012 We find the fail state for each state in the trie, this state is the longest proper
2013 suffix of the current states 'word' that is also a proper prefix of another word in our
2014 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2015 the DFA not to have to restart after its tried and failed a word at a given point, it
2016 simply continues as though it had been matching the other word in the first place.
2018 'abcdgu'=~/abcdefg|cdgu/
2019 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2020 fail, which would bring use to the state representing 'd' in the second word where we would
2021 try 'g' and succeed, prodceding to match 'cdgu'.
2023 /* add a fail transition */
2024 reg_trie_data *trie=(reg_trie_data *)RExC_rx->data->data[ARG(source)];
2026 const U32 ucharcount = trie->uniquecharcount;
2027 const U32 numstates = trie->statecount;
2028 const U32 ubound = trie->lasttrans + ucharcount;
2032 U32 base = trie->states[ 1 ].trans.base;
2035 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2036 GET_RE_DEBUG_FLAGS_DECL;
2038 PERL_UNUSED_ARG(depth);
2042 ARG_SET( stclass, data_slot );
2043 Newxz( aho, 1, reg_ac_data );
2044 RExC_rx->data->data[ data_slot ] = (void*)aho;
2046 aho->states=(reg_trie_state *)savepvn((const char*)trie->states,
2047 numstates * sizeof(reg_trie_state));
2048 Newxz( q, numstates, U32);
2049 Newxz( aho->fail, numstates, U32 );
2052 /* initialize fail[0..1] to be 1 so that we always have
2053 a valid final fail state */
2054 fail[ 0 ] = fail[ 1 ] = 1;
2056 for ( charid = 0; charid < ucharcount ; charid++ ) {
2057 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2059 q[ q_write ] = newstate;
2060 /* set to point at the root */
2061 fail[ q[ q_write++ ] ]=1;
2064 while ( q_read < q_write) {
2065 const U32 cur = q[ q_read++ % numstates ];
2066 base = trie->states[ cur ].trans.base;
2068 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2069 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2071 U32 fail_state = cur;
2074 fail_state = fail[ fail_state ];
2075 fail_base = aho->states[ fail_state ].trans.base;
2076 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2078 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2079 fail[ ch_state ] = fail_state;
2080 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2082 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2084 q[ q_write++ % numstates] = ch_state;
2088 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2089 when we fail in state 1, this allows us to use the
2090 charclass scan to find a valid start char. This is based on the principle
2091 that theres a good chance the string being searched contains lots of stuff
2092 that cant be a start char.
2094 fail[ 0 ] = fail[ 1 ] = 0;
2095 DEBUG_TRIE_COMPILE_r({
2096 PerlIO_printf(Perl_debug_log,
2097 "%*sStclass Failtable (%"UVuf" states): 0",
2098 (int)(depth * 2), "", (UV)numstates
2100 for( q_read=1; q_read<numstates; q_read++ ) {
2101 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2103 PerlIO_printf(Perl_debug_log, "\n");
2106 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2111 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2112 * These need to be revisited when a newer toolchain becomes available.
2114 #if defined(__sparc64__) && defined(__GNUC__)
2115 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2116 # undef SPARC64_GCC_WORKAROUND
2117 # define SPARC64_GCC_WORKAROUND 1
2121 #define DEBUG_PEEP(str,scan,depth) \
2122 DEBUG_OPTIMISE_r({ \
2123 SV * const mysv=sv_newmortal(); \
2124 regnode *Next = regnext(scan); \
2125 regprop(RExC_rx, mysv, scan); \
2126 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2127 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2128 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2135 #define JOIN_EXACT(scan,min,flags) \
2136 if (PL_regkind[OP(scan)] == EXACT) \
2137 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2140 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2141 /* Merge several consecutive EXACTish nodes into one. */
2142 regnode *n = regnext(scan);
2144 regnode *next = scan + NODE_SZ_STR(scan);
2148 regnode *stop = scan;
2149 GET_RE_DEBUG_FLAGS_DECL;
2151 PERL_UNUSED_ARG(depth);
2153 #ifndef EXPERIMENTAL_INPLACESCAN
2154 PERL_UNUSED_ARG(flags);
2155 PERL_UNUSED_ARG(val);
2157 DEBUG_PEEP("join",scan,depth);
2159 /* Skip NOTHING, merge EXACT*. */
2161 ( PL_regkind[OP(n)] == NOTHING ||
2162 (stringok && (OP(n) == OP(scan))))
2164 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2166 if (OP(n) == TAIL || n > next)
2168 if (PL_regkind[OP(n)] == NOTHING) {
2169 DEBUG_PEEP("skip:",n,depth);
2170 NEXT_OFF(scan) += NEXT_OFF(n);
2171 next = n + NODE_STEP_REGNODE;
2178 else if (stringok) {
2179 const unsigned int oldl = STR_LEN(scan);
2180 regnode * const nnext = regnext(n);
2182 DEBUG_PEEP("merg",n,depth);
2185 if (oldl + STR_LEN(n) > U8_MAX)
2187 NEXT_OFF(scan) += NEXT_OFF(n);
2188 STR_LEN(scan) += STR_LEN(n);
2189 next = n + NODE_SZ_STR(n);
2190 /* Now we can overwrite *n : */
2191 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2199 #ifdef EXPERIMENTAL_INPLACESCAN
2200 if (flags && !NEXT_OFF(n)) {
2201 DEBUG_PEEP("atch", val, depth);
2202 if (reg_off_by_arg[OP(n)]) {
2203 ARG_SET(n, val - n);
2206 NEXT_OFF(n) = val - n;
2213 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2215 Two problematic code points in Unicode casefolding of EXACT nodes:
2217 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2218 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2224 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2225 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2227 This means that in case-insensitive matching (or "loose matching",
2228 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2229 length of the above casefolded versions) can match a target string
2230 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2231 This would rather mess up the minimum length computation.
2233 What we'll do is to look for the tail four bytes, and then peek
2234 at the preceding two bytes to see whether we need to decrease
2235 the minimum length by four (six minus two).
2237 Thanks to the design of UTF-8, there cannot be false matches:
2238 A sequence of valid UTF-8 bytes cannot be a subsequence of
2239 another valid sequence of UTF-8 bytes.
2242 char * const s0 = STRING(scan), *s, *t;
2243 char * const s1 = s0 + STR_LEN(scan) - 1;
2244 char * const s2 = s1 - 4;
2245 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2246 const char t0[] = "\xaf\x49\xaf\x42";
2248 const char t0[] = "\xcc\x88\xcc\x81";
2250 const char * const t1 = t0 + 3;
2253 s < s2 && (t = ninstr(s, s1, t0, t1));
2256 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2257 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2259 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2260 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2268 n = scan + NODE_SZ_STR(scan);
2270 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2277 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2281 /* REx optimizer. Converts nodes into quickier variants "in place".
2282 Finds fixed substrings. */
2284 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2285 to the position after last scanned or to NULL. */
2287 #define INIT_AND_WITHP \
2288 assert(!and_withp); \
2289 Newx(and_withp,1,struct regnode_charclass_class); \
2290 SAVEFREEPV(and_withp)
2293 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2294 I32 *minlenp, I32 *deltap,
2299 struct regnode_charclass_class *and_withp,
2300 U32 flags, U32 depth)
2301 /* scanp: Start here (read-write). */
2302 /* deltap: Write maxlen-minlen here. */
2303 /* last: Stop before this one. */
2304 /* data: string data about the pattern */
2305 /* stopparen: treat close N as END */
2306 /* recursed: which subroutines have we recursed into */
2307 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2310 I32 min = 0, pars = 0, code;
2311 regnode *scan = *scanp, *next;
2313 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2314 int is_inf_internal = 0; /* The studied chunk is infinite */
2315 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2316 scan_data_t data_fake;
2317 SV *re_trie_maxbuff = NULL;
2318 regnode *first_non_open = scan;
2319 I32 stopmin = I32_MAX;
2320 GET_RE_DEBUG_FLAGS_DECL;
2322 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2326 while (first_non_open && OP(first_non_open) == OPEN)
2327 first_non_open=regnext(first_non_open);
2331 while (scan && OP(scan) != END && scan < last) {
2332 /* Peephole optimizer: */
2333 DEBUG_STUDYDATA(data,depth);
2334 DEBUG_PEEP("Peep",scan,depth);
2335 JOIN_EXACT(scan,&min,0);
2337 /* Follow the next-chain of the current node and optimize
2338 away all the NOTHINGs from it. */
2339 if (OP(scan) != CURLYX) {
2340 const int max = (reg_off_by_arg[OP(scan)]
2342 /* I32 may be smaller than U16 on CRAYs! */
2343 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2344 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2348 /* Skip NOTHING and LONGJMP. */
2349 while ((n = regnext(n))
2350 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2351 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2352 && off + noff < max)
2354 if (reg_off_by_arg[OP(scan)])
2357 NEXT_OFF(scan) = off;
2362 /* The principal pseudo-switch. Cannot be a switch, since we
2363 look into several different things. */
2364 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2365 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
2366 next = regnext(scan);
2368 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2370 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
2371 /* NOTE - There is similar code to this block below for handling
2372 TRIE nodes on a re-study. If you change stuff here check there
2374 I32 max1 = 0, min1 = I32_MAX, num = 0;
2375 struct regnode_charclass_class accum;
2376 regnode * const startbranch=scan;
2378 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
2379 scan_commit(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2380 if (flags & SCF_DO_STCLASS)
2381 cl_init_zero(pRExC_state, &accum);
2383 while (OP(scan) == code) {
2384 I32 deltanext, minnext, f = 0, fake;
2385 struct regnode_charclass_class this_class;
2388 data_fake.flags = 0;
2390 data_fake.whilem_c = data->whilem_c;
2391 data_fake.last_closep = data->last_closep;
2394 data_fake.last_closep = &fake;
2395 next = regnext(scan);
2396 scan = NEXTOPER(scan);
2398 scan = NEXTOPER(scan);
2399 if (flags & SCF_DO_STCLASS) {
2400 cl_init(pRExC_state, &this_class);
2401 data_fake.start_class = &this_class;
2402 f = SCF_DO_STCLASS_AND;
2404 if (flags & SCF_WHILEM_VISITED_POS)
2405 f |= SCF_WHILEM_VISITED_POS;
2407 /* we suppose the run is continuous, last=next...*/
2408 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2410 stopparen, recursed, NULL, f,depth+1);
2413 if (max1 < minnext + deltanext)
2414 max1 = minnext + deltanext;
2415 if (deltanext == I32_MAX)
2416 is_inf = is_inf_internal = 1;
2418 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2420 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2421 if ( stopmin > minnext)
2422 stopmin = min + min1;
2423 flags &= ~SCF_DO_SUBSTR;
2425 data->flags |= SCF_SEEN_ACCEPT;
2428 if (data_fake.flags & SF_HAS_EVAL)
2429 data->flags |= SF_HAS_EVAL;
2430 data->whilem_c = data_fake.whilem_c;
2432 if (flags & SCF_DO_STCLASS)
2433 cl_or(pRExC_state, &accum, &this_class);
2434 if (code == SUSPEND)
2437 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2439 if (flags & SCF_DO_SUBSTR) {
2440 data->pos_min += min1;
2441 data->pos_delta += max1 - min1;
2442 if (max1 != min1 || is_inf)
2443 data->longest = &(data->longest_float);
2446 delta += max1 - min1;
2447 if (flags & SCF_DO_STCLASS_OR) {
2448 cl_or(pRExC_state, data->start_class, &accum);
2450 cl_and(data->start_class, and_withp);
2451 flags &= ~SCF_DO_STCLASS;
2454 else if (flags & SCF_DO_STCLASS_AND) {
2456 cl_and(data->start_class, &accum);
2457 flags &= ~SCF_DO_STCLASS;
2460 /* Switch to OR mode: cache the old value of
2461 * data->start_class */
2463 StructCopy(data->start_class, and_withp,
2464 struct regnode_charclass_class);
2465 flags &= ~SCF_DO_STCLASS_AND;
2466 StructCopy(&accum, data->start_class,
2467 struct regnode_charclass_class);
2468 flags |= SCF_DO_STCLASS_OR;
2469 data->start_class->flags |= ANYOF_EOS;
2473 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2476 Assuming this was/is a branch we are dealing with: 'scan' now
2477 points at the item that follows the branch sequence, whatever
2478 it is. We now start at the beginning of the sequence and look
2485 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2487 If we can find such a subseqence we need to turn the first
2488 element into a trie and then add the subsequent branch exact
2489 strings to the trie.
2493 1. patterns where the whole set of branch can be converted.
2495 2. patterns where only a subset can be converted.
2497 In case 1 we can replace the whole set with a single regop
2498 for the trie. In case 2 we need to keep the start and end
2501 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2502 becomes BRANCH TRIE; BRANCH X;
2504 There is an additional case, that being where there is a
2505 common prefix, which gets split out into an EXACT like node
2506 preceding the TRIE node.
2508 If x(1..n)==tail then we can do a simple trie, if not we make
2509 a "jump" trie, such that when we match the appropriate word
2510 we "jump" to the appopriate tail node. Essentailly we turn
2511 a nested if into a case structure of sorts.
2516 if (!re_trie_maxbuff) {
2517 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2518 if (!SvIOK(re_trie_maxbuff))
2519 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2521 if ( SvIV(re_trie_maxbuff)>=0 ) {
2523 regnode *first = (regnode *)NULL;
2524 regnode *last = (regnode *)NULL;
2525 regnode *tail = scan;
2530 SV * const mysv = sv_newmortal(); /* for dumping */
2532 /* var tail is used because there may be a TAIL
2533 regop in the way. Ie, the exacts will point to the
2534 thing following the TAIL, but the last branch will
2535 point at the TAIL. So we advance tail. If we
2536 have nested (?:) we may have to move through several
2540 while ( OP( tail ) == TAIL ) {
2541 /* this is the TAIL generated by (?:) */
2542 tail = regnext( tail );
2547 regprop(RExC_rx, mysv, tail );
2548 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2549 (int)depth * 2 + 2, "",
2550 "Looking for TRIE'able sequences. Tail node is: ",
2551 SvPV_nolen_const( mysv )
2557 step through the branches, cur represents each
2558 branch, noper is the first thing to be matched
2559 as part of that branch and noper_next is the
2560 regnext() of that node. if noper is an EXACT
2561 and noper_next is the same as scan (our current
2562 position in the regex) then the EXACT branch is
2563 a possible optimization target. Once we have
2564 two or more consequetive such branches we can
2565 create a trie of the EXACT's contents and stich
2566 it in place. If the sequence represents all of
2567 the branches we eliminate the whole thing and
2568 replace it with a single TRIE. If it is a
2569 subsequence then we need to stitch it in. This
2570 means the first branch has to remain, and needs
2571 to be repointed at the item on the branch chain
2572 following the last branch optimized. This could
2573 be either a BRANCH, in which case the
2574 subsequence is internal, or it could be the
2575 item following the branch sequence in which
2576 case the subsequence is at the end.
2580 /* dont use tail as the end marker for this traverse */
2581 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2582 regnode * const noper = NEXTOPER( cur );
2583 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
2584 regnode * const noper_next = regnext( noper );
2588 regprop(RExC_rx, mysv, cur);
2589 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2590 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2592 regprop(RExC_rx, mysv, noper);
2593 PerlIO_printf( Perl_debug_log, " -> %s",
2594 SvPV_nolen_const(mysv));
2597 regprop(RExC_rx, mysv, noper_next );
2598 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2599 SvPV_nolen_const(mysv));
2601 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2602 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2604 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2605 : PL_regkind[ OP( noper ) ] == EXACT )
2606 || OP(noper) == NOTHING )
2608 && noper_next == tail
2613 if ( !first || optype == NOTHING ) {
2614 if (!first) first = cur;
2615 optype = OP( noper );
2621 make_trie( pRExC_state,
2622 startbranch, first, cur, tail, count,
2625 if ( PL_regkind[ OP( noper ) ] == EXACT
2627 && noper_next == tail
2632 optype = OP( noper );
2642 regprop(RExC_rx, mysv, cur);
2643 PerlIO_printf( Perl_debug_log,
2644 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2645 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2649 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
2650 #ifdef TRIE_STUDY_OPT
2651 if ( ((made == MADE_EXACT_TRIE &&
2652 startbranch == first)
2653 || ( first_non_open == first )) &&
2655 flags |= SCF_TRIE_RESTUDY;
2656 if ( startbranch == first
2659 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2669 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2670 scan = NEXTOPER(NEXTOPER(scan));
2671 } else /* single branch is optimized. */
2672 scan = NEXTOPER(scan);
2675 else if (OP(scan) == EXACT) {
2676 I32 l = STR_LEN(scan);
2679 const U8 * const s = (U8*)STRING(scan);
2680 l = utf8_length(s, s + l);
2681 uc = utf8_to_uvchr(s, NULL);
2683 uc = *((U8*)STRING(scan));
2686 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2687 /* The code below prefers earlier match for fixed
2688 offset, later match for variable offset. */
2689 if (data->last_end == -1) { /* Update the start info. */
2690 data->last_start_min = data->pos_min;
2691 data->last_start_max = is_inf
2692 ? I32_MAX : data->pos_min + data->pos_delta;
2694 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2696 SvUTF8_on(data->last_found);
2698 SV * const sv = data->last_found;
2699 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2700 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2701 if (mg && mg->mg_len >= 0)
2702 mg->mg_len += utf8_length((U8*)STRING(scan),
2703 (U8*)STRING(scan)+STR_LEN(scan));
2705 data->last_end = data->pos_min + l;
2706 data->pos_min += l; /* As in the first entry. */
2707 data->flags &= ~SF_BEFORE_EOL;
2709 if (flags & SCF_DO_STCLASS_AND) {
2710 /* Check whether it is compatible with what we know already! */
2714 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2715 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2716 && (!(data->start_class->flags & ANYOF_FOLD)
2717 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2720 ANYOF_CLASS_ZERO(data->start_class);
2721 ANYOF_BITMAP_ZERO(data->start_class);
2723 ANYOF_BITMAP_SET(data->start_class, uc);
2724 data->start_class->flags &= ~ANYOF_EOS;
2726 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2728 else if (flags & SCF_DO_STCLASS_OR) {
2729 /* false positive possible if the class is case-folded */
2731 ANYOF_BITMAP_SET(data->start_class, uc);
2733 data->start_class->flags |= ANYOF_UNICODE_ALL;
2734 data->start_class->flags &= ~ANYOF_EOS;
2735 cl_and(data->start_class, and_withp);
2737 flags &= ~SCF_DO_STCLASS;
2739 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
2740 I32 l = STR_LEN(scan);
2741 UV uc = *((U8*)STRING(scan));
2743 /* Search for fixed substrings supports EXACT only. */
2744 if (flags & SCF_DO_SUBSTR) {
2746 scan_commit(pRExC_state, data, minlenp);
2749 const U8 * const s = (U8 *)STRING(scan);
2750 l = utf8_length(s, s + l);
2751 uc = utf8_to_uvchr(s, NULL);
2754 if (flags & SCF_DO_SUBSTR)
2756 if (flags & SCF_DO_STCLASS_AND) {
2757 /* Check whether it is compatible with what we know already! */
2761 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2762 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2763 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2765 ANYOF_CLASS_ZERO(data->start_class);
2766 ANYOF_BITMAP_ZERO(data->start_class);
2768 ANYOF_BITMAP_SET(data->start_class, uc);
2769 data->start_class->flags &= ~ANYOF_EOS;
2770 data->start_class->flags |= ANYOF_FOLD;
2771 if (OP(scan) == EXACTFL)
2772 data->start_class->flags |= ANYOF_LOCALE;
2775 else if (flags & SCF_DO_STCLASS_OR) {
2776 if (data->start_class->flags & ANYOF_FOLD) {
2777 /* false positive possible if the class is case-folded.
2778 Assume that the locale settings are the same... */
2780 ANYOF_BITMAP_SET(data->start_class, uc);
2781 data->start_class->flags &= ~ANYOF_EOS;
2783 cl_and(data->start_class, and_withp);
2785 flags &= ~SCF_DO_STCLASS;
2787 else if (strchr((const char*)PL_varies,OP(scan))) {
2788 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2789 I32 f = flags, pos_before = 0;
2790 regnode * const oscan = scan;
2791 struct regnode_charclass_class this_class;
2792 struct regnode_charclass_class *oclass = NULL;
2793 I32 next_is_eval = 0;
2795 switch (PL_regkind[OP(scan)]) {
2796 case WHILEM: /* End of (?:...)* . */
2797 scan = NEXTOPER(scan);
2800 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2801 next = NEXTOPER(scan);
2802 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2804 maxcount = REG_INFTY;
2805 next = regnext(scan);
2806 scan = NEXTOPER(scan);
2810 if (flags & SCF_DO_SUBSTR)
2815 if (flags & SCF_DO_STCLASS) {
2817 maxcount = REG_INFTY;
2818 next = regnext(scan);
2819 scan = NEXTOPER(scan);
2822 is_inf = is_inf_internal = 1;
2823 scan = regnext(scan);
2824 if (flags & SCF_DO_SUBSTR) {
2825 scan_commit(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
2826 data->longest = &(data->longest_float);
2828 goto optimize_curly_tail;
2830 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
2831 && (scan->flags == stopparen))
2836 mincount = ARG1(scan);
2837 maxcount = ARG2(scan);
2839 next = regnext(scan);
2840 if (OP(scan) == CURLYX) {
2841 I32 lp = (data ? *(data->last_closep) : 0);
2842 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
2844 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2845 next_is_eval = (OP(scan) == EVAL);
2847 if (flags & SCF_DO_SUBSTR) {
2848 if (mincount == 0) scan_commit(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
2849 pos_before = data->pos_min;
2853 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2855 data->flags |= SF_IS_INF;
2857 if (flags & SCF_DO_STCLASS) {
2858 cl_init(pRExC_state, &this_class);
2859 oclass = data->start_class;
2860 data->start_class = &this_class;
2861 f |= SCF_DO_STCLASS_AND;
2862 f &= ~SCF_DO_STCLASS_OR;
2864 /* These are the cases when once a subexpression
2865 fails at a particular position, it cannot succeed
2866 even after backtracking at the enclosing scope.
2868 XXXX what if minimal match and we are at the
2869 initial run of {n,m}? */
2870 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2871 f &= ~SCF_WHILEM_VISITED_POS;
2873 /* This will finish on WHILEM, setting scan, or on NULL: */
2874 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2875 last, data, stopparen, recursed, NULL,
2877 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
2879 if (flags & SCF_DO_STCLASS)
2880 data->start_class = oclass;
2881 if (mincount == 0 || minnext == 0) {
2882 if (flags & SCF_DO_STCLASS_OR) {
2883 cl_or(pRExC_state, data->start_class, &this_class);
2885 else if (flags & SCF_DO_STCLASS_AND) {
2886 /* Switch to OR mode: cache the old value of
2887 * data->start_class */
2889 StructCopy(data->start_class, and_withp,
2890 struct regnode_charclass_class);
2891 flags &= ~SCF_DO_STCLASS_AND;
2892 StructCopy(&this_class, data->start_class,
2893 struct regnode_charclass_class);
2894 flags |= SCF_DO_STCLASS_OR;
2895 data->start_class->flags |= ANYOF_EOS;
2897 } else { /* Non-zero len */
2898 if (flags & SCF_DO_STCLASS_OR) {
2899 cl_or(pRExC_state, data->start_class, &this_class);
2900 cl_and(data->start_class, and_withp);
2902 else if (flags & SCF_DO_STCLASS_AND)
2903 cl_and(data->start_class, &this_class);
2904 flags &= ~SCF_DO_STCLASS;
2906 if (!scan) /* It was not CURLYX, but CURLY. */
2908 if ( /* ? quantifier ok, except for (?{ ... }) */
2909 (next_is_eval || !(mincount == 0 && maxcount == 1))
2910 && (minnext == 0) && (deltanext == 0)
2911 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
2912 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2913 && ckWARN(WARN_REGEXP))
2916 "Quantifier unexpected on zero-length expression");
2919 min += minnext * mincount;
2920 is_inf_internal |= ((maxcount == REG_INFTY
2921 && (minnext + deltanext) > 0)
2922 || deltanext == I32_MAX);
2923 is_inf |= is_inf_internal;
2924 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2926 /* Try powerful optimization CURLYX => CURLYN. */
2927 if ( OP(oscan) == CURLYX && data
2928 && data->flags & SF_IN_PAR
2929 && !(data->flags & SF_HAS_EVAL)
2930 && !deltanext && minnext == 1 ) {
2931 /* Try to optimize to CURLYN. */
2932 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
2933 regnode * const nxt1 = nxt;
2940 if (!strchr((const char*)PL_simple,OP(nxt))
2941 && !(PL_regkind[OP(nxt)] == EXACT
2942 && STR_LEN(nxt) == 1))
2948 if (OP(nxt) != CLOSE)
2950 if (RExC_open_parens) {
2951 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
2952 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
2954 /* Now we know that nxt2 is the only contents: */
2955 oscan->flags = (U8)ARG(nxt);
2957 OP(nxt1) = NOTHING; /* was OPEN. */
2960 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2961 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2962 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2963 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2964 OP(nxt + 1) = OPTIMIZED; /* was count. */
2965 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
2970 /* Try optimization CURLYX => CURLYM. */
2971 if ( OP(oscan) == CURLYX && data
2972 && !(data->flags & SF_HAS_PAR)
2973 && !(data->flags & SF_HAS_EVAL)
2974 && !deltanext /* atom is fixed width */
2975 && minnext != 0 /* CURLYM can't handle zero width */
2977 /* XXXX How to optimize if data == 0? */
2978 /* Optimize to a simpler form. */
2979 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2983 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
2984 && (OP(nxt2) != WHILEM))
2986 OP(nxt2) = SUCCEED; /* Whas WHILEM */
2987 /* Need to optimize away parenths. */
2988 if (data->flags & SF_IN_PAR) {
2989 /* Set the parenth number. */
2990 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2992 if (OP(nxt) != CLOSE)
2993 FAIL("Panic opt close");
2994 oscan->flags = (U8)ARG(nxt);
2995 if (RExC_open_parens) {
2996 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
2997 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
2999 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3000 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3003 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3004 OP(nxt + 1) = OPTIMIZED; /* was count. */
3005 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3006 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
3009 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3010 regnode *nnxt = regnext(nxt1);
3013 if (reg_off_by_arg[OP(nxt1)])
3014 ARG_SET(nxt1, nxt2 - nxt1);
3015 else if (nxt2 - nxt1 < U16_MAX)
3016 NEXT_OFF(nxt1) = nxt2 - nxt1;
3018 OP(nxt) = NOTHING; /* Cannot beautify */
3023 /* Optimize again: */
3024 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3025 NULL, stopparen, recursed, NULL, 0,depth+1);
3030 else if ((OP(oscan) == CURLYX)
3031 && (flags & SCF_WHILEM_VISITED_POS)
3032 /* See the comment on a similar expression above.
3033 However, this time it not a subexpression
3034 we care about, but the expression itself. */
3035 && (maxcount == REG_INFTY)
3036 && data && ++data->whilem_c < 16) {
3037 /* This stays as CURLYX, we can put the count/of pair. */
3038 /* Find WHILEM (as in regexec.c) */
3039 regnode *nxt = oscan + NEXT_OFF(oscan);
3041 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3043 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3044 | (RExC_whilem_seen << 4)); /* On WHILEM */
3046 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3048 if (flags & SCF_DO_SUBSTR) {
3049 SV *last_str = NULL;
3050 int counted = mincount != 0;
3052 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3053 #if defined(SPARC64_GCC_WORKAROUND)
3056 const char *s = NULL;
3059 if (pos_before >= data->last_start_min)
3062 b = data->last_start_min;
3065 s = SvPV_const(data->last_found, l);
3066 old = b - data->last_start_min;
3069 I32 b = pos_before >= data->last_start_min
3070 ? pos_before : data->last_start_min;
3072 const char * const s = SvPV_const(data->last_found, l);
3073 I32 old = b - data->last_start_min;
3077 old = utf8_hop((U8*)s, old) - (U8*)s;
3080 /* Get the added string: */
3081 last_str = newSVpvn(s + old, l);
3083 SvUTF8_on(last_str);
3084 if (deltanext == 0 && pos_before == b) {
3085 /* What was added is a constant string */
3087 SvGROW(last_str, (mincount * l) + 1);
3088 repeatcpy(SvPVX(last_str) + l,
3089 SvPVX_const(last_str), l, mincount - 1);
3090 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3091 /* Add additional parts. */
3092 SvCUR_set(data->last_found,
3093 SvCUR(data->last_found) - l);
3094 sv_catsv(data->last_found, last_str);
3096 SV * sv = data->last_found;
3098 SvUTF8(sv) && SvMAGICAL(sv) ?
3099 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3100 if (mg && mg->mg_len >= 0)
3101 mg->mg_len += CHR_SVLEN(last_str);
3103 data->last_end += l * (mincount - 1);
3106 /* start offset must point into the last copy */
3107 data->last_start_min += minnext * (mincount - 1);
3108 data->last_start_max += is_inf ? I32_MAX
3109 : (maxcount - 1) * (minnext + data->pos_delta);
3112 /* It is counted once already... */
3113 data->pos_min += minnext * (mincount - counted);
3114 data->pos_delta += - counted * deltanext +
3115 (minnext + deltanext) * maxcount - minnext * mincount;
3116 if (mincount != maxcount) {
3117 /* Cannot extend fixed substrings found inside
3119 scan_commit(pRExC_state,data,minlenp);
3120 if (mincount && last_str) {
3121 SV * const sv = data->last_found;
3122 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3123 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3127 sv_setsv(sv, last_str);
3128 data->last_end = data->pos_min;
3129 data->last_start_min =
3130 data->pos_min - CHR_SVLEN(last_str);
3131 data->last_start_max = is_inf
3133 : data->pos_min + data->pos_delta
3134 - CHR_SVLEN(last_str);
3136 data->longest = &(data->longest_float);
3138 SvREFCNT_dec(last_str);
3140 if (data && (fl & SF_HAS_EVAL))
3141 data->flags |= SF_HAS_EVAL;
3142 optimize_curly_tail:
3143 if (OP(oscan) != CURLYX) {
3144 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3146 NEXT_OFF(oscan) += NEXT_OFF(next);
3149 default: /* REF and CLUMP only? */
3150 if (flags & SCF_DO_SUBSTR) {
3151 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
3152 data->longest = &(data->longest_float);
3154 is_inf = is_inf_internal = 1;
3155 if (flags & SCF_DO_STCLASS_OR)
3156 cl_anything(pRExC_state, data->start_class);
3157 flags &= ~SCF_DO_STCLASS;
3161 else if (strchr((const char*)PL_simple,OP(scan))) {
3164 if (flags & SCF_DO_SUBSTR) {
3165 scan_commit(pRExC_state,data,minlenp);
3169 if (flags & SCF_DO_STCLASS) {
3170 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3172 /* Some of the logic below assumes that switching
3173 locale on will only add false positives. */
3174 switch (PL_regkind[OP(scan)]) {
3178 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3179 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3180 cl_anything(pRExC_state, data->start_class);
3183 if (OP(scan) == SANY)
3185 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3186 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3187 || (data->start_class->flags & ANYOF_CLASS));
3188 cl_anything(pRExC_state, data->start_class);
3190 if (flags & SCF_DO_STCLASS_AND || !value)
3191 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3194 if (flags & SCF_DO_STCLASS_AND)
3195 cl_and(data->start_class,
3196 (struct regnode_charclass_class*)scan);
3198 cl_or(pRExC_state, data->start_class,
3199 (struct regnode_charclass_class*)scan);
3202 if (flags & SCF_DO_STCLASS_AND) {
3203 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3204 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3205 for (value = 0; value < 256; value++)
3206 if (!isALNUM(value))
3207 ANYOF_BITMAP_CLEAR(data->start_class, value);
3211 if (data->start_class->flags & ANYOF_LOCALE)
3212 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3214 for (value = 0; value < 256; value++)
3216 ANYOF_BITMAP_SET(data->start_class, value);
3221 if (flags & SCF_DO_STCLASS_AND) {
3222 if (data->start_class->flags & ANYOF_LOCALE)
3223 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3226 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3227 data->start_class->flags |= ANYOF_LOCALE;
3231 if (flags & SCF_DO_STCLASS_AND) {
3232 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3233 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3234 for (value = 0; value < 256; value++)
3236 ANYOF_BITMAP_CLEAR(data->start_class, value);
3240 if (data->start_class->flags & ANYOF_LOCALE)
3241 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3243 for (value = 0; value < 256; value++)
3244 if (!isALNUM(value))
3245 ANYOF_BITMAP_SET(data->start_class, value);
3250 if (flags & SCF_DO_STCLASS_AND) {
3251 if (data->start_class->flags & ANYOF_LOCALE)
3252 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3255 data->start_class->flags |= ANYOF_LOCALE;
3256 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3260 if (flags & SCF_DO_STCLASS_AND) {
3261 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3262 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3263 for (value = 0; value < 256; value++)
3264 if (!isSPACE(value))
3265 ANYOF_BITMAP_CLEAR(data->start_class, value);
3269 if (data->start_class->flags & ANYOF_LOCALE)
3270 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3272 for (value = 0; value < 256; value++)
3274 ANYOF_BITMAP_SET(data->start_class, value);
3279 if (flags & SCF_DO_STCLASS_AND) {
3280 if (data->start_class->flags & ANYOF_LOCALE)
3281 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3284 data->start_class->flags |= ANYOF_LOCALE;
3285 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3289 if (flags & SCF_DO_STCLASS_AND) {
3290 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3291 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3292 for (value = 0; value < 256; value++)
3294 ANYOF_BITMAP_CLEAR(data->start_class, value);
3298 if (data->start_class->flags & ANYOF_LOCALE)
3299 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3301 for (value = 0; value < 256; value++)
3302 if (!isSPACE(value))
3303 ANYOF_BITMAP_SET(data->start_class, value);
3308 if (flags & SCF_DO_STCLASS_AND) {
3309 if (data->start_class->flags & ANYOF_LOCALE) {
3310 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3311 for (value = 0; value < 256; value++)
3312 if (!isSPACE(value))
3313 ANYOF_BITMAP_CLEAR(data->start_class, value);
3317 data->start_class->flags |= ANYOF_LOCALE;
3318 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3322 if (flags & SCF_DO_STCLASS_AND) {
3323 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3324 for (value = 0; value < 256; value++)
3325 if (!isDIGIT(value))
3326 ANYOF_BITMAP_CLEAR(data->start_class, value);
3329 if (data->start_class->flags & ANYOF_LOCALE)
3330 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3332 for (value = 0; value < 256; value++)
3334 ANYOF_BITMAP_SET(data->start_class, value);
3339 if (flags & SCF_DO_STCLASS_AND) {
3340 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3341 for (value = 0; value < 256; value++)
3343 ANYOF_BITMAP_CLEAR(data->start_class, value);
3346 if (data->start_class->flags & ANYOF_LOCALE)
3347 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3349 for (value = 0; value < 256; value++)
3350 if (!isDIGIT(value))
3351 ANYOF_BITMAP_SET(data->start_class, value);
3356 if (flags & SCF_DO_STCLASS_OR)
3357 cl_and(data->start_class, and_withp);
3358 flags &= ~SCF_DO_STCLASS;
3361 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3362 data->flags |= (OP(scan) == MEOL
3366 else if ( PL_regkind[OP(scan)] == BRANCHJ
3367 /* Lookbehind, or need to calculate parens/evals/stclass: */
3368 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3369 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3370 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3371 || OP(scan) == UNLESSM )
3373 /* Negative Lookahead/lookbehind
3374 In this case we can't do fixed string optimisation.
3377 I32 deltanext, minnext, fake = 0;
3379 struct regnode_charclass_class intrnl;
3382 data_fake.flags = 0;
3384 data_fake.whilem_c = data->whilem_c;
3385 data_fake.last_closep = data->last_closep;
3388 data_fake.last_closep = &fake;
3389 if ( flags & SCF_DO_STCLASS && !scan->flags
3390 && OP(scan) == IFMATCH ) { /* Lookahead */
3391 cl_init(pRExC_state, &intrnl);
3392 data_fake.start_class = &intrnl;
3393 f |= SCF_DO_STCLASS_AND;
3395 if (flags & SCF_WHILEM_VISITED_POS)
3396 f |= SCF_WHILEM_VISITED_POS;
3397 next = regnext(scan);
3398 nscan = NEXTOPER(NEXTOPER(scan));
3399 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3400 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3403 vFAIL("Variable length lookbehind not implemented");
3405 else if (minnext > (I32)U8_MAX) {
3406 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3408 scan->flags = (U8)minnext;
3411 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3413 if (data_fake.flags & SF_HAS_EVAL)
3414 data->flags |= SF_HAS_EVAL;
3415 data->whilem_c = data_fake.whilem_c;
3417 if (f & SCF_DO_STCLASS_AND) {
3418 const int was = (data->start_class->flags & ANYOF_EOS);
3420 cl_and(data->start_class, &intrnl);
3422 data->start_class->flags |= ANYOF_EOS;
3425 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3427 /* Positive Lookahead/lookbehind
3428 In this case we can do fixed string optimisation,
3429 but we must be careful about it. Note in the case of
3430 lookbehind the positions will be offset by the minimum
3431 length of the pattern, something we won't know about
3432 until after the recurse.
3434 I32 deltanext, fake = 0;
3436 struct regnode_charclass_class intrnl;
3438 /* We use SAVEFREEPV so that when the full compile
3439 is finished perl will clean up the allocated
3440 minlens when its all done. This was we don't
3441 have to worry about freeing them when we know
3442 they wont be used, which would be a pain.
3445 Newx( minnextp, 1, I32 );
3446 SAVEFREEPV(minnextp);
3449 StructCopy(data, &data_fake, scan_data_t);
3450 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3453 scan_commit(pRExC_state, &data_fake,minlenp);
3454 data_fake.last_found=newSVsv(data->last_found);
3458 data_fake.last_closep = &fake;
3459 data_fake.flags = 0;
3461 data_fake.flags |= SF_IS_INF;
3462 if ( flags & SCF_DO_STCLASS && !scan->flags
3463 && OP(scan) == IFMATCH ) { /* Lookahead */
3464 cl_init(pRExC_state, &intrnl);
3465 data_fake.start_class = &intrnl;
3466 f |= SCF_DO_STCLASS_AND;
3468 if (flags & SCF_WHILEM_VISITED_POS)
3469 f |= SCF_WHILEM_VISITED_POS;
3470 next = regnext(scan);
3471 nscan = NEXTOPER(NEXTOPER(scan));
3473 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3474 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3477 vFAIL("Variable length lookbehind not implemented");
3479 else if (*minnextp > (I32)U8_MAX) {
3480 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3482 scan->flags = (U8)*minnextp;
3487 if (f & SCF_DO_STCLASS_AND) {
3488 const int was = (data->start_class->flags & ANYOF_EOS);
3490 cl_and(data->start_class, &intrnl);
3492 data->start_class->flags |= ANYOF_EOS;
3495 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3497 if (data_fake.flags & SF_HAS_EVAL)
3498 data->flags |= SF_HAS_EVAL;
3499 data->whilem_c = data_fake.whilem_c;
3500 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3501 if (RExC_rx->minlen<*minnextp)
3502 RExC_rx->minlen=*minnextp;
3503 scan_commit(pRExC_state, &data_fake, minnextp);
3504 SvREFCNT_dec(data_fake.last_found);
3506 if ( data_fake.minlen_fixed != minlenp )
3508 data->offset_fixed= data_fake.offset_fixed;
3509 data->minlen_fixed= data_fake.minlen_fixed;
3510 data->lookbehind_fixed+= scan->flags;
3512 if ( data_fake.minlen_float != minlenp )
3514 data->minlen_float= data_fake.minlen_float;
3515 data->offset_float_min=data_fake.offset_float_min;
3516 data->offset_float_max=data_fake.offset_float_max;
3517 data->lookbehind_float+= scan->flags;
3526 else if (OP(scan) == OPEN) {
3527 if (stopparen != (I32)ARG(scan))
3530 else if (OP(scan) == CLOSE) {
3531 if (stopparen == (I32)ARG(scan)) {
3534 if ((I32)ARG(scan) == is_par) {
3535 next = regnext(scan);
3537 if ( next && (OP(next) != WHILEM) && next < last)
3538 is_par = 0; /* Disable optimization */
3541 *(data->last_closep) = ARG(scan);
3543 else if (OP(scan) == GOSUB || OP(scan) == GOSTART) {
3544 /* set the pointer */
3548 if (OP(scan) == GOSUB) {
3550 RExC_recurse[ARG2L(scan)] = scan;
3551 start = RExC_open_parens[paren-1];
3552 end = RExC_close_parens[paren-1];
3555 start = RExC_rx->program + 1;
3561 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3562 SAVEFREEPV(recursed);
3564 if (!PAREN_TEST(recursed,paren+1)) {
3566 PAREN_SET(recursed,paren+1);
3568 DEBUG_PEEP("goto",start,depth);
3581 if (deltanext == I32_MAX) {
3582 is_inf = is_inf_internal = 1;
3585 DEBUG_PEEP("rtrn",end,depth);
3586 PAREN_UNSET(recursed,paren+1);
3588 if (flags & SCF_DO_SUBSTR) {
3589 scan_commit(pRExC_state,data,minlenp);
3590 data->longest = &(data->longest_float);
3592 is_inf = is_inf_internal = 1;
3593 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3594 cl_anything(pRExC_state, data->start_class);
3595 flags &= ~SCF_DO_STCLASS;
3598 else if (OP(scan) == EVAL) {
3600 data->flags |= SF_HAS_EVAL;
3602 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3603 if (flags & SCF_DO_SUBSTR) {
3604 scan_commit(pRExC_state,data,minlenp);
3605 flags &= ~SCF_DO_SUBSTR;
3607 if (data && OP(scan)==ACCEPT) {
3608 data->flags |= SCF_SEEN_ACCEPT;
3613 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3615 if (flags & SCF_DO_SUBSTR) {
3616 scan_commit(pRExC_state,data,minlenp);
3617 data->longest = &(data->longest_float);
3619 is_inf = is_inf_internal = 1;
3620 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3621 cl_anything(pRExC_state, data->start_class);
3622 flags &= ~SCF_DO_STCLASS;
3624 #ifdef TRIE_STUDY_OPT
3625 #ifdef FULL_TRIE_STUDY
3626 else if (PL_regkind[OP(scan)] == TRIE) {
3627 /* NOTE - There is similar code to this block above for handling
3628 BRANCH nodes on the initial study. If you change stuff here
3630 regnode *trie_node= scan;
3631 regnode *tail= regnext(scan);
3632 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3633 I32 max1 = 0, min1 = I32_MAX;
3634 struct regnode_charclass_class accum;
3636 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
3637 scan_commit(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
3638 if (flags & SCF_DO_STCLASS)
3639 cl_init_zero(pRExC_state, &accum);
3645 const regnode *nextbranch= NULL;
3648 for ( word=1 ; word <= trie->wordcount ; word++)
3650 I32 deltanext=0, minnext=0, f = 0, fake;
3651 struct regnode_charclass_class this_class;
3653 data_fake.flags = 0;
3655 data_fake.whilem_c = data->whilem_c;
3656 data_fake.last_closep = data->last_closep;
3659 data_fake.last_closep = &fake;
3661 if (flags & SCF_DO_STCLASS) {
3662 cl_init(pRExC_state, &this_class);
3663 data_fake.start_class = &this_class;
3664 f = SCF_DO_STCLASS_AND;
3666 if (flags & SCF_WHILEM_VISITED_POS)
3667 f |= SCF_WHILEM_VISITED_POS;
3669 if (trie->jump[word]) {
3671 nextbranch = trie_node + trie->jump[0];
3672 scan= trie_node + trie->jump[word];
3673 /* We go from the jump point to the branch that follows
3674 it. Note this means we need the vestigal unused branches
3675 even though they arent otherwise used.
3677 minnext = study_chunk(pRExC_state, &scan, minlenp,
3678 &deltanext, (regnode *)nextbranch, &data_fake,
3679 stopparen, recursed, NULL, f,depth+1);
3681 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3682 nextbranch= regnext((regnode*)nextbranch);
3684 if (min1 > (I32)(minnext + trie->minlen))
3685 min1 = minnext + trie->minlen;
3686 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3687 max1 = minnext + deltanext + trie->maxlen;
3688 if (deltanext == I32_MAX)
3689 is_inf = is_inf_internal = 1;
3691 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3693 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3694 if ( stopmin > min + min1)
3695 stopmin = min + min1;
3696 flags &= ~SCF_DO_SUBSTR;
3698 data->flags |= SCF_SEEN_ACCEPT;
3701 if (data_fake.flags & SF_HAS_EVAL)
3702 data->flags |= SF_HAS_EVAL;
3703 data->whilem_c = data_fake.whilem_c;
3705 if (flags & SCF_DO_STCLASS)
3706 cl_or(pRExC_state, &accum, &this_class);
3709 if (flags & SCF_DO_SUBSTR) {
3710 data->pos_min += min1;
3711 data->pos_delta += max1 - min1;
3712 if (max1 != min1 || is_inf)
3713 data->longest = &(data->longest_float);
3716 delta += max1 - min1;
3717 if (flags & SCF_DO_STCLASS_OR) {
3718 cl_or(pRExC_state, data->start_class, &accum);
3720 cl_and(data->start_class, and_withp);
3721 flags &= ~SCF_DO_STCLASS;
3724 else if (flags & SCF_DO_STCLASS_AND) {
3726 cl_and(data->start_class, &accum);
3727 flags &= ~SCF_DO_STCLASS;
3730 /* Switch to OR mode: cache the old value of
3731 * data->start_class */
3733 StructCopy(data->start_class, and_withp,
3734 struct regnode_charclass_class);
3735 flags &= ~SCF_DO_STCLASS_AND;
3736 StructCopy(&accum, data->start_class,
3737 struct regnode_charclass_class);
3738 flags |= SCF_DO_STCLASS_OR;
3739 data->start_class->flags |= ANYOF_EOS;
3746 else if (PL_regkind[OP(scan)] == TRIE) {
3747 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3750 min += trie->minlen;
3751 delta += (trie->maxlen - trie->minlen);
3752 flags &= ~SCF_DO_STCLASS; /* xxx */
3753 if (flags & SCF_DO_SUBSTR) {
3754 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
3755 data->pos_min += trie->minlen;
3756 data->pos_delta += (trie->maxlen - trie->minlen);
3757 if (trie->maxlen != trie->minlen)
3758 data->longest = &(data->longest_float);
3760 if (trie->jump) /* no more substrings -- for now /grr*/
3761 flags &= ~SCF_DO_SUBSTR;
3763 #endif /* old or new */
3764 #endif /* TRIE_STUDY_OPT */
3765 /* Else: zero-length, ignore. */
3766 scan = regnext(scan);
3771 *deltap = is_inf_internal ? I32_MAX : delta;
3772 if (flags & SCF_DO_SUBSTR && is_inf)
3773 data->pos_delta = I32_MAX - data->pos_min;
3774 if (is_par > (I32)U8_MAX)
3776 if (is_par && pars==1 && data) {
3777 data->flags |= SF_IN_PAR;
3778 data->flags &= ~SF_HAS_PAR;
3780 else if (pars && data) {
3781 data->flags |= SF_HAS_PAR;
3782 data->flags &= ~SF_IN_PAR;
3784 if (flags & SCF_DO_STCLASS_OR)
3785 cl_and(data->start_class, and_withp);
3786 if (flags & SCF_TRIE_RESTUDY)
3787 data->flags |= SCF_TRIE_RESTUDY;
3789 DEBUG_STUDYDATA(data,depth);
3791 return min < stopmin ? min : stopmin;
3795 S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
3797 if (RExC_rx->data) {
3798 const U32 count = RExC_rx->data->count;
3799 Renewc(RExC_rx->data,
3800 sizeof(*RExC_rx->data) + sizeof(void*) * (count + n - 1),
3801 char, struct reg_data);
3802 Renew(RExC_rx->data->what, count + n, U8);
3803 RExC_rx->data->count += n;
3806 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
3807 char, struct reg_data);
3808 Newx(RExC_rx->data->what, n, U8);
3809 RExC_rx->data->count = n;
3811 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
3812 return RExC_rx->data->count - n;
3815 #ifndef PERL_IN_XSUB_RE
3817 Perl_reginitcolors(pTHX)
3820 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
3822 char *t = savepv(s);
3826 t = strchr(t, '\t');
3832 PL_colors[i] = t = (char *)"";
3837 PL_colors[i++] = (char *)"";
3844 #ifdef TRIE_STUDY_OPT
3845 #define CHECK_RESTUDY_GOTO \
3847 (data.flags & SCF_TRIE_RESTUDY) \
3851 #define CHECK_RESTUDY_GOTO
3855 - pregcomp - compile a regular expression into internal code
3857 * We can't allocate space until we know how big the compiled form will be,
3858 * but we can't compile it (and thus know how big it is) until we've got a
3859 * place to put the code. So we cheat: we compile it twice, once with code
3860 * generation turned off and size counting turned on, and once "for real".
3861 * This also means that we don't allocate space until we are sure that the
3862 * thing really will compile successfully, and we never have to move the
3863 * code and thus invalidate pointers into it. (Note that it has to be in
3864 * one piece because free() must be able to free it all.) [NB: not true in perl]
3866 * Beware that the optimization-preparation code in here knows about some
3867 * of the structure of the compiled regexp. [I'll say.]
3872 #ifndef PERL_IN_XSUB_RE
3873 #define RE_ENGINE_PTR &PL_core_reg_engine
3875 extern const struct regexp_engine my_reg_engine;
3876 #define RE_ENGINE_PTR &my_reg_engine
3878 /* these make a few things look better, to avoid indentation */
3879 #define BEGIN_BLOCK {
3883 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
3886 GET_RE_DEBUG_FLAGS_DECL;
3887 DEBUG_r(if (!PL_colorset) reginitcolors());
3888 #ifndef PERL_IN_XSUB_RE
3890 /* Dispatch a request to compile a regexp to correct
3892 HV * const table = GvHV(PL_hintgv);
3894 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
3895 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
3896 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
3898 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
3901 return CALLREGCOMP_ENG(eng, exp, xend, pm);
3915 RExC_state_t RExC_state;
3916 RExC_state_t * const pRExC_state = &RExC_state;
3917 #ifdef TRIE_STUDY_OPT
3919 RExC_state_t copyRExC_state;
3922 FAIL("NULL regexp argument");
3924 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
3928 SV *dsv= sv_newmortal();
3929 RE_PV_QUOTED_DECL(s, RExC_utf8,
3930 dsv, RExC_precomp, (xend - exp), 60);
3931 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
3932 PL_colors[4],PL_colors[5],s);
3934 RExC_flags = pm->op_pmflags;
3938 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
3939 RExC_seen_evals = 0;
3942 /* First pass: determine size, legality. */
3951 RExC_emit = &PL_regdummy;
3952 RExC_whilem_seen = 0;
3953 RExC_charnames = NULL;
3954 RExC_open_parens = NULL;
3955 RExC_close_parens = NULL;
3957 RExC_paren_names = NULL;
3958 RExC_recurse = NULL;
3959 RExC_recurse_count = 0;
3961 #if 0 /* REGC() is (currently) a NOP at the first pass.
3962 * Clever compilers notice this and complain. --jhi */
3963 REGC((U8)REG_MAGIC, (char*)RExC_emit);
3965 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
3966 if (reg(pRExC_state, 0, &flags,1) == NULL) {
3967 RExC_precomp = NULL;
3971 PerlIO_printf(Perl_debug_log,
3972 "Required size %"IVdf" nodes\n"
3973 "Starting second pass (creation)\n",
3976 RExC_lastparse=NULL;
3978 /* Small enough for pointer-storage convention?
3979 If extralen==0, this means that we will not need long jumps. */
3980 if (RExC_size >= 0x10000L && RExC_extralen)
3981 RExC_size += RExC_extralen;
3984 if (RExC_whilem_seen > 15)
3985 RExC_whilem_seen = 15;
3988 /* Make room for a sentinel value at the end of the program */
3992 /* Allocate space and zero-initialize. Note, the two step process
3993 of zeroing when in debug mode, thus anything assigned has to
3994 happen after that */
3995 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
3998 FAIL("Regexp out of space");
4000 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
4001 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
4003 /* initialization begins here */
4004 r->engine= RE_ENGINE_PTR;
4006 r->prelen = xend - exp;
4007 r->precomp = savepvn(RExC_precomp, r->prelen);
4009 #ifdef PERL_OLD_COPY_ON_WRITE
4010 r->saved_copy = NULL;
4012 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
4013 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4014 r->lastparen = 0; /* mg.c reads this. */
4016 r->substrs = 0; /* Useful during FAIL. */
4017 r->startp = 0; /* Useful during FAIL. */
4022 if (RExC_seen & REG_SEEN_RECURSE) {
4023 Newxz(RExC_open_parens, RExC_npar,regnode *);
4024 SAVEFREEPV(RExC_open_parens);
4025 Newxz(RExC_close_parens,RExC_npar,regnode *);
4026 SAVEFREEPV(RExC_close_parens);
4029 /* Useful during FAIL. */
4030 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4032 r->offsets[0] = RExC_size;
4034 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
4035 "%s %"UVuf" bytes for offset annotations.\n",
4036 r->offsets ? "Got" : "Couldn't get",
4037 (UV)((2*RExC_size+1) * sizeof(U32))));
4041 /* Second pass: emit code. */
4042 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
4048 RExC_emit_start = r->program;
4049 RExC_emit = r->program;
4051 /* put a sentinal on the end of the program so we can check for
4053 r->program[RExC_size].type = 255;
4055 /* Store the count of eval-groups for security checks: */
4056 RExC_emit->next_off = (RExC_seen_evals > (I32)U16_MAX) ? U16_MAX : (U16)RExC_seen_evals;
4057 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
4059 if (reg(pRExC_state, 0, &flags,1) == NULL)
4062 /* XXXX To minimize changes to RE engine we always allocate
4063 3-units-long substrs field. */
4064 Newx(r->substrs, 1, struct reg_substr_data);
4065 if (RExC_recurse_count) {
4066 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4067 SAVEFREEPV(RExC_recurse);
4071 r->minlen = minlen = sawplus = sawopen = 0;
4072 Zero(r->substrs, 1, struct reg_substr_data);
4074 #ifdef TRIE_STUDY_OPT
4077 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
4079 RExC_state = copyRExC_state;
4080 if (seen & REG_TOP_LEVEL_BRANCHES)
4081 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4083 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
4084 if (data.last_found) {
4085 SvREFCNT_dec(data.longest_fixed);
4086 SvREFCNT_dec(data.longest_float);
4087 SvREFCNT_dec(data.last_found);
4089 StructCopy(&zero_scan_data, &data, scan_data_t);
4091 StructCopy(&zero_scan_data, &data, scan_data_t);
4092 copyRExC_state = RExC_state;
4095 StructCopy(&zero_scan_data, &data, scan_data_t);
4098 /* Dig out information for optimizations. */
4099 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
4100 pm->op_pmflags = RExC_flags;
4102 r->reganch |= ROPT_UTF8; /* Unicode in it? */
4103 r->regstclass = NULL;
4104 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
4105 r->reganch |= ROPT_NAUGHTY;
4106 scan = r->program + 1; /* First BRANCH. */
4108 /* testing for BRANCH here tells us whether there is "must appear"
4109 data in the pattern. If there is then we can use it for optimisations */
4110 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
4112 STRLEN longest_float_length, longest_fixed_length;
4113 struct regnode_charclass_class ch_class; /* pointed to by data */
4115 I32 last_close = 0; /* pointed to by data */
4118 /* Skip introductions and multiplicators >= 1. */
4119 while ((OP(first) == OPEN && (sawopen = 1)) ||
4120 /* An OR of *one* alternative - should not happen now. */
4121 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
4122 /* for now we can't handle lookbehind IFMATCH*/
4123 (OP(first) == IFMATCH && !first->flags) ||
4124 (OP(first) == PLUS) ||
4125 (OP(first) == MINMOD) ||
4126 /* An {n,m} with n>0 */
4127 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) )
4130 if (OP(first) == PLUS)
4133 first += regarglen[OP(first)];
4134 if (OP(first) == IFMATCH) {
4135 first = NEXTOPER(first);
4136 first += EXTRA_STEP_2ARGS;
4137 } else /* XXX possible optimisation for /(?=)/ */
4138 first = NEXTOPER(first);
4141 /* Starting-point info. */
4143 DEBUG_PEEP("first:",first,0);
4144 /* Ignore EXACT as we deal with it later. */
4145 if (PL_regkind[OP(first)] == EXACT) {
4146 if (OP(first) == EXACT)
4147 NOOP; /* Empty, get anchored substr later. */
4148 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
4149 r->regstclass = first;
4152 else if (PL_regkind[OP(first)] == TRIE &&
4153 ((reg_trie_data *)r->data->data[ ARG(first) ])->minlen>0)
4156 /* this can happen only on restudy */
4157 if ( OP(first) == TRIE ) {
4158 struct regnode_1 *trieop;
4159 Newxz(trieop,1,struct regnode_1);
4160 StructCopy(first,trieop,struct regnode_1);
4161 trie_op=(regnode *)trieop;
4163 struct regnode_charclass *trieop;
4164 Newxz(trieop,1,struct regnode_charclass);
4165 StructCopy(first,trieop,struct regnode_charclass);
4166 trie_op=(regnode *)trieop;
4169 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4170 r->regstclass = trie_op;
4173 else if (strchr((const char*)PL_simple,OP(first)))
4174 r->regstclass = first;
4175 else if (PL_regkind[OP(first)] == BOUND ||
4176 PL_regkind[OP(first)] == NBOUND)
4177 r->regstclass = first;
4178 else if (PL_regkind[OP(first)] == BOL) {
4179 r->reganch |= (OP(first) == MBOL
4181 : (OP(first) == SBOL
4184 first = NEXTOPER(first);
4187 else if (OP(first) == GPOS) {
4188 r->reganch |= ROPT_ANCH_GPOS;
4189 first = NEXTOPER(first);
4192 else if (!sawopen && (OP(first) == STAR &&
4193 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
4194 !(r->reganch & ROPT_ANCH) )
4196 /* turn .* into ^.* with an implied $*=1 */
4198 (OP(NEXTOPER(first)) == REG_ANY)
4201 r->reganch |= type | ROPT_IMPLICIT;
4202 first = NEXTOPER(first);
4205 if (sawplus && (!sawopen || !RExC_sawback)
4206 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
4207 /* x+ must match at the 1st pos of run of x's */
4208 r->reganch |= ROPT_SKIP;
4210 /* Scan is after the zeroth branch, first is atomic matcher. */
4211 #ifdef TRIE_STUDY_OPT
4214 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4215 (IV)(first - scan + 1))
4219 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4220 (IV)(first - scan + 1))
4226 * If there's something expensive in the r.e., find the
4227 * longest literal string that must appear and make it the
4228 * regmust. Resolve ties in favor of later strings, since
4229 * the regstart check works with the beginning of the r.e.
4230 * and avoiding duplication strengthens checking. Not a
4231 * strong reason, but sufficient in the absence of others.
4232 * [Now we resolve ties in favor of the earlier string if
4233 * it happens that c_offset_min has been invalidated, since the
4234 * earlier string may buy us something the later one won't.]
4237 data.longest_fixed = newSVpvs("");
4238 data.longest_float = newSVpvs("");
4239 data.last_found = newSVpvs("");
4240 data.longest = &(data.longest_fixed);
4242 if (!r->regstclass) {
4243 cl_init(pRExC_state, &ch_class);
4244 data.start_class = &ch_class;
4245 stclass_flag = SCF_DO_STCLASS_AND;
4246 } else /* XXXX Check for BOUND? */
4248 data.last_closep = &last_close;
4250 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
4251 &data, -1, NULL, NULL,
4252 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
4258 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
4259 && data.last_start_min == 0 && data.last_end > 0
4260 && !RExC_seen_zerolen
4261 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
4262 r->reganch |= ROPT_CHECK_ALL;
4263 scan_commit(pRExC_state, &data,&minlen);
4264 SvREFCNT_dec(data.last_found);
4266 /* Note that code very similar to this but for anchored string
4267 follows immediately below, changes may need to be made to both.
4270 longest_float_length = CHR_SVLEN(data.longest_float);
4271 if (longest_float_length
4272 || (data.flags & SF_FL_BEFORE_EOL
4273 && (!(data.flags & SF_FL_BEFORE_MEOL)
4274 || (RExC_flags & PMf_MULTILINE))))
4278 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
4279 && data.offset_fixed == data.offset_float_min
4280 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4281 goto remove_float; /* As in (a)+. */
4283 /* copy the information about the longest float from the reg_scan_data
4284 over to the program. */
4285 if (SvUTF8(data.longest_float)) {
4286 r->float_utf8 = data.longest_float;
4287 r->float_substr = NULL;
4289 r->float_substr = data.longest_float;
4290 r->float_utf8 = NULL;
4292 /* float_end_shift is how many chars that must be matched that
4293 follow this item. We calculate it ahead of time as once the
4294 lookbehind offset is added in we lose the ability to correctly
4296 ml = data.minlen_float ? *(data.minlen_float)
4297 : (I32)longest_float_length;
4298 r->float_end_shift = ml - data.offset_float_min
4299 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4300 + data.lookbehind_float;
4301 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
4302 r->float_max_offset = data.offset_float_max;
4303 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
4304 r->float_max_offset -= data.lookbehind_float;
4306 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4307 && (!(data.flags & SF_FL_BEFORE_MEOL)
4308 || (RExC_flags & PMf_MULTILINE)));
4309 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
4313 r->float_substr = r->float_utf8 = NULL;
4314 SvREFCNT_dec(data.longest_float);
4315 longest_float_length = 0;
4318 /* Note that code very similar to this but for floating string
4319 is immediately above, changes may need to be made to both.
4322 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
4323 if (longest_fixed_length
4324 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4325 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4326 || (RExC_flags & PMf_MULTILINE))))
4330 /* copy the information about the longest fixed
4331 from the reg_scan_data over to the program. */
4332 if (SvUTF8(data.longest_fixed)) {
4333 r->anchored_utf8 = data.longest_fixed;
4334 r->anchored_substr = NULL;
4336 r->anchored_substr = data.longest_fixed;
4337 r->anchored_utf8 = NULL;
4339 /* fixed_end_shift is how many chars that must be matched that
4340 follow this item. We calculate it ahead of time as once the
4341 lookbehind offset is added in we lose the ability to correctly
4343 ml = data.minlen_fixed ? *(data.minlen_fixed)
4344 : (I32)longest_fixed_length;
4345 r->anchored_end_shift = ml - data.offset_fixed
4346 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4347 + data.lookbehind_fixed;
4348 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4350 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4351 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4352 || (RExC_flags & PMf_MULTILINE)));
4353 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
4356 r->anchored_substr = r->anchored_utf8 = NULL;
4357 SvREFCNT_dec(data.longest_fixed);
4358 longest_fixed_length = 0;
4361 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
4362 r->regstclass = NULL;
4363 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4365 && !(data.start_class->flags & ANYOF_EOS)
4366 && !cl_is_anything(data.start_class))
4368 const I32 n = add_data(pRExC_state, 1, "f");
4370 Newx(RExC_rx->data->data[n], 1,
4371 struct regnode_charclass_class);
4372 StructCopy(data.start_class,
4373 (struct regnode_charclass_class*)RExC_rx->data->data[n],
4374 struct regnode_charclass_class);
4375 r->regstclass = (regnode*)RExC_rx->data->data[n];
4376 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
4377 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
4378 regprop(r, sv, (regnode*)data.start_class);
4379 PerlIO_printf(Perl_debug_log,
4380 "synthetic stclass \"%s\".\n",
4381 SvPVX_const(sv));});
4384 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
4385 if (longest_fixed_length > longest_float_length) {
4386 r->check_end_shift = r->anchored_end_shift;
4387 r->check_substr = r->anchored_substr;
4388 r->check_utf8 = r->anchored_utf8;
4389 r->check_offset_min = r->check_offset_max = r->anchored_offset;
4390 if (r->reganch & ROPT_ANCH_SINGLE)
4391 r->reganch |= ROPT_NOSCAN;
4394 r->check_end_shift = r->float_end_shift;
4395 r->check_substr = r->float_substr;
4396 r->check_utf8 = r->float_utf8;
4397 r->check_offset_min = r->float_min_offset;
4398 r->check_offset_max = r->float_max_offset;
4400 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4401 This should be changed ASAP! */
4402 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
4403 r->reganch |= RE_USE_INTUIT;
4404 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
4405 r->reganch |= RE_INTUIT_TAIL;
4407 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4408 if ( (STRLEN)minlen < longest_float_length )
4409 minlen= longest_float_length;
4410 if ( (STRLEN)minlen < longest_fixed_length )
4411 minlen= longest_fixed_length;
4415 /* Several toplevels. Best we can is to set minlen. */
4417 struct regnode_charclass_class ch_class;
4420 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
4422 scan = r->program + 1;
4423 cl_init(pRExC_state, &ch_class);
4424 data.start_class = &ch_class;
4425 data.last_closep = &last_close;
4428 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
4429 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
4433 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
4434 = r->float_substr = r->float_utf8 = NULL;
4435 if (!(data.start_class->flags & ANYOF_EOS)
4436 && !cl_is_anything(data.start_class))
4438 const I32 n = add_data(pRExC_state, 1, "f");
4440 Newx(RExC_rx->data->data[n], 1,
4441 struct regnode_charclass_class);
4442 StructCopy(data.start_class,
4443 (struct regnode_charclass_class*)RExC_rx->data->data[n],
4444 struct regnode_charclass_class);
4445 r->regstclass = (regnode*)RExC_rx->data->data[n];
4446 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
4447 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
4448 regprop(r, sv, (regnode*)data.start_class);
4449 PerlIO_printf(Perl_debug_log,
4450 "synthetic stclass \"%s\".\n",
4451 SvPVX_const(sv));});
4455 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4456 the "real" pattern. */
4458 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
4461 r->minlenret = minlen;
4462 if (r->minlen < minlen)
4465 if (RExC_seen & REG_SEEN_GPOS)
4466 r->reganch |= ROPT_GPOS_SEEN;
4467 if (RExC_seen & REG_SEEN_LOOKBEHIND)
4468 r->reganch |= ROPT_LOOKBEHIND_SEEN;
4469 if (RExC_seen & REG_SEEN_EVAL)
4470 r->reganch |= ROPT_EVAL_SEEN;
4471 if (RExC_seen & REG_SEEN_CANY)
4472 r->reganch |= ROPT_CANY_SEEN;
4473 if (RExC_seen & REG_SEEN_VERBARG)
4474 r->reganch |= ROPT_VERBARG_SEEN;
4475 if (RExC_seen & REG_SEEN_CUTGROUP)
4476 r->reganch |= ROPT_CUTGROUP_SEEN;
4477 if (RExC_paren_names)
4478 r->paren_names = (HV*)SvREFCNT_inc(RExC_paren_names);
4480 r->paren_names = NULL;
4482 if (RExC_recurse_count) {
4483 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
4484 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
4485 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
4488 Newxz(r->startp, RExC_npar, I32);
4489 Newxz(r->endp, RExC_npar, I32);
4490 /* assume we don't need to swap parens around before we match */
4493 PerlIO_printf(Perl_debug_log,"Final program:\n");
4496 DEBUG_OFFSETS_r(if (r->offsets) {
4497 const U32 len = r->offsets[0];
4499 GET_RE_DEBUG_FLAGS_DECL;
4500 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
4501 for (i = 1; i <= len; i++) {
4502 if (r->offsets[i*2-1] || r->offsets[i*2])
4503 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
4504 (UV)i, (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
4506 PerlIO_printf(Perl_debug_log, "\n");
4512 #undef CORE_ONLY_BLOCK
4514 #undef RE_ENGINE_PTR
4516 #ifndef PERL_IN_XSUB_RE
4518 Perl_reg_named_buff_sv(pTHX_ SV* namesv)
4520 I32 parno = 0; /* no match */
4522 const REGEXP * const rx = PM_GETRE(PL_curpm);
4523 if (rx && rx->paren_names) {
4524 HE *he_str = hv_fetch_ent( rx->paren_names, namesv, 0, 0 );
4527 SV* sv_dat=HeVAL(he_str);
4528 I32 *nums=(I32*)SvPVX(sv_dat);
4529 for ( i=0; i<SvIVX(sv_dat); i++ ) {
4530 if ((I32)(rx->lastparen) >= nums[i] &&
4531 rx->endp[nums[i]] != -1)
4544 SV *sv= sv_newmortal();
4545 Perl_sv_setpvf(aTHX_ sv, "%"IVdf,(IV)parno);
4546 gv_paren= Perl_gv_fetchsv(aTHX_ sv, GV_ADD, SVt_PVGV);
4547 return GvSVn(gv_paren);
4552 /* Scans the name of a named buffer from the pattern.
4553 * If flags is REG_RSN_RETURN_NULL returns null.
4554 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
4555 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
4556 * to the parsed name as looked up in the RExC_paren_names hash.
4557 * If there is an error throws a vFAIL().. type exception.
4560 #define REG_RSN_RETURN_NULL 0
4561 #define REG_RSN_RETURN_NAME 1
4562 #define REG_RSN_RETURN_DATA 2
4565 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) {
4566 char *name_start = RExC_parse;
4569 while( isIDFIRST_uni(utf8n_to_uvchr((U8*)RExC_parse,
4570 RExC_end - RExC_parse, &numlen, UTF8_ALLOW_DEFAULT)))
4572 RExC_parse += numlen;
4575 while( isIDFIRST(*RExC_parse) )
4579 SV* sv_name = sv_2mortal(Perl_newSVpvn(aTHX_ name_start,
4580 (int)(RExC_parse - name_start)));
4583 if ( flags == REG_RSN_RETURN_NAME)
4585 else if (flags==REG_RSN_RETURN_DATA) {
4588 if ( ! sv_name ) /* should not happen*/
4589 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
4590 if (RExC_paren_names)
4591 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
4593 sv_dat = HeVAL(he_str);
4595 vFAIL("Reference to nonexistent named group");
4599 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
4606 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
4607 int rem=(int)(RExC_end - RExC_parse); \
4616 if (RExC_lastparse!=RExC_parse) \
4617 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
4620 iscut ? "..." : "<" \
4623 PerlIO_printf(Perl_debug_log,"%16s",""); \
4628 num=REG_NODE_NUM(RExC_emit); \
4629 if (RExC_lastnum!=num) \
4630 PerlIO_printf(Perl_debug_log,"|%4d",num); \
4632 PerlIO_printf(Perl_debug_log,"|%4s",""); \
4633 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
4634 (int)((depth*2)), "", \
4638 RExC_lastparse=RExC_parse; \
4643 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
4644 DEBUG_PARSE_MSG((funcname)); \
4645 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
4647 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
4648 DEBUG_PARSE_MSG((funcname)); \
4649 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
4652 - reg - regular expression, i.e. main body or parenthesized thing
4654 * Caller must absorb opening parenthesis.
4656 * Combining parenthesis handling with the base level of regular expression
4657 * is a trifle forced, but the need to tie the tails of the branches to what
4658 * follows makes it hard to avoid.
4660 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
4662 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
4664 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
4667 /* this idea is borrowed from STR_WITH_LEN in handy.h */
4668 #define CHECK_WORD(s,v,l) \
4669 (((sizeof(s)-1)==(l)) && (strnEQ(start_verb, (s ""), (sizeof(s)-1))))
4672 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
4673 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
4676 register regnode *ret; /* Will be the head of the group. */
4677 register regnode *br;
4678 register regnode *lastbr;
4679 register regnode *ender = NULL;
4680 register I32 parno = 0;
4682 const I32 oregflags = RExC_flags;
4683 bool have_branch = 0;
4686 /* for (?g), (?gc), and (?o) warnings; warning
4687 about (?c) will warn about (?g) -- japhy */
4689 #define WASTED_O 0x01
4690 #define WASTED_G 0x02
4691 #define WASTED_C 0x04
4692 #define WASTED_GC (0x02|0x04)
4693 I32 wastedflags = 0x00;
4695 char * parse_start = RExC_parse; /* MJD */
4696 char * const oregcomp_parse = RExC_parse;
4698 GET_RE_DEBUG_FLAGS_DECL;
4699 DEBUG_PARSE("reg ");
4702 *flagp = 0; /* Tentatively. */
4705 /* Make an OPEN node, if parenthesized. */
4707 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
4708 char *start_verb = RExC_parse;
4709 STRLEN verb_len = 0;
4710 char *start_arg = NULL;
4711 unsigned char op = 0;
4713 int internal_argval = 0; /* internal_argval is only useful if !argok */
4714 while ( *RExC_parse && *RExC_parse != ')' ) {
4715 if ( *RExC_parse == ':' ) {
4716 start_arg = RExC_parse + 1;
4722 verb_len = RExC_parse - start_verb;
4725 while ( *RExC_parse && *RExC_parse != ')' )
4727 if ( *RExC_parse != ')' )
4728 vFAIL("Unterminated verb pattern argument");
4729 if ( RExC_parse == start_arg )
4732 if ( *RExC_parse != ')' )
4733 vFAIL("Unterminated verb pattern");
4736 switch ( *start_verb ) {
4737 case 'A': /* (*ACCEPT) */
4738 if ( CHECK_WORD("ACCEPT",start_verb,verb_len) ) {
4740 internal_argval = RExC_nestroot;
4743 case 'C': /* (*COMMIT) */
4744 if ( CHECK_WORD("COMMIT",start_verb,verb_len) )
4747 case 'F': /* (*FAIL) */
4748 if ( verb_len==1 || CHECK_WORD("FAIL",start_verb,verb_len) ) {
4753 case ':': /* (*:NAME) */
4754 case 'M': /* (*MARK:NAME) */
4755 if ( verb_len==0 || CHECK_WORD("MARK",start_verb,verb_len) ) {
4760 case 'P': /* (*PRUNE) */
4761 if ( CHECK_WORD("PRUNE",start_verb,verb_len) )
4764 case 'S': /* (*SKIP) */
4765 if ( CHECK_WORD("SKIP",start_verb,verb_len) )
4768 case 'T': /* (*THEN) */
4769 /* [19:06] <TimToady> :: is then */
4770 if ( CHECK_WORD("THEN",start_verb,verb_len) ) {
4772 RExC_seen |= REG_SEEN_CUTGROUP;
4778 vFAIL3("Unknown verb pattern '%.*s'",
4779 verb_len, start_verb);
4782 if ( start_arg && internal_argval ) {
4783 vFAIL3("Verb pattern '%.*s' may not have an argument",
4784 verb_len, start_verb);
4785 } else if ( argok < 0 && !start_arg ) {
4786 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
4787 verb_len, start_verb);
4789 ret = reganode(pRExC_state, op, internal_argval);
4790 if ( ! internal_argval && ! SIZE_ONLY ) {
4792 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
4793 ARG(ret) = add_data( pRExC_state, 1, "S" );
4794 RExC_rx->data->data[ARG(ret)]=(void*)sv;
4801 if (!internal_argval)
4802 RExC_seen |= REG_SEEN_VERBARG;
4803 } else if ( start_arg ) {
4804 vFAIL3("Verb pattern '%.*s' may not have an argument",
4805 verb_len, start_verb);
4807 ret = reg_node(pRExC_state, op);
4809 nextchar(pRExC_state);
4812 if (*RExC_parse == '?') { /* (?...) */
4813 U32 posflags = 0, negflags = 0;
4814 U32 *flagsp = &posflags;
4815 bool is_logical = 0;
4816 const char * const seqstart = RExC_parse;
4819 paren = *RExC_parse++;
4820 ret = NULL; /* For look-ahead/behind. */
4823 case '<': /* (?<...) */
4824 if (*RExC_parse == '!')
4826 else if (*RExC_parse != '=')
4831 case '\'': /* (?'...') */
4832 name_start= RExC_parse;
4833 svname = reg_scan_name(pRExC_state,
4834 SIZE_ONLY ? /* reverse test from the others */
4835 REG_RSN_RETURN_NAME :
4836 REG_RSN_RETURN_NULL);
4837 if (RExC_parse == name_start)
4839 if (*RExC_parse != paren)
4840 vFAIL2("Sequence (?%c... not terminated",
4841 paren=='>' ? '<' : paren);
4845 if (!svname) /* shouldnt happen */
4847 "panic: reg_scan_name returned NULL");
4848 if (!RExC_paren_names) {
4849 RExC_paren_names= newHV();
4850 sv_2mortal((SV*)RExC_paren_names);
4852 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
4854 sv_dat = HeVAL(he_str);
4856 /* croak baby croak */
4858 "panic: paren_name hash element allocation failed");
4859 } else if ( SvPOK(sv_dat) ) {
4860 IV count=SvIV(sv_dat);
4861 I32 *pv=(I32*)SvGROW(sv_dat,SvCUR(sv_dat)+sizeof(I32)+1);
4862 SvCUR_set(sv_dat,SvCUR(sv_dat)+sizeof(I32));
4863 pv[count]=RExC_npar;
4866 (void)SvUPGRADE(sv_dat,SVt_PVNV);
4867 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
4872 /*sv_dump(sv_dat);*/
4874 nextchar(pRExC_state);
4876 goto capturing_parens;
4878 RExC_seen |= REG_SEEN_LOOKBEHIND;
4880 case '=': /* (?=...) */
4881 case '!': /* (?!...) */
4882 RExC_seen_zerolen++;
4883 if (*RExC_parse == ')') {
4884 ret=reg_node(pRExC_state, OPFAIL);
4885 nextchar(pRExC_state);
4888 case ':': /* (?:...) */
4889 case '>': /* (?>...) */
4891 case '$': /* (?$...) */
4892 case '@': /* (?@...) */
4893 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
4895 case '#': /* (?#...) */
4896 while (*RExC_parse && *RExC_parse != ')')
4898 if (*RExC_parse != ')')
4899 FAIL("Sequence (?#... not terminated");
4900 nextchar(pRExC_state);
4903 case '0' : /* (?0) */
4904 case 'R' : /* (?R) */
4905 if (*RExC_parse != ')')
4906 FAIL("Sequence (?R) not terminated");
4907 ret = reg_node(pRExC_state, GOSTART);
4908 nextchar(pRExC_state);
4911 { /* named and numeric backreferences */
4914 case '&': /* (?&NAME) */
4915 parse_start = RExC_parse - 1;
4917 SV *sv_dat = reg_scan_name(pRExC_state,
4918 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
4919 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
4921 goto gen_recurse_regop;
4924 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
4926 vFAIL("Illegal pattern");
4928 goto parse_recursion;
4930 case '-': /* (?-1) */
4931 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
4932 RExC_parse--; /* rewind to let it be handled later */
4936 case '1': case '2': case '3': case '4': /* (?1) */
4937 case '5': case '6': case '7': case '8': case '9':
4940 num = atoi(RExC_parse);
4941 parse_start = RExC_parse - 1; /* MJD */
4942 if (*RExC_parse == '-')
4944 while (isDIGIT(*RExC_parse))
4946 if (*RExC_parse!=')')
4947 vFAIL("Expecting close bracket");
4950 if ( paren == '-' ) {
4952 Diagram of capture buffer numbering.
4953 Top line is the normal capture buffer numbers
4954 Botton line is the negative indexing as from
4958 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
4962 num = RExC_npar + num;
4965 vFAIL("Reference to nonexistent group");
4967 } else if ( paren == '+' ) {
4968 num = RExC_npar + num - 1;
4971 ret = reganode(pRExC_state, GOSUB, num);
4973 if (num > (I32)RExC_rx->nparens) {
4975 vFAIL("Reference to nonexistent group");
4977 ARG2L_SET( ret, RExC_recurse_count++);
4979 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
4980 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
4984 RExC_seen |= REG_SEEN_RECURSE;
4985 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
4986 Set_Node_Offset(ret, parse_start); /* MJD */
4988 nextchar(pRExC_state);
4990 } /* named and numeric backreferences */
4993 case 'p': /* (?p...) */
4994 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
4995 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
4997 case '?': /* (??...) */
4999 if (*RExC_parse != '{')
5001 paren = *RExC_parse++;
5003 case '{': /* (?{...}) */
5005 I32 count = 1, n = 0;
5007 char *s = RExC_parse;
5009 RExC_seen_zerolen++;
5010 RExC_seen |= REG_SEEN_EVAL;
5011 while (count && (c = *RExC_parse)) {
5022 if (*RExC_parse != ')') {
5024 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
5028 OP_4tree *sop, *rop;
5029 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
5032 Perl_save_re_context(aTHX);
5033 rop = sv_compile_2op(sv, &sop, "re", &pad);
5034 sop->op_private |= OPpREFCOUNTED;
5035 /* re_dup will OpREFCNT_inc */
5036 OpREFCNT_set(sop, 1);
5039 n = add_data(pRExC_state, 3, "nop");
5040 RExC_rx->data->data[n] = (void*)rop;
5041 RExC_rx->data->data[n+1] = (void*)sop;
5042 RExC_rx->data->data[n+2] = (void*)pad;
5045 else { /* First pass */
5046 if (PL_reginterp_cnt < ++RExC_seen_evals
5048 /* No compiled RE interpolated, has runtime
5049 components ===> unsafe. */
5050 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5051 if (PL_tainting && PL_tainted)
5052 FAIL("Eval-group in insecure regular expression");
5053 #if PERL_VERSION > 8
5054 if (IN_PERL_COMPILETIME)
5059 nextchar(pRExC_state);
5061 ret = reg_node(pRExC_state, LOGICAL);
5064 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
5065 /* deal with the length of this later - MJD */
5068 ret = reganode(pRExC_state, EVAL, n);
5069 Set_Node_Length(ret, RExC_parse - parse_start + 1);
5070 Set_Node_Offset(ret, parse_start);
5073 case '(': /* (?(?{...})...) and (?(?=...)...) */
5076 if (RExC_parse[0] == '?') { /* (?(?...)) */
5077 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
5078 || RExC_parse[1] == '<'
5079 || RExC_parse[1] == '{') { /* Lookahead or eval. */
5082 ret = reg_node(pRExC_state, LOGICAL);
5085 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
5089 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
5090 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
5092 char ch = RExC_parse[0] == '<' ? '>' : '\'';
5093 char *name_start= RExC_parse++;
5095 SV *sv_dat=reg_scan_name(pRExC_state,
5096 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5097 if (RExC_parse == name_start || *RExC_parse != ch)
5098 vFAIL2("Sequence (?(%c... not terminated",
5099 (ch == '>' ? '<' : ch));
5102 num = add_data( pRExC_state, 1, "S" );
5103 RExC_rx->data->data[num]=(void*)sv_dat;
5104 SvREFCNT_inc(sv_dat);
5106 ret = reganode(pRExC_state,NGROUPP,num);
5107 goto insert_if_check_paren;
5109 else if (RExC_parse[0] == 'D' &&
5110 RExC_parse[1] == 'E' &&
5111 RExC_parse[2] == 'F' &&
5112 RExC_parse[3] == 'I' &&
5113 RExC_parse[4] == 'N' &&
5114 RExC_parse[5] == 'E')
5116 ret = reganode(pRExC_state,DEFINEP,0);
5119 goto insert_if_check_paren;
5121 else if (RExC_parse[0] == 'R') {
5124 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5125 parno = atoi(RExC_parse++);
5126 while (isDIGIT(*RExC_parse))
5128 } else if (RExC_parse[0] == '&') {
5131 sv_dat = reg_scan_name(pRExC_state,
5132 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5133 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5135 ret = reganode(pRExC_state,INSUBP,parno);
5136 goto insert_if_check_paren;
5138 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5141 parno = atoi(RExC_parse++);
5143 while (isDIGIT(*RExC_parse))
5145 ret = reganode(pRExC_state, GROUPP, parno);
5147 insert_if_check_paren:
5148 if ((c = *nextchar(pRExC_state)) != ')')
5149 vFAIL("Switch condition not recognized");
5151 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
5152 br = regbranch(pRExC_state, &flags, 1,depth+1);
5154 br = reganode(pRExC_state, LONGJMP, 0);
5156 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
5157 c = *nextchar(pRExC_state);
5162 vFAIL("(?(DEFINE)....) does not allow branches");
5163 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
5164 regbranch(pRExC_state, &flags, 1,depth+1);
5165 REGTAIL(pRExC_state, ret, lastbr);
5168 c = *nextchar(pRExC_state);
5173 vFAIL("Switch (?(condition)... contains too many branches");
5174 ender = reg_node(pRExC_state, TAIL);
5175 REGTAIL(pRExC_state, br, ender);
5177 REGTAIL(pRExC_state, lastbr, ender);
5178 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
5181 REGTAIL(pRExC_state, ret, ender);
5185 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
5189 RExC_parse--; /* for vFAIL to print correctly */
5190 vFAIL("Sequence (? incomplete");
5194 parse_flags: /* (?i) */
5195 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
5196 /* (?g), (?gc) and (?o) are useless here
5197 and must be globally applied -- japhy */
5199 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
5200 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5201 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
5202 if (! (wastedflags & wflagbit) ) {
5203 wastedflags |= wflagbit;
5206 "Useless (%s%c) - %suse /%c modifier",
5207 flagsp == &negflags ? "?-" : "?",
5209 flagsp == &negflags ? "don't " : "",
5215 else if (*RExC_parse == 'c') {
5216 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5217 if (! (wastedflags & WASTED_C) ) {
5218 wastedflags |= WASTED_GC;
5221 "Useless (%sc) - %suse /gc modifier",
5222 flagsp == &negflags ? "?-" : "?",
5223 flagsp == &negflags ? "don't " : ""
5228 else { pmflag(flagsp, *RExC_parse); }
5232 if (*RExC_parse == '-') {
5234 wastedflags = 0; /* reset so (?g-c) warns twice */
5238 RExC_flags |= posflags;
5239 RExC_flags &= ~negflags;
5240 if (*RExC_parse == ':') {
5246 if (*RExC_parse != ')') {
5248 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5250 nextchar(pRExC_state);
5260 ret = reganode(pRExC_state, OPEN, parno);
5263 RExC_nestroot = parno;
5264 if (RExC_seen & REG_SEEN_RECURSE) {
5265 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5266 "Setting open paren #%"IVdf" to %d\n",
5267 (IV)parno, REG_NODE_NUM(ret)));
5268 RExC_open_parens[parno-1]= ret;
5271 Set_Node_Length(ret, 1); /* MJD */
5272 Set_Node_Offset(ret, RExC_parse); /* MJD */
5279 /* Pick up the branches, linking them together. */
5280 parse_start = RExC_parse; /* MJD */
5281 br = regbranch(pRExC_state, &flags, 1,depth+1);
5282 /* branch_len = (paren != 0); */
5286 if (*RExC_parse == '|') {
5287 if (!SIZE_ONLY && RExC_extralen) {
5288 reginsert(pRExC_state, BRANCHJ, br, depth+1);
5291 reginsert(pRExC_state, BRANCH, br, depth+1);
5292 Set_Node_Length(br, paren != 0);
5293 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
5297 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
5299 else if (paren == ':') {
5300 *flagp |= flags&SIMPLE;
5302 if (is_open) { /* Starts with OPEN. */
5303 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
5305 else if (paren != '?') /* Not Conditional */
5307 *flagp |= flags & (SPSTART | HASWIDTH);
5309 while (*RExC_parse == '|') {
5310 if (!SIZE_ONLY && RExC_extralen) {
5311 ender = reganode(pRExC_state, LONGJMP,0);
5312 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
5315 RExC_extralen += 2; /* Account for LONGJMP. */
5316 nextchar(pRExC_state);
5317 br = regbranch(pRExC_state, &flags, 0, depth+1);
5321 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
5325 *flagp |= flags&SPSTART;
5328 if (have_branch || paren != ':') {
5329 /* Make a closing node, and hook it on the end. */
5332 ender = reg_node(pRExC_state, TAIL);
5336 ender = reganode(pRExC_state, CLOSE, parno);
5337 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
5338 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5339 "Setting close paren #%"IVdf" to %d\n",
5340 (IV)parno, REG_NODE_NUM(ender)));
5341 RExC_close_parens[parno-1]= ender;
5342 if (RExC_nestroot == parno)
5345 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
5346 Set_Node_Length(ender,1); /* MJD */
5352 *flagp &= ~HASWIDTH;
5355 ender = reg_node(pRExC_state, SUCCEED);
5358 ender = reg_node(pRExC_state, END);
5360 assert(!RExC_opend); /* there can only be one! */
5365 REGTAIL(pRExC_state, lastbr, ender);
5367 if (have_branch && !SIZE_ONLY) {
5369 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
5371 /* Hook the tails of the branches to the closing node. */
5372 for (br = ret; br; br = regnext(br)) {
5373 const U8 op = PL_regkind[OP(br)];
5375 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
5377 else if (op == BRANCHJ) {
5378 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
5386 static const char parens[] = "=!<,>";
5388 if (paren && (p = strchr(parens, paren))) {
5389 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
5390 int flag = (p - parens) > 1;
5393 node = SUSPEND, flag = 0;
5394 reginsert(pRExC_state, node,ret, depth+1);
5395 Set_Node_Cur_Length(ret);
5396 Set_Node_Offset(ret, parse_start + 1);
5398 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
5402 /* Check for proper termination. */
5404 RExC_flags = oregflags;
5405 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
5406 RExC_parse = oregcomp_parse;
5407 vFAIL("Unmatched (");
5410 else if (!paren && RExC_parse < RExC_end) {
5411 if (*RExC_parse == ')') {
5413 vFAIL("Unmatched )");
5416 FAIL("Junk on end of regexp"); /* "Can't happen". */
5424 - regbranch - one alternative of an | operator
5426 * Implements the concatenation operator.
5429 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
5432 register regnode *ret;
5433 register regnode *chain = NULL;
5434 register regnode *latest;
5435 I32 flags = 0, c = 0;
5436 GET_RE_DEBUG_FLAGS_DECL;
5437 DEBUG_PARSE("brnc");
5441 if (!SIZE_ONLY && RExC_extralen)
5442 ret = reganode(pRExC_state, BRANCHJ,0);
5444 ret = reg_node(pRExC_state, BRANCH);
5445 Set_Node_Length(ret, 1);
5449 if (!first && SIZE_ONLY)
5450 RExC_extralen += 1; /* BRANCHJ */
5452 *flagp = WORST; /* Tentatively. */
5455 nextchar(pRExC_state);
5456 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
5458 latest = regpiece(pRExC_state, &flags,depth+1);
5459 if (latest == NULL) {
5460 if (flags & TRYAGAIN)
5464 else if (ret == NULL)
5466 *flagp |= flags&HASWIDTH;
5467 if (chain == NULL) /* First piece. */
5468 *flagp |= flags&SPSTART;
5471 REGTAIL(pRExC_state, chain, latest);
5476 if (chain == NULL) { /* Loop ran zero times. */
5477 chain = reg_node(pRExC_state, NOTHING);
5482 *flagp |= flags&SIMPLE;
5489 - regpiece - something followed by possible [*+?]
5491 * Note that the branching code sequences used for ? and the general cases
5492 * of * and + are somewhat optimized: they use the same NOTHING node as
5493 * both the endmarker for their branch list and the body of the last branch.
5494 * It might seem that this node could be dispensed with entirely, but the
5495 * endmarker role is not redundant.
5498 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
5501 register regnode *ret;
5503 register char *next;
5505 const char * const origparse = RExC_parse;
5507 I32 max = REG_INFTY;
5509 const char *maxpos = NULL;
5510 GET_RE_DEBUG_FLAGS_DECL;
5511 DEBUG_PARSE("piec");
5513 ret = regatom(pRExC_state, &flags,depth+1);
5515 if (flags & TRYAGAIN)
5522 if (op == '{' && regcurly(RExC_parse)) {
5524 parse_start = RExC_parse; /* MJD */
5525 next = RExC_parse + 1;
5526 while (isDIGIT(*next) || *next == ',') {
5535 if (*next == '}') { /* got one */
5539 min = atoi(RExC_parse);
5543 maxpos = RExC_parse;
5545 if (!max && *maxpos != '0')
5546 max = REG_INFTY; /* meaning "infinity" */
5547 else if (max >= REG_INFTY)
5548 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
5550 nextchar(pRExC_state);
5553 if ((flags&SIMPLE)) {
5554 RExC_naughty += 2 + RExC_naughty / 2;
5555 reginsert(pRExC_state, CURLY, ret, depth+1);
5556 Set_Node_Offset(ret, parse_start+1); /* MJD */
5557 Set_Node_Cur_Length(ret);
5560 regnode * const w = reg_node(pRExC_state, WHILEM);
5563 REGTAIL(pRExC_state, ret, w);
5564 if (!SIZE_ONLY && RExC_extralen) {
5565 reginsert(pRExC_state, LONGJMP,ret, depth+1);
5566 reginsert(pRExC_state, NOTHING,ret, depth+1);
5567 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
5569 reginsert(pRExC_state, CURLYX,ret, depth+1);
5571 Set_Node_Offset(ret, parse_start+1);
5572 Set_Node_Length(ret,
5573 op == '{' ? (RExC_parse - parse_start) : 1);
5575 if (!SIZE_ONLY && RExC_extralen)
5576 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
5577 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
5579 RExC_whilem_seen++, RExC_extralen += 3;
5580 RExC_naughty += 4 + RExC_naughty; /* compound interest */
5588 if (max && max < min)
5589 vFAIL("Can't do {n,m} with n > m");
5591 ARG1_SET(ret, (U16)min);
5592 ARG2_SET(ret, (U16)max);
5604 #if 0 /* Now runtime fix should be reliable. */
5606 /* if this is reinstated, don't forget to put this back into perldiag:
5608 =item Regexp *+ operand could be empty at {#} in regex m/%s/
5610 (F) The part of the regexp subject to either the * or + quantifier
5611 could match an empty string. The {#} shows in the regular
5612 expression about where the problem was discovered.
5616 if (!(flags&HASWIDTH) && op != '?')
5617 vFAIL("Regexp *+ operand could be empty");
5620 parse_start = RExC_parse;
5621 nextchar(pRExC_state);
5623 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
5625 if (op == '*' && (flags&SIMPLE)) {
5626 reginsert(pRExC_state, STAR, ret, depth+1);
5630 else if (op == '*') {
5634 else if (op == '+' && (flags&SIMPLE)) {
5635 reginsert(pRExC_state, PLUS, ret, depth+1);
5639 else if (op == '+') {
5643 else if (op == '?') {
5648 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
5650 "%.*s matches null string many times",
5651 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
5655 if (RExC_parse < RExC_end && *RExC_parse == '?') {
5656 nextchar(pRExC_state);
5657 reginsert(pRExC_state, MINMOD, ret, depth+1);
5658 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
5660 #ifndef REG_ALLOW_MINMOD_SUSPEND
5663 if (RExC_parse < RExC_end && *RExC_parse == '+') {
5665 nextchar(pRExC_state);
5666 ender = reg_node(pRExC_state, SUCCEED);
5667 REGTAIL(pRExC_state, ret, ender);
5668 reginsert(pRExC_state, SUSPEND, ret, depth+1);
5670 ender = reg_node(pRExC_state, TAIL);
5671 REGTAIL(pRExC_state, ret, ender);
5675 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
5677 vFAIL("Nested quantifiers");
5684 /* reg_namedseq(pRExC_state,UVp)
5686 This is expected to be called by a parser routine that has
5687 recognized'\N' and needs to handle the rest. RExC_parse is
5688 expected to point at the first char following the N at the time
5691 If valuep is non-null then it is assumed that we are parsing inside
5692 of a charclass definition and the first codepoint in the resolved
5693 string is returned via *valuep and the routine will return NULL.
5694 In this mode if a multichar string is returned from the charnames
5695 handler a warning will be issued, and only the first char in the
5696 sequence will be examined. If the string returned is zero length
5697 then the value of *valuep is undefined and NON-NULL will
5698 be returned to indicate failure. (This will NOT be a valid pointer
5701 If value is null then it is assumed that we are parsing normal text
5702 and inserts a new EXACT node into the program containing the resolved
5703 string and returns a pointer to the new node. If the string is
5704 zerolength a NOTHING node is emitted.
5706 On success RExC_parse is set to the char following the endbrace.
5707 Parsing failures will generate a fatal errorvia vFAIL(...)
5709 NOTE: We cache all results from the charnames handler locally in
5710 the RExC_charnames hash (created on first use) to prevent a charnames
5711 handler from playing silly-buggers and returning a short string and
5712 then a long string for a given pattern. Since the regexp program
5713 size is calculated during an initial parse this would result
5714 in a buffer overrun so we cache to prevent the charname result from
5715 changing during the course of the parse.
5719 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep)
5721 char * name; /* start of the content of the name */
5722 char * endbrace; /* endbrace following the name */
5725 STRLEN len; /* this has various purposes throughout the code */
5726 bool cached = 0; /* if this is true then we shouldn't refcount dev sv_str */
5727 regnode *ret = NULL;
5729 if (*RExC_parse != '{') {
5730 vFAIL("Missing braces on \\N{}");
5732 name = RExC_parse+1;
5733 endbrace = strchr(RExC_parse, '}');
5736 vFAIL("Missing right brace on \\N{}");
5738 RExC_parse = endbrace + 1;
5741 /* RExC_parse points at the beginning brace,
5742 endbrace points at the last */
5743 if ( name[0]=='U' && name[1]=='+' ) {
5744 /* its a "unicode hex" notation {U+89AB} */
5745 I32 fl = PERL_SCAN_ALLOW_UNDERSCORES
5746 | PERL_SCAN_DISALLOW_PREFIX
5747 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
5749 len = (STRLEN)(endbrace - name - 2);
5750 cp = grok_hex(name + 2, &len, &fl, NULL);
5751 if ( len != (STRLEN)(endbrace - name - 2) ) {
5760 sv_str= Perl_newSVpvf_nocontext("%c",(int)cp);
5762 /* fetch the charnames handler for this scope */
5763 HV * const table = GvHV(PL_hintgv);
5765 hv_fetchs(table, "charnames", FALSE) :
5767 SV *cv= cvp ? *cvp : NULL;
5770 /* create an SV with the name as argument */
5771 sv_name = newSVpvn(name, endbrace - name);
5773 if (!table || !(PL_hints & HINT_LOCALIZE_HH)) {
5774 vFAIL2("Constant(\\N{%s}) unknown: "
5775 "(possibly a missing \"use charnames ...\")",
5778 if (!cvp || !SvOK(*cvp)) { /* when $^H{charnames} = undef; */
5779 vFAIL2("Constant(\\N{%s}): "
5780 "$^H{charnames} is not defined",SvPVX(sv_name));
5785 if (!RExC_charnames) {
5786 /* make sure our cache is allocated */
5787 RExC_charnames = newHV();
5788 sv_2mortal((SV*)RExC_charnames);
5790 /* see if we have looked this one up before */
5791 he_str = hv_fetch_ent( RExC_charnames, sv_name, 0, 0 );
5793 sv_str = HeVAL(he_str);
5806 count= call_sv(cv, G_SCALAR);
5808 if (count == 1) { /* XXXX is this right? dmq */
5810 SvREFCNT_inc_simple_void(sv_str);
5818 if ( !sv_str || !SvOK(sv_str) ) {
5819 vFAIL2("Constant(\\N{%s}): Call to &{$^H{charnames}} "
5820 "did not return a defined value",SvPVX(sv_name));
5822 if (hv_store_ent( RExC_charnames, sv_name, sv_str, 0))
5827 char *p = SvPV(sv_str, len);
5830 if ( SvUTF8(sv_str) ) {
5831 *valuep = utf8_to_uvchr((U8*)p, &numlen);
5835 We have to turn on utf8 for high bit chars otherwise
5836 we get failures with
5838 "ss" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
5839 "SS" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
5841 This is different from what \x{} would do with the same
5842 codepoint, where the condition is > 0xFF.
5849 /* warn if we havent used the whole string? */
5851 if (numlen<len && SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5853 "Ignoring excess chars from \\N{%s} in character class",
5857 } else if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5859 "Ignoring zero length \\N{%s} in character class",
5864 SvREFCNT_dec(sv_name);
5866 SvREFCNT_dec(sv_str);
5867 return len ? NULL : (regnode *)&len;
5868 } else if(SvCUR(sv_str)) {
5873 char * parse_start = name-3; /* needed for the offsets */
5874 GET_RE_DEBUG_FLAGS_DECL; /* needed for the offsets */
5876 ret = reg_node(pRExC_state,
5877 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
5880 if ( RExC_utf8 && !SvUTF8(sv_str) ) {
5881 sv_utf8_upgrade(sv_str);
5882 } else if ( !RExC_utf8 && SvUTF8(sv_str) ) {
5886 p = SvPV(sv_str, len);
5888 /* len is the length written, charlen is the size the char read */
5889 for ( len = 0; p < pend; p += charlen ) {
5891 UV uvc = utf8_to_uvchr((U8*)p, &charlen);
5893 STRLEN foldlen,numlen;
5894 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
5895 uvc = toFOLD_uni(uvc, tmpbuf, &foldlen);
5896 /* Emit all the Unicode characters. */
5898 for (foldbuf = tmpbuf;
5902 uvc = utf8_to_uvchr(foldbuf, &numlen);
5904 const STRLEN unilen = reguni(pRExC_state, uvc, s);
5907 /* In EBCDIC the numlen
5908 * and unilen can differ. */
5910 if (numlen >= foldlen)
5914 break; /* "Can't happen." */
5917 const STRLEN unilen = reguni(pRExC_state, uvc, s);
5929 RExC_size += STR_SZ(len);
5932 RExC_emit += STR_SZ(len);
5934 Set_Node_Cur_Length(ret); /* MJD */
5936 nextchar(pRExC_state);
5938 ret = reg_node(pRExC_state,NOTHING);
5941 SvREFCNT_dec(sv_str);
5944 SvREFCNT_dec(sv_name);
5954 * It returns the code point in utf8 for the value in *encp.
5955 * value: a code value in the source encoding
5956 * encp: a pointer to an Encode object
5958 * If the result from Encode is not a single character,
5959 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
5962 S_reg_recode(pTHX_ const char value, SV **encp)
5965 SV * const sv = sv_2mortal(newSVpvn(&value, numlen));
5966 const char * const s = encp && *encp ? sv_recode_to_utf8(sv, *encp)
5968 const STRLEN newlen = SvCUR(sv);
5969 UV uv = UNICODE_REPLACEMENT;
5973 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
5976 if (!newlen || numlen != newlen) {
5977 uv = UNICODE_REPLACEMENT;
5986 - regatom - the lowest level
5988 * Optimization: gobbles an entire sequence of ordinary characters so that
5989 * it can turn them into a single node, which is smaller to store and
5990 * faster to run. Backslashed characters are exceptions, each becoming a
5991 * separate node; the code is simpler that way and it's not worth fixing.
5993 * [Yes, it is worth fixing, some scripts can run twice the speed.]
5994 * [It looks like its ok, as in S_study_chunk we merge adjacent EXACT nodes]
5997 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6000 register regnode *ret = NULL;
6002 char *parse_start = RExC_parse;
6003 GET_RE_DEBUG_FLAGS_DECL;
6004 DEBUG_PARSE("atom");
6005 *flagp = WORST; /* Tentatively. */
6008 switch (*RExC_parse) {
6010 RExC_seen_zerolen++;
6011 nextchar(pRExC_state);
6012 if (RExC_flags & PMf_MULTILINE)
6013 ret = reg_node(pRExC_state, MBOL);
6014 else if (RExC_flags & PMf_SINGLELINE)
6015 ret = reg_node(pRExC_state, SBOL);
6017 ret = reg_node(pRExC_state, BOL);
6018 Set_Node_Length(ret, 1); /* MJD */
6021 nextchar(pRExC_state);
6023 RExC_seen_zerolen++;
6024 if (RExC_flags & PMf_MULTILINE)
6025 ret = reg_node(pRExC_state, MEOL);
6026 else if (RExC_flags & PMf_SINGLELINE)
6027 ret = reg_node(pRExC_state, SEOL);
6029 ret = reg_node(pRExC_state, EOL);
6030 Set_Node_Length(ret, 1); /* MJD */
6033 nextchar(pRExC_state);
6034 if (RExC_flags & PMf_SINGLELINE)
6035 ret = reg_node(pRExC_state, SANY);
6037 ret = reg_node(pRExC_state, REG_ANY);
6038 *flagp |= HASWIDTH|SIMPLE;
6040 Set_Node_Length(ret, 1); /* MJD */
6044 char * const oregcomp_parse = ++RExC_parse;
6045 ret = regclass(pRExC_state,depth+1);
6046 if (*RExC_parse != ']') {
6047 RExC_parse = oregcomp_parse;
6048 vFAIL("Unmatched [");
6050 nextchar(pRExC_state);
6051 *flagp |= HASWIDTH|SIMPLE;
6052 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
6056 nextchar(pRExC_state);
6057 ret = reg(pRExC_state, 1, &flags,depth+1);
6059 if (flags & TRYAGAIN) {
6060 if (RExC_parse == RExC_end) {
6061 /* Make parent create an empty node if needed. */
6069 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
6073 if (flags & TRYAGAIN) {
6077 vFAIL("Internal urp");
6078 /* Supposed to be caught earlier. */
6081 if (!regcurly(RExC_parse)) {
6090 vFAIL("Quantifier follows nothing");
6093 switch (*++RExC_parse) {
6095 RExC_seen_zerolen++;
6096 ret = reg_node(pRExC_state, SBOL);
6098 nextchar(pRExC_state);
6099 Set_Node_Length(ret, 2); /* MJD */
6102 ret = reg_node(pRExC_state, GPOS);
6103 RExC_seen |= REG_SEEN_GPOS;
6105 nextchar(pRExC_state);
6106 Set_Node_Length(ret, 2); /* MJD */
6109 ret = reg_node(pRExC_state, SEOL);
6111 RExC_seen_zerolen++; /* Do not optimize RE away */
6112 nextchar(pRExC_state);
6115 ret = reg_node(pRExC_state, EOS);
6117 RExC_seen_zerolen++; /* Do not optimize RE away */
6118 nextchar(pRExC_state);
6119 Set_Node_Length(ret, 2); /* MJD */
6122 ret = reg_node(pRExC_state, CANY);
6123 RExC_seen |= REG_SEEN_CANY;
6124 *flagp |= HASWIDTH|SIMPLE;
6125 nextchar(pRExC_state);
6126 Set_Node_Length(ret, 2); /* MJD */
6129 ret = reg_node(pRExC_state, CLUMP);
6131 nextchar(pRExC_state);
6132 Set_Node_Length(ret, 2); /* MJD */
6135 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
6136 *flagp |= HASWIDTH|SIMPLE;
6137 nextchar(pRExC_state);
6138 Set_Node_Length(ret, 2); /* MJD */
6141 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
6142 *flagp |= HASWIDTH|SIMPLE;
6143 nextchar(pRExC_state);
6144 Set_Node_Length(ret, 2); /* MJD */
6147 RExC_seen_zerolen++;
6148 RExC_seen |= REG_SEEN_LOOKBEHIND;
6149 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
6151 nextchar(pRExC_state);
6152 Set_Node_Length(ret, 2); /* MJD */
6155 RExC_seen_zerolen++;
6156 RExC_seen |= REG_SEEN_LOOKBEHIND;
6157 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
6159 nextchar(pRExC_state);
6160 Set_Node_Length(ret, 2); /* MJD */
6163 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
6164 *flagp |= HASWIDTH|SIMPLE;
6165 nextchar(pRExC_state);
6166 Set_Node_Length(ret, 2); /* MJD */
6169 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
6170 *flagp |= HASWIDTH|SIMPLE;
6171 nextchar(pRExC_state);
6172 Set_Node_Length(ret, 2); /* MJD */
6175 ret = reg_node(pRExC_state, DIGIT);
6176 *flagp |= HASWIDTH|SIMPLE;
6177 nextchar(pRExC_state);
6178 Set_Node_Length(ret, 2); /* MJD */
6181 ret = reg_node(pRExC_state, NDIGIT);
6182 *flagp |= HASWIDTH|SIMPLE;
6183 nextchar(pRExC_state);
6184 Set_Node_Length(ret, 2); /* MJD */
6189 char* const oldregxend = RExC_end;
6190 char* parse_start = RExC_parse - 2;
6192 if (RExC_parse[1] == '{') {
6193 /* a lovely hack--pretend we saw [\pX] instead */
6194 RExC_end = strchr(RExC_parse, '}');
6196 const U8 c = (U8)*RExC_parse;
6198 RExC_end = oldregxend;
6199 vFAIL2("Missing right brace on \\%c{}", c);
6204 RExC_end = RExC_parse + 2;
6205 if (RExC_end > oldregxend)
6206 RExC_end = oldregxend;
6210 ret = regclass(pRExC_state,depth+1);
6212 RExC_end = oldregxend;
6215 Set_Node_Offset(ret, parse_start + 2);
6216 Set_Node_Cur_Length(ret);
6217 nextchar(pRExC_state);
6218 *flagp |= HASWIDTH|SIMPLE;
6222 /* Handle \N{NAME} here and not below because it can be
6223 multicharacter. join_exact() will join them up later on.
6224 Also this makes sure that things like /\N{BLAH}+/ and
6225 \N{BLAH} being multi char Just Happen. dmq*/
6227 ret= reg_namedseq(pRExC_state, NULL);
6229 case 'k': /* Handle \k<NAME> and \k'NAME' */
6231 char ch= RExC_parse[1];
6232 if (ch != '<' && ch != '\'') {
6234 vWARN( RExC_parse + 1,
6235 "Possible broken named back reference treated as literal k");
6239 char* name_start = (RExC_parse += 2);
6241 SV *sv_dat = reg_scan_name(pRExC_state,
6242 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6243 ch= (ch == '<') ? '>' : '\'';
6245 if (RExC_parse == name_start || *RExC_parse != ch)
6246 vFAIL2("Sequence \\k%c... not terminated",
6247 (ch == '>' ? '<' : ch));
6250 ret = reganode(pRExC_state,
6251 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
6257 num = add_data( pRExC_state, 1, "S" );
6259 RExC_rx->data->data[num]=(void*)sv_dat;
6260 SvREFCNT_inc(sv_dat);
6262 /* override incorrect value set in reganode MJD */
6263 Set_Node_Offset(ret, parse_start+1);
6264 Set_Node_Cur_Length(ret); /* MJD */
6265 nextchar(pRExC_state);
6281 case '1': case '2': case '3': case '4':
6282 case '5': case '6': case '7': case '8': case '9':
6285 bool isrel=(*RExC_parse=='R');
6288 num = atoi(RExC_parse);
6290 num = RExC_cpar - num;
6292 vFAIL("Reference to nonexistent or unclosed group");
6294 if (num > 9 && num >= RExC_npar)
6297 char * const parse_start = RExC_parse - 1; /* MJD */
6298 while (isDIGIT(*RExC_parse))
6302 if (num > (I32)RExC_rx->nparens)
6303 vFAIL("Reference to nonexistent group");
6304 /* People make this error all the time apparently.
6305 So we cant fail on it, even though we should
6307 else if (num >= RExC_cpar)
6308 vFAIL("Reference to unclosed group will always match");
6312 ret = reganode(pRExC_state,
6313 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
6317 /* override incorrect value set in reganode MJD */
6318 Set_Node_Offset(ret, parse_start+1);
6319 Set_Node_Cur_Length(ret); /* MJD */
6321 nextchar(pRExC_state);
6326 if (RExC_parse >= RExC_end)
6327 FAIL("Trailing \\");
6330 /* Do not generate "unrecognized" warnings here, we fall
6331 back into the quick-grab loop below */
6338 if (RExC_flags & PMf_EXTENDED) {
6339 while (RExC_parse < RExC_end && *RExC_parse != '\n')
6341 if (RExC_parse < RExC_end)
6347 register STRLEN len;
6352 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
6354 parse_start = RExC_parse - 1;
6360 ret = reg_node(pRExC_state,
6361 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
6363 for (len = 0, p = RExC_parse - 1;
6364 len < 127 && p < RExC_end;
6367 char * const oldp = p;
6369 if (RExC_flags & PMf_EXTENDED)
6370 p = regwhite(p, RExC_end);
6419 ender = ASCII_TO_NATIVE('\033');
6423 ender = ASCII_TO_NATIVE('\007');
6428 char* const e = strchr(p, '}');
6432 vFAIL("Missing right brace on \\x{}");
6435 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6436 | PERL_SCAN_DISALLOW_PREFIX;
6437 STRLEN numlen = e - p - 1;
6438 ender = grok_hex(p + 1, &numlen, &flags, NULL);
6445 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
6447 ender = grok_hex(p, &numlen, &flags, NULL);
6450 if (PL_encoding && ender < 0x100)
6451 goto recode_encoding;
6455 ender = UCHARAT(p++);
6456 ender = toCTRL(ender);
6458 case '0': case '1': case '2': case '3':case '4':
6459 case '5': case '6': case '7': case '8':case '9':
6461 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
6464 ender = grok_oct(p, &numlen, &flags, NULL);
6471 if (PL_encoding && ender < 0x100)
6472 goto recode_encoding;
6476 SV* enc = PL_encoding;
6477 ender = reg_recode((const char)(U8)ender, &enc);
6478 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
6479 vWARN(p, "Invalid escape in the specified encoding");
6485 FAIL("Trailing \\");
6488 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
6489 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
6490 goto normal_default;
6495 if (UTF8_IS_START(*p) && UTF) {
6497 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
6498 &numlen, UTF8_ALLOW_DEFAULT);
6505 if (RExC_flags & PMf_EXTENDED)
6506 p = regwhite(p, RExC_end);
6508 /* Prime the casefolded buffer. */
6509 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
6511 if (ISMULT2(p)) { /* Back off on ?+*. */
6516 /* Emit all the Unicode characters. */
6518 for (foldbuf = tmpbuf;
6520 foldlen -= numlen) {
6521 ender = utf8_to_uvchr(foldbuf, &numlen);
6523 const STRLEN unilen = reguni(pRExC_state, ender, s);
6526 /* In EBCDIC the numlen
6527 * and unilen can differ. */
6529 if (numlen >= foldlen)
6533 break; /* "Can't happen." */
6537 const STRLEN unilen = reguni(pRExC_state, ender, s);
6546 REGC((char)ender, s++);
6552 /* Emit all the Unicode characters. */
6554 for (foldbuf = tmpbuf;
6556 foldlen -= numlen) {
6557 ender = utf8_to_uvchr(foldbuf, &numlen);
6559 const STRLEN unilen = reguni(pRExC_state, ender, s);
6562 /* In EBCDIC the numlen
6563 * and unilen can differ. */
6565 if (numlen >= foldlen)
6573 const STRLEN unilen = reguni(pRExC_state, ender, s);
6582 REGC((char)ender, s++);
6586 Set_Node_Cur_Length(ret); /* MJD */
6587 nextchar(pRExC_state);
6589 /* len is STRLEN which is unsigned, need to copy to signed */
6592 vFAIL("Internal disaster");
6596 if (len == 1 && UNI_IS_INVARIANT(ender))
6600 RExC_size += STR_SZ(len);
6603 RExC_emit += STR_SZ(len);
6613 S_regwhite(char *p, const char *e)
6618 else if (*p == '#') {
6621 } while (p < e && *p != '\n');
6629 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
6630 Character classes ([:foo:]) can also be negated ([:^foo:]).
6631 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
6632 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
6633 but trigger failures because they are currently unimplemented. */
6635 #define POSIXCC_DONE(c) ((c) == ':')
6636 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
6637 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
6640 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
6643 I32 namedclass = OOB_NAMEDCLASS;
6645 if (value == '[' && RExC_parse + 1 < RExC_end &&
6646 /* I smell either [: or [= or [. -- POSIX has been here, right? */
6647 POSIXCC(UCHARAT(RExC_parse))) {
6648 const char c = UCHARAT(RExC_parse);
6649 char* const s = RExC_parse++;
6651 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
6653 if (RExC_parse == RExC_end)
6654 /* Grandfather lone [:, [=, [. */
6657 const char* const t = RExC_parse++; /* skip over the c */
6660 if (UCHARAT(RExC_parse) == ']') {
6661 const char *posixcc = s + 1;
6662 RExC_parse++; /* skip over the ending ] */
6665 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
6666 const I32 skip = t - posixcc;
6668 /* Initially switch on the length of the name. */
6671 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
6672 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
6675 /* Names all of length 5. */
6676 /* alnum alpha ascii blank cntrl digit graph lower
6677 print punct space upper */
6678 /* Offset 4 gives the best switch position. */
6679 switch (posixcc[4]) {
6681 if (memEQ(posixcc, "alph", 4)) /* alpha */
6682 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
6685 if (memEQ(posixcc, "spac", 4)) /* space */
6686 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
6689 if (memEQ(posixcc, "grap", 4)) /* graph */
6690 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
6693 if (memEQ(posixcc, "asci", 4)) /* ascii */
6694 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
6697 if (memEQ(posixcc, "blan", 4)) /* blank */
6698 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
6701 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
6702 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
6705 if (memEQ(posixcc, "alnu", 4)) /* alnum */
6706 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
6709 if (memEQ(posixcc, "lowe", 4)) /* lower */
6710 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
6711 else if (memEQ(posixcc, "uppe", 4)) /* upper */
6712 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
6715 if (memEQ(posixcc, "digi", 4)) /* digit */
6716 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
6717 else if (memEQ(posixcc, "prin", 4)) /* print */
6718 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
6719 else if (memEQ(posixcc, "punc", 4)) /* punct */
6720 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
6725 if (memEQ(posixcc, "xdigit", 6))
6726 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
6730 if (namedclass == OOB_NAMEDCLASS)
6731 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
6733 assert (posixcc[skip] == ':');
6734 assert (posixcc[skip+1] == ']');
6735 } else if (!SIZE_ONLY) {
6736 /* [[=foo=]] and [[.foo.]] are still future. */
6738 /* adjust RExC_parse so the warning shows after
6740 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
6742 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
6745 /* Maternal grandfather:
6746 * "[:" ending in ":" but not in ":]" */
6756 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
6759 if (POSIXCC(UCHARAT(RExC_parse))) {
6760 const char *s = RExC_parse;
6761 const char c = *s++;
6765 if (*s && c == *s && s[1] == ']') {
6766 if (ckWARN(WARN_REGEXP))
6768 "POSIX syntax [%c %c] belongs inside character classes",
6771 /* [[=foo=]] and [[.foo.]] are still future. */
6772 if (POSIXCC_NOTYET(c)) {
6773 /* adjust RExC_parse so the error shows after
6775 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
6777 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
6785 parse a class specification and produce either an ANYOF node that
6786 matches the pattern. If the pattern matches a single char only and
6787 that char is < 256 then we produce an EXACT node instead.
6790 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
6793 register UV value = 0;
6794 register UV nextvalue;
6795 register IV prevvalue = OOB_UNICODE;
6796 register IV range = 0;
6797 register regnode *ret;
6800 char *rangebegin = NULL;
6801 bool need_class = 0;
6804 bool optimize_invert = TRUE;
6805 AV* unicode_alternate = NULL;
6807 UV literal_endpoint = 0;
6809 UV stored = 0; /* number of chars stored in the class */
6811 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
6812 case we need to change the emitted regop to an EXACT. */
6813 const char * orig_parse = RExC_parse;
6814 GET_RE_DEBUG_FLAGS_DECL;
6816 PERL_UNUSED_ARG(depth);
6819 DEBUG_PARSE("clas");
6821 /* Assume we are going to generate an ANYOF node. */
6822 ret = reganode(pRExC_state, ANYOF, 0);
6825 ANYOF_FLAGS(ret) = 0;
6827 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
6831 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
6835 RExC_size += ANYOF_SKIP;
6836 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
6839 RExC_emit += ANYOF_SKIP;
6841 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
6843 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
6844 ANYOF_BITMAP_ZERO(ret);
6845 listsv = newSVpvs("# comment\n");
6848 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
6850 if (!SIZE_ONLY && POSIXCC(nextvalue))
6851 checkposixcc(pRExC_state);
6853 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
6854 if (UCHARAT(RExC_parse) == ']')
6858 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
6862 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
6865 rangebegin = RExC_parse;
6867 value = utf8n_to_uvchr((U8*)RExC_parse,
6868 RExC_end - RExC_parse,
6869 &numlen, UTF8_ALLOW_DEFAULT);
6870 RExC_parse += numlen;
6873 value = UCHARAT(RExC_parse++);
6875 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
6876 if (value == '[' && POSIXCC(nextvalue))
6877 namedclass = regpposixcc(pRExC_state, value);
6878 else if (value == '\\') {
6880 value = utf8n_to_uvchr((U8*)RExC_parse,
6881 RExC_end - RExC_parse,
6882 &numlen, UTF8_ALLOW_DEFAULT);
6883 RExC_parse += numlen;
6886 value = UCHARAT(RExC_parse++);
6887 /* Some compilers cannot handle switching on 64-bit integer
6888 * values, therefore value cannot be an UV. Yes, this will
6889 * be a problem later if we want switch on Unicode.
6890 * A similar issue a little bit later when switching on
6891 * namedclass. --jhi */
6892 switch ((I32)value) {
6893 case 'w': namedclass = ANYOF_ALNUM; break;
6894 case 'W': namedclass = ANYOF_NALNUM; break;
6895 case 's': namedclass = ANYOF_SPACE; break;
6896 case 'S': namedclass = ANYOF_NSPACE; break;
6897 case 'd': namedclass = ANYOF_DIGIT; break;
6898 case 'D': namedclass = ANYOF_NDIGIT; break;
6899 case 'N': /* Handle \N{NAME} in class */
6901 /* We only pay attention to the first char of
6902 multichar strings being returned. I kinda wonder
6903 if this makes sense as it does change the behaviour
6904 from earlier versions, OTOH that behaviour was broken
6906 UV v; /* value is register so we cant & it /grrr */
6907 if (reg_namedseq(pRExC_state, &v)) {
6917 if (RExC_parse >= RExC_end)
6918 vFAIL2("Empty \\%c{}", (U8)value);
6919 if (*RExC_parse == '{') {
6920 const U8 c = (U8)value;
6921 e = strchr(RExC_parse++, '}');
6923 vFAIL2("Missing right brace on \\%c{}", c);
6924 while (isSPACE(UCHARAT(RExC_parse)))
6926 if (e == RExC_parse)
6927 vFAIL2("Empty \\%c{}", c);
6929 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
6937 if (UCHARAT(RExC_parse) == '^') {
6940 value = value == 'p' ? 'P' : 'p'; /* toggle */
6941 while (isSPACE(UCHARAT(RExC_parse))) {
6946 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
6947 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
6950 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
6951 namedclass = ANYOF_MAX; /* no official name, but it's named */
6954 case 'n': value = '\n'; break;
6955 case 'r': value = '\r'; break;
6956 case 't': value = '\t'; break;
6957 case 'f': value = '\f'; break;
6958 case 'b': value = '\b'; break;
6959 case 'e': value = ASCII_TO_NATIVE('\033');break;
6960 case 'a': value = ASCII_TO_NATIVE('\007');break;
6962 if (*RExC_parse == '{') {
6963 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6964 | PERL_SCAN_DISALLOW_PREFIX;
6965 char * const e = strchr(RExC_parse++, '}');
6967 vFAIL("Missing right brace on \\x{}");
6969 numlen = e - RExC_parse;
6970 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
6974 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
6976 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
6977 RExC_parse += numlen;
6979 if (PL_encoding && value < 0x100)
6980 goto recode_encoding;
6983 value = UCHARAT(RExC_parse++);
6984 value = toCTRL(value);
6986 case '0': case '1': case '2': case '3': case '4':
6987 case '5': case '6': case '7': case '8': case '9':
6991 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
6992 RExC_parse += numlen;
6993 if (PL_encoding && value < 0x100)
6994 goto recode_encoding;
6999 SV* enc = PL_encoding;
7000 value = reg_recode((const char)(U8)value, &enc);
7001 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
7003 "Invalid escape in the specified encoding");
7007 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
7009 "Unrecognized escape \\%c in character class passed through",
7013 } /* end of \blah */
7019 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
7021 if (!SIZE_ONLY && !need_class)
7022 ANYOF_CLASS_ZERO(ret);
7026 /* a bad range like a-\d, a-[:digit:] ? */
7029 if (ckWARN(WARN_REGEXP)) {
7031 RExC_parse >= rangebegin ?
7032 RExC_parse - rangebegin : 0;
7034 "False [] range \"%*.*s\"",
7037 if (prevvalue < 256) {
7038 ANYOF_BITMAP_SET(ret, prevvalue);
7039 ANYOF_BITMAP_SET(ret, '-');
7042 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
7043 Perl_sv_catpvf(aTHX_ listsv,
7044 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
7048 range = 0; /* this was not a true range */
7052 const char *what = NULL;
7055 if (namedclass > OOB_NAMEDCLASS)
7056 optimize_invert = FALSE;
7057 /* Possible truncation here but in some 64-bit environments
7058 * the compiler gets heartburn about switch on 64-bit values.
7059 * A similar issue a little earlier when switching on value.
7061 switch ((I32)namedclass) {
7064 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
7066 for (value = 0; value < 256; value++)
7068 ANYOF_BITMAP_SET(ret, value);
7075 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
7077 for (value = 0; value < 256; value++)
7078 if (!isALNUM(value))
7079 ANYOF_BITMAP_SET(ret, value);
7086 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
7088 for (value = 0; value < 256; value++)
7089 if (isALNUMC(value))
7090 ANYOF_BITMAP_SET(ret, value);
7097 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
7099 for (value = 0; value < 256; value++)
7100 if (!isALNUMC(value))
7101 ANYOF_BITMAP_SET(ret, value);
7108 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
7110 for (value = 0; value < 256; value++)
7112 ANYOF_BITMAP_SET(ret, value);
7119 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
7121 for (value = 0; value < 256; value++)
7122 if (!isALPHA(value))
7123 ANYOF_BITMAP_SET(ret, value);
7130 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
7133 for (value = 0; value < 128; value++)
7134 ANYOF_BITMAP_SET(ret, value);
7136 for (value = 0; value < 256; value++) {
7138 ANYOF_BITMAP_SET(ret, value);
7147 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
7150 for (value = 128; value < 256; value++)
7151 ANYOF_BITMAP_SET(ret, value);
7153 for (value = 0; value < 256; value++) {
7154 if (!isASCII(value))
7155 ANYOF_BITMAP_SET(ret, value);
7164 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
7166 for (value = 0; value < 256; value++)
7168 ANYOF_BITMAP_SET(ret, value);
7175 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
7177 for (value = 0; value < 256; value++)
7178 if (!isBLANK(value))
7179 ANYOF_BITMAP_SET(ret, value);
7186 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
7188 for (value = 0; value < 256; value++)
7190 ANYOF_BITMAP_SET(ret, value);
7197 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
7199 for (value = 0; value < 256; value++)
7200 if (!isCNTRL(value))
7201 ANYOF_BITMAP_SET(ret, value);
7208 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
7210 /* consecutive digits assumed */
7211 for (value = '0'; value <= '9'; value++)
7212 ANYOF_BITMAP_SET(ret, value);
7219 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
7221 /* consecutive digits assumed */
7222 for (value = 0; value < '0'; value++)
7223 ANYOF_BITMAP_SET(ret, value);
7224 for (value = '9' + 1; value < 256; value++)
7225 ANYOF_BITMAP_SET(ret, value);
7232 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
7234 for (value = 0; value < 256; value++)
7236 ANYOF_BITMAP_SET(ret, value);
7243 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
7245 for (value = 0; value < 256; value++)
7246 if (!isGRAPH(value))
7247 ANYOF_BITMAP_SET(ret, value);
7254 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
7256 for (value = 0; value < 256; value++)
7258 ANYOF_BITMAP_SET(ret, value);
7265 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
7267 for (value = 0; value < 256; value++)
7268 if (!isLOWER(value))
7269 ANYOF_BITMAP_SET(ret, value);
7276 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
7278 for (value = 0; value < 256; value++)
7280 ANYOF_BITMAP_SET(ret, value);
7287 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
7289 for (value = 0; value < 256; value++)
7290 if (!isPRINT(value))
7291 ANYOF_BITMAP_SET(ret, value);
7298 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
7300 for (value = 0; value < 256; value++)
7301 if (isPSXSPC(value))
7302 ANYOF_BITMAP_SET(ret, value);
7309 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
7311 for (value = 0; value < 256; value++)
7312 if (!isPSXSPC(value))
7313 ANYOF_BITMAP_SET(ret, value);
7320 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
7322 for (value = 0; value < 256; value++)
7324 ANYOF_BITMAP_SET(ret, value);
7331 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
7333 for (value = 0; value < 256; value++)
7334 if (!isPUNCT(value))
7335 ANYOF_BITMAP_SET(ret, value);
7342 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
7344 for (value = 0; value < 256; value++)
7346 ANYOF_BITMAP_SET(ret, value);
7353 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
7355 for (value = 0; value < 256; value++)
7356 if (!isSPACE(value))
7357 ANYOF_BITMAP_SET(ret, value);
7364 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
7366 for (value = 0; value < 256; value++)
7368 ANYOF_BITMAP_SET(ret, value);
7375 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
7377 for (value = 0; value < 256; value++)
7378 if (!isUPPER(value))
7379 ANYOF_BITMAP_SET(ret, value);
7386 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
7388 for (value = 0; value < 256; value++)
7389 if (isXDIGIT(value))
7390 ANYOF_BITMAP_SET(ret, value);
7397 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
7399 for (value = 0; value < 256; value++)
7400 if (!isXDIGIT(value))
7401 ANYOF_BITMAP_SET(ret, value);
7407 /* this is to handle \p and \P */
7410 vFAIL("Invalid [::] class");
7414 /* Strings such as "+utf8::isWord\n" */
7415 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
7418 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
7421 } /* end of namedclass \blah */
7424 if (prevvalue > (IV)value) /* b-a */ {
7425 const int w = RExC_parse - rangebegin;
7426 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
7427 range = 0; /* not a valid range */
7431 prevvalue = value; /* save the beginning of the range */
7432 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
7433 RExC_parse[1] != ']') {
7436 /* a bad range like \w-, [:word:]- ? */
7437 if (namedclass > OOB_NAMEDCLASS) {
7438 if (ckWARN(WARN_REGEXP)) {
7440 RExC_parse >= rangebegin ?
7441 RExC_parse - rangebegin : 0;
7443 "False [] range \"%*.*s\"",
7447 ANYOF_BITMAP_SET(ret, '-');
7449 range = 1; /* yeah, it's a range! */
7450 continue; /* but do it the next time */
7454 /* now is the next time */
7455 /*stored += (value - prevvalue + 1);*/
7457 if (prevvalue < 256) {
7458 const IV ceilvalue = value < 256 ? value : 255;
7461 /* In EBCDIC [\x89-\x91] should include
7462 * the \x8e but [i-j] should not. */
7463 if (literal_endpoint == 2 &&
7464 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
7465 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
7467 if (isLOWER(prevvalue)) {
7468 for (i = prevvalue; i <= ceilvalue; i++)
7470 ANYOF_BITMAP_SET(ret, i);
7472 for (i = prevvalue; i <= ceilvalue; i++)
7474 ANYOF_BITMAP_SET(ret, i);
7479 for (i = prevvalue; i <= ceilvalue; i++) {
7480 if (!ANYOF_BITMAP_TEST(ret,i)) {
7482 ANYOF_BITMAP_SET(ret, i);
7486 if (value > 255 || UTF) {
7487 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
7488 const UV natvalue = NATIVE_TO_UNI(value);
7489 stored+=2; /* can't optimize this class */
7490 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
7491 if (prevnatvalue < natvalue) { /* what about > ? */
7492 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
7493 prevnatvalue, natvalue);
7495 else if (prevnatvalue == natvalue) {
7496 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
7498 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
7500 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
7502 #ifdef EBCDIC /* RD t/uni/fold ff and 6b */
7503 if (RExC_precomp[0] == ':' &&
7504 RExC_precomp[1] == '[' &&
7505 (f == 0xDF || f == 0x92)) {
7506 f = NATIVE_TO_UNI(f);
7509 /* If folding and foldable and a single
7510 * character, insert also the folded version
7511 * to the charclass. */
7513 #ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
7514 if ((RExC_precomp[0] == ':' &&
7515 RExC_precomp[1] == '[' &&
7517 (value == 0xFB05 || value == 0xFB06))) ?
7518 foldlen == ((STRLEN)UNISKIP(f) - 1) :
7519 foldlen == (STRLEN)UNISKIP(f) )
7521 if (foldlen == (STRLEN)UNISKIP(f))
7523 Perl_sv_catpvf(aTHX_ listsv,
7526 /* Any multicharacter foldings
7527 * require the following transform:
7528 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
7529 * where E folds into "pq" and F folds
7530 * into "rst", all other characters
7531 * fold to single characters. We save
7532 * away these multicharacter foldings,
7533 * to be later saved as part of the
7534 * additional "s" data. */
7537 if (!unicode_alternate)
7538 unicode_alternate = newAV();
7539 sv = newSVpvn((char*)foldbuf, foldlen);
7541 av_push(unicode_alternate, sv);
7545 /* If folding and the value is one of the Greek
7546 * sigmas insert a few more sigmas to make the
7547 * folding rules of the sigmas to work right.
7548 * Note that not all the possible combinations
7549 * are handled here: some of them are handled
7550 * by the standard folding rules, and some of
7551 * them (literal or EXACTF cases) are handled
7552 * during runtime in regexec.c:S_find_byclass(). */
7553 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
7554 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
7555 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
7556 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
7557 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
7559 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
7560 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
7561 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
7566 literal_endpoint = 0;
7570 range = 0; /* this range (if it was one) is done now */
7574 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
7576 RExC_size += ANYOF_CLASS_ADD_SKIP;
7578 RExC_emit += ANYOF_CLASS_ADD_SKIP;
7584 /****** !SIZE_ONLY AFTER HERE *********/
7586 if( stored == 1 && value < 256
7587 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
7589 /* optimize single char class to an EXACT node
7590 but *only* when its not a UTF/high char */
7591 const char * cur_parse= RExC_parse;
7592 RExC_emit = (regnode *)orig_emit;
7593 RExC_parse = (char *)orig_parse;
7594 ret = reg_node(pRExC_state,
7595 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
7596 RExC_parse = (char *)cur_parse;
7597 *STRING(ret)= (char)value;
7599 RExC_emit += STR_SZ(1);
7602 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
7603 if ( /* If the only flag is folding (plus possibly inversion). */
7604 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
7606 for (value = 0; value < 256; ++value) {
7607 if (ANYOF_BITMAP_TEST(ret, value)) {
7608 UV fold = PL_fold[value];
7611 ANYOF_BITMAP_SET(ret, fold);
7614 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
7617 /* optimize inverted simple patterns (e.g. [^a-z]) */
7618 if (optimize_invert &&
7619 /* If the only flag is inversion. */
7620 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
7621 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
7622 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
7623 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
7626 AV * const av = newAV();
7628 /* The 0th element stores the character class description
7629 * in its textual form: used later (regexec.c:Perl_regclass_swash())
7630 * to initialize the appropriate swash (which gets stored in
7631 * the 1st element), and also useful for dumping the regnode.
7632 * The 2nd element stores the multicharacter foldings,
7633 * used later (regexec.c:S_reginclass()). */
7634 av_store(av, 0, listsv);
7635 av_store(av, 1, NULL);
7636 av_store(av, 2, (SV*)unicode_alternate);
7637 rv = newRV_noinc((SV*)av);
7638 n = add_data(pRExC_state, 1, "s");
7639 RExC_rx->data->data[n] = (void*)rv;
7646 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
7648 char* const retval = RExC_parse++;
7651 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
7652 RExC_parse[2] == '#') {
7653 while (*RExC_parse != ')') {
7654 if (RExC_parse == RExC_end)
7655 FAIL("Sequence (?#... not terminated");
7661 if (RExC_flags & PMf_EXTENDED) {
7662 if (isSPACE(*RExC_parse)) {
7666 else if (*RExC_parse == '#') {
7667 while (RExC_parse < RExC_end)
7668 if (*RExC_parse++ == '\n') break;
7677 - reg_node - emit a node
7679 STATIC regnode * /* Location. */
7680 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
7683 register regnode *ptr;
7684 regnode * const ret = RExC_emit;
7685 GET_RE_DEBUG_FLAGS_DECL;
7688 SIZE_ALIGN(RExC_size);
7693 if (OP(RExC_emit) == 255)
7694 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %s: %d ",
7695 reg_name[op], OP(RExC_emit));
7697 NODE_ALIGN_FILL(ret);
7699 FILL_ADVANCE_NODE(ptr, op);
7700 if (RExC_offsets) { /* MJD */
7701 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
7702 "reg_node", __LINE__,
7704 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
7705 ? "Overwriting end of array!\n" : "OK",
7706 (UV)(RExC_emit - RExC_emit_start),
7707 (UV)(RExC_parse - RExC_start),
7708 (UV)RExC_offsets[0]));
7709 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
7717 - reganode - emit a node with an argument
7719 STATIC regnode * /* Location. */
7720 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
7723 register regnode *ptr;
7724 regnode * const ret = RExC_emit;
7725 GET_RE_DEBUG_FLAGS_DECL;
7728 SIZE_ALIGN(RExC_size);
7733 assert(2==regarglen[op]+1);
7735 Anything larger than this has to allocate the extra amount.
7736 If we changed this to be:
7738 RExC_size += (1 + regarglen[op]);
7740 then it wouldn't matter. Its not clear what side effect
7741 might come from that so its not done so far.
7747 if (OP(RExC_emit) == 255)
7748 Perl_croak(aTHX_ "panic: reganode overwriting end of allocated program space");
7750 NODE_ALIGN_FILL(ret);
7752 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
7753 if (RExC_offsets) { /* MJD */
7754 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
7758 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
7759 "Overwriting end of array!\n" : "OK",
7760 (UV)(RExC_emit - RExC_emit_start),
7761 (UV)(RExC_parse - RExC_start),
7762 (UV)RExC_offsets[0]));
7763 Set_Cur_Node_Offset;
7771 - reguni - emit (if appropriate) a Unicode character
7774 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
7777 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
7781 - reginsert - insert an operator in front of already-emitted operand
7783 * Means relocating the operand.
7786 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
7789 register regnode *src;
7790 register regnode *dst;
7791 register regnode *place;
7792 const int offset = regarglen[(U8)op];
7793 const int size = NODE_STEP_REGNODE + offset;
7794 GET_RE_DEBUG_FLAGS_DECL;
7795 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
7796 DEBUG_PARSE_FMT("inst"," - %s",reg_name[op]);
7805 if (RExC_open_parens) {
7807 DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);
7808 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
7809 if ( RExC_open_parens[paren] >= opnd ) {
7810 DEBUG_PARSE_FMT("open"," - %d",size);
7811 RExC_open_parens[paren] += size;
7813 DEBUG_PARSE_FMT("open"," - %s","ok");
7815 if ( RExC_close_parens[paren] >= opnd ) {
7816 DEBUG_PARSE_FMT("close"," - %d",size);
7817 RExC_close_parens[paren] += size;
7819 DEBUG_PARSE_FMT("close"," - %s","ok");
7824 while (src > opnd) {
7825 StructCopy(--src, --dst, regnode);
7826 if (RExC_offsets) { /* MJD 20010112 */
7827 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
7831 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
7832 ? "Overwriting end of array!\n" : "OK",
7833 (UV)(src - RExC_emit_start),
7834 (UV)(dst - RExC_emit_start),
7835 (UV)RExC_offsets[0]));
7836 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
7837 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
7842 place = opnd; /* Op node, where operand used to be. */
7843 if (RExC_offsets) { /* MJD */
7844 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
7848 (UV)(place - RExC_emit_start) > RExC_offsets[0]
7849 ? "Overwriting end of array!\n" : "OK",
7850 (UV)(place - RExC_emit_start),
7851 (UV)(RExC_parse - RExC_start),
7852 (UV)RExC_offsets[0]));
7853 Set_Node_Offset(place, RExC_parse);
7854 Set_Node_Length(place, 1);
7856 src = NEXTOPER(place);
7857 FILL_ADVANCE_NODE(place, op);
7858 Zero(src, offset, regnode);
7862 - regtail - set the next-pointer at the end of a node chain of p to val.
7863 - SEE ALSO: regtail_study
7865 /* TODO: All three parms should be const */
7867 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
7870 register regnode *scan;
7871 GET_RE_DEBUG_FLAGS_DECL;
7873 PERL_UNUSED_ARG(depth);
7879 /* Find last node. */
7882 regnode * const temp = regnext(scan);
7884 SV * const mysv=sv_newmortal();
7885 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
7886 regprop(RExC_rx, mysv, scan);
7887 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
7888 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
7889 (temp == NULL ? "->" : ""),
7890 (temp == NULL ? reg_name[OP(val)] : "")
7898 if (reg_off_by_arg[OP(scan)]) {
7899 ARG_SET(scan, val - scan);
7902 NEXT_OFF(scan) = val - scan;
7908 - regtail_study - set the next-pointer at the end of a node chain of p to val.
7909 - Look for optimizable sequences at the same time.
7910 - currently only looks for EXACT chains.
7912 This is expermental code. The idea is to use this routine to perform
7913 in place optimizations on branches and groups as they are constructed,
7914 with the long term intention of removing optimization from study_chunk so
7915 that it is purely analytical.
7917 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
7918 to control which is which.
7921 /* TODO: All four parms should be const */
7924 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
7927 register regnode *scan;
7929 #ifdef EXPERIMENTAL_INPLACESCAN
7933 GET_RE_DEBUG_FLAGS_DECL;
7939 /* Find last node. */
7943 regnode * const temp = regnext(scan);
7944 #ifdef EXPERIMENTAL_INPLACESCAN
7945 if (PL_regkind[OP(scan)] == EXACT)
7946 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
7954 if( exact == PSEUDO )
7956 else if ( exact != OP(scan) )
7965 SV * const mysv=sv_newmortal();
7966 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
7967 regprop(RExC_rx, mysv, scan);
7968 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
7969 SvPV_nolen_const(mysv),
7978 SV * const mysv_val=sv_newmortal();
7979 DEBUG_PARSE_MSG("");
7980 regprop(RExC_rx, mysv_val, val);
7981 PerlIO_printf(Perl_debug_log, "~ attach to %s (%d) offset to %d\n",
7982 SvPV_nolen_const(mysv_val),
7987 if (reg_off_by_arg[OP(scan)]) {
7988 ARG_SET(scan, val - scan);
7991 NEXT_OFF(scan) = val - scan;
7999 - regcurly - a little FSA that accepts {\d+,?\d*}
8002 S_regcurly(register const char *s)
8021 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
8024 Perl_regdump(pTHX_ const regexp *r)
8028 SV * const sv = sv_newmortal();
8029 SV *dsv= sv_newmortal();
8031 (void)dumpuntil(r, r->program, r->program + 1, NULL, NULL, sv, 0, 0);
8033 /* Header fields of interest. */
8034 if (r->anchored_substr) {
8035 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
8036 RE_SV_DUMPLEN(r->anchored_substr), 30);
8037 PerlIO_printf(Perl_debug_log,
8038 "anchored %s%s at %"IVdf" ",
8039 s, RE_SV_TAIL(r->anchored_substr),
8040 (IV)r->anchored_offset);
8041 } else if (r->anchored_utf8) {
8042 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
8043 RE_SV_DUMPLEN(r->anchored_utf8), 30);
8044 PerlIO_printf(Perl_debug_log,
8045 "anchored utf8 %s%s at %"IVdf" ",
8046 s, RE_SV_TAIL(r->anchored_utf8),
8047 (IV)r->anchored_offset);
8049 if (r->float_substr) {
8050 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
8051 RE_SV_DUMPLEN(r->float_substr), 30);
8052 PerlIO_printf(Perl_debug_log,
8053 "floating %s%s at %"IVdf"..%"UVuf" ",
8054 s, RE_SV_TAIL(r->float_substr),
8055 (IV)r->float_min_offset, (UV)r->float_max_offset);
8056 } else if (r->float_utf8) {
8057 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
8058 RE_SV_DUMPLEN(r->float_utf8), 30);
8059 PerlIO_printf(Perl_debug_log,
8060 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
8061 s, RE_SV_TAIL(r->float_utf8),
8062 (IV)r->float_min_offset, (UV)r->float_max_offset);
8064 if (r->check_substr || r->check_utf8)
8065 PerlIO_printf(Perl_debug_log,
8067 (r->check_substr == r->float_substr
8068 && r->check_utf8 == r->float_utf8
8069 ? "(checking floating" : "(checking anchored"));
8070 if (r->reganch & ROPT_NOSCAN)
8071 PerlIO_printf(Perl_debug_log, " noscan");
8072 if (r->reganch & ROPT_CHECK_ALL)
8073 PerlIO_printf(Perl_debug_log, " isall");
8074 if (r->check_substr || r->check_utf8)
8075 PerlIO_printf(Perl_debug_log, ") ");
8077 if (r->regstclass) {
8078 regprop(r, sv, r->regstclass);
8079 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
8081 if (r->reganch & ROPT_ANCH) {
8082 PerlIO_printf(Perl_debug_log, "anchored");
8083 if (r->reganch & ROPT_ANCH_BOL)
8084 PerlIO_printf(Perl_debug_log, "(BOL)");
8085 if (r->reganch & ROPT_ANCH_MBOL)
8086 PerlIO_printf(Perl_debug_log, "(MBOL)");
8087 if (r->reganch & ROPT_ANCH_SBOL)
8088 PerlIO_printf(Perl_debug_log, "(SBOL)");
8089 if (r->reganch & ROPT_ANCH_GPOS)
8090 PerlIO_printf(Perl_debug_log, "(GPOS)");
8091 PerlIO_putc(Perl_debug_log, ' ');
8093 if (r->reganch & ROPT_GPOS_SEEN)
8094 PerlIO_printf(Perl_debug_log, "GPOS ");
8095 if (r->reganch & ROPT_SKIP)
8096 PerlIO_printf(Perl_debug_log, "plus ");
8097 if (r->reganch & ROPT_IMPLICIT)
8098 PerlIO_printf(Perl_debug_log, "implicit ");
8099 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
8100 if (r->reganch & ROPT_EVAL_SEEN)
8101 PerlIO_printf(Perl_debug_log, "with eval ");
8102 PerlIO_printf(Perl_debug_log, "\n");
8104 PERL_UNUSED_CONTEXT;
8106 #endif /* DEBUGGING */
8110 - regprop - printable representation of opcode
8113 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
8118 GET_RE_DEBUG_FLAGS_DECL;
8120 sv_setpvn(sv, "", 0);
8121 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
8122 /* It would be nice to FAIL() here, but this may be called from
8123 regexec.c, and it would be hard to supply pRExC_state. */
8124 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
8125 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
8127 k = PL_regkind[OP(o)];
8130 SV * const dsv = sv_2mortal(newSVpvs(""));
8131 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
8132 * is a crude hack but it may be the best for now since
8133 * we have no flag "this EXACTish node was UTF-8"
8135 const char * const s =
8136 pv_pretty(dsv, STRING(o), STR_LEN(o), 60,
8137 PL_colors[0], PL_colors[1],
8138 PERL_PV_ESCAPE_UNI_DETECT |
8139 PERL_PV_PRETTY_ELIPSES |
8142 Perl_sv_catpvf(aTHX_ sv, " %s", s );
8143 } else if (k == TRIE) {
8144 /* print the details of the trie in dumpuntil instead, as
8145 * prog->data isn't available here */
8146 const char op = OP(o);
8147 const I32 n = ARG(o);
8148 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
8149 (reg_ac_data *)prog->data->data[n] :
8151 const reg_trie_data * const trie = !IS_TRIE_AC(op) ?
8152 (reg_trie_data*)prog->data->data[n] :
8155 Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]);
8156 DEBUG_TRIE_COMPILE_r(
8157 Perl_sv_catpvf(aTHX_ sv,
8158 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
8159 (UV)trie->startstate,
8160 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
8161 (UV)trie->wordcount,
8164 (UV)TRIE_CHARCOUNT(trie),
8165 (UV)trie->uniquecharcount
8168 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
8170 int rangestart = -1;
8171 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
8172 Perl_sv_catpvf(aTHX_ sv, "[");
8173 for (i = 0; i <= 256; i++) {
8174 if (i < 256 && BITMAP_TEST(bitmap,i)) {
8175 if (rangestart == -1)
8177 } else if (rangestart != -1) {
8178 if (i <= rangestart + 3)
8179 for (; rangestart < i; rangestart++)
8180 put_byte(sv, rangestart);
8182 put_byte(sv, rangestart);
8184 put_byte(sv, i - 1);
8189 Perl_sv_catpvf(aTHX_ sv, "]");
8192 } else if (k == CURLY) {
8193 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
8194 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
8195 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
8197 else if (k == WHILEM && o->flags) /* Ordinal/of */
8198 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
8199 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT)
8200 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
8201 else if (k == GOSUB)
8202 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
8203 else if (k == VERB) {
8205 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
8206 (SV*)prog->data->data[ ARG( o ) ]);
8207 } else if (k == LOGICAL)
8208 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
8209 else if (k == ANYOF) {
8210 int i, rangestart = -1;
8211 const U8 flags = ANYOF_FLAGS(o);
8213 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
8214 static const char * const anyofs[] = {
8247 if (flags & ANYOF_LOCALE)
8248 sv_catpvs(sv, "{loc}");
8249 if (flags & ANYOF_FOLD)
8250 sv_catpvs(sv, "{i}");
8251 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
8252 if (flags & ANYOF_INVERT)
8254 for (i = 0; i <= 256; i++) {
8255 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
8256 if (rangestart == -1)
8258 } else if (rangestart != -1) {
8259 if (i <= rangestart + 3)
8260 for (; rangestart < i; rangestart++)
8261 put_byte(sv, rangestart);
8263 put_byte(sv, rangestart);
8265 put_byte(sv, i - 1);
8271 if (o->flags & ANYOF_CLASS)
8272 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
8273 if (ANYOF_CLASS_TEST(o,i))
8274 sv_catpv(sv, anyofs[i]);
8276 if (flags & ANYOF_UNICODE)
8277 sv_catpvs(sv, "{unicode}");
8278 else if (flags & ANYOF_UNICODE_ALL)
8279 sv_catpvs(sv, "{unicode_all}");
8283 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
8287 U8 s[UTF8_MAXBYTES_CASE+1];
8289 for (i = 0; i <= 256; i++) { /* just the first 256 */
8290 uvchr_to_utf8(s, i);
8292 if (i < 256 && swash_fetch(sw, s, TRUE)) {
8293 if (rangestart == -1)
8295 } else if (rangestart != -1) {
8296 if (i <= rangestart + 3)
8297 for (; rangestart < i; rangestart++) {
8298 const U8 * const e = uvchr_to_utf8(s,rangestart);
8300 for(p = s; p < e; p++)
8304 const U8 *e = uvchr_to_utf8(s,rangestart);
8306 for (p = s; p < e; p++)
8309 e = uvchr_to_utf8(s, i-1);
8310 for (p = s; p < e; p++)
8317 sv_catpvs(sv, "..."); /* et cetera */
8321 char *s = savesvpv(lv);
8322 char * const origs = s;
8324 while (*s && *s != '\n')
8328 const char * const t = ++s;
8346 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
8348 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
8349 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
8351 PERL_UNUSED_CONTEXT;
8352 PERL_UNUSED_ARG(sv);
8354 PERL_UNUSED_ARG(prog);
8355 #endif /* DEBUGGING */
8359 Perl_re_intuit_string(pTHX_ regexp *prog)
8360 { /* Assume that RE_INTUIT is set */
8362 GET_RE_DEBUG_FLAGS_DECL;
8363 PERL_UNUSED_CONTEXT;
8367 const char * const s = SvPV_nolen_const(prog->check_substr
8368 ? prog->check_substr : prog->check_utf8);
8370 if (!PL_colorset) reginitcolors();
8371 PerlIO_printf(Perl_debug_log,
8372 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
8374 prog->check_substr ? "" : "utf8 ",
8375 PL_colors[5],PL_colors[0],
8378 (strlen(s) > 60 ? "..." : ""));
8381 return prog->check_substr ? prog->check_substr : prog->check_utf8;
8385 pregfree - free a regexp
8387 See regdupe below if you change anything here.
8391 Perl_pregfree(pTHX_ struct regexp *r)
8395 GET_RE_DEBUG_FLAGS_DECL;
8397 if (!r || (--r->refcnt > 0))
8403 SV *dsv= sv_newmortal();
8404 RE_PV_QUOTED_DECL(s, (r->reganch & ROPT_UTF8),
8405 dsv, r->precomp, r->prelen, 60);
8406 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
8407 PL_colors[4],PL_colors[5],s);
8411 /* gcov results gave these as non-null 100% of the time, so there's no
8412 optimisation in checking them before calling Safefree */
8413 Safefree(r->precomp);
8414 Safefree(r->offsets); /* 20010421 MJD */
8415 RX_MATCH_COPY_FREE(r);
8416 #ifdef PERL_OLD_COPY_ON_WRITE
8418 SvREFCNT_dec(r->saved_copy);
8421 if (r->anchored_substr)
8422 SvREFCNT_dec(r->anchored_substr);
8423 if (r->anchored_utf8)
8424 SvREFCNT_dec(r->anchored_utf8);
8425 if (r->float_substr)
8426 SvREFCNT_dec(r->float_substr);
8428 SvREFCNT_dec(r->float_utf8);
8429 Safefree(r->substrs);
8432 SvREFCNT_dec(r->paren_names);
8434 int n = r->data->count;
8435 PAD* new_comppad = NULL;
8440 /* If you add a ->what type here, update the comment in regcomp.h */
8441 switch (r->data->what[n]) {
8444 SvREFCNT_dec((SV*)r->data->data[n]);
8447 Safefree(r->data->data[n]);
8450 new_comppad = (AV*)r->data->data[n];
8453 if (new_comppad == NULL)
8454 Perl_croak(aTHX_ "panic: pregfree comppad");
8455 PAD_SAVE_LOCAL(old_comppad,
8456 /* Watch out for global destruction's random ordering. */
8457 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
8460 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
8463 op_free((OP_4tree*)r->data->data[n]);
8465 PAD_RESTORE_LOCAL(old_comppad);
8466 SvREFCNT_dec((SV*)new_comppad);
8472 { /* Aho Corasick add-on structure for a trie node.
8473 Used in stclass optimization only */
8475 reg_ac_data *aho=(reg_ac_data*)r->data->data[n];
8477 refcount = --aho->refcount;
8480 Safefree(aho->states);
8481 Safefree(aho->fail);
8482 aho->trie=NULL; /* not necessary to free this as it is
8483 handled by the 't' case */
8484 Safefree(r->data->data[n]); /* do this last!!!! */
8485 Safefree(r->regstclass);
8491 /* trie structure. */
8493 reg_trie_data *trie=(reg_trie_data*)r->data->data[n];
8495 refcount = --trie->refcount;
8498 Safefree(trie->charmap);
8499 if (trie->widecharmap)
8500 SvREFCNT_dec((SV*)trie->widecharmap);
8501 Safefree(trie->states);
8502 Safefree(trie->trans);
8504 Safefree(trie->bitmap);
8506 Safefree(trie->wordlen);
8508 Safefree(trie->jump);
8510 Safefree(trie->nextword);
8513 SvREFCNT_dec((SV*)trie->words);
8514 if (trie->revcharmap)
8515 SvREFCNT_dec((SV*)trie->revcharmap);
8517 Safefree(r->data->data[n]); /* do this last!!!! */
8522 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
8525 Safefree(r->data->what);
8528 Safefree(r->startp);
8531 Safefree(r->swap->startp);
8532 Safefree(r->swap->endp);
8538 #define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t))
8539 #define av_dup_inc(s,t) (AV*)SvREFCNT_inc(sv_dup((SV*)s,t))
8540 #define hv_dup_inc(s,t) (HV*)SvREFCNT_inc(sv_dup((SV*)s,t))
8541 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
8544 regdupe - duplicate a regexp.
8546 This routine is called by sv.c's re_dup and is expected to clone a
8547 given regexp structure. It is a no-op when not under USE_ITHREADS.
8548 (Originally this *was* re_dup() for change history see sv.c)
8550 See pregfree() above if you change anything here.
8552 #if defined(USE_ITHREADS)
8554 Perl_regdupe(pTHX_ const regexp *r, CLONE_PARAMS *param)
8559 struct reg_substr_datum *s;
8562 return (REGEXP *)NULL;
8564 if ((ret = (REGEXP *)ptr_table_fetch(PL_ptr_table, r)))
8567 len = r->offsets[0];
8568 npar = r->nparens+1;
8570 Newxc(ret, sizeof(regexp) + (len+1)*sizeof(regnode), char, regexp);
8571 Copy(r->program, ret->program, len+1, regnode);
8573 Newx(ret->startp, npar, I32);
8574 Copy(r->startp, ret->startp, npar, I32);
8575 Newx(ret->endp, npar, I32);
8576 Copy(r->startp, ret->startp, npar, I32);
8578 Newx(ret->swap, 1, regexp_paren_ofs);
8579 /* no need to copy these */
8580 Newx(ret->swap->startp, npar, I32);
8581 Newx(ret->swap->endp, npar, I32);
8586 Newx(ret->substrs, 1, struct reg_substr_data);
8587 for (s = ret->substrs->data, i = 0; i < 3; i++, s++) {
8588 s->min_offset = r->substrs->data[i].min_offset;
8589 s->max_offset = r->substrs->data[i].max_offset;
8590 s->end_shift = r->substrs->data[i].end_shift;
8591 s->substr = sv_dup_inc(r->substrs->data[i].substr, param);
8592 s->utf8_substr = sv_dup_inc(r->substrs->data[i].utf8_substr, param);
8595 ret->regstclass = NULL;
8598 const int count = r->data->count;
8601 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
8602 char, struct reg_data);
8603 Newx(d->what, count, U8);
8606 for (i = 0; i < count; i++) {
8607 d->what[i] = r->data->what[i];
8608 switch (d->what[i]) {
8609 /* legal options are one of: sSfpont
8610 see also regcomp.h and pregfree() */
8613 d->data[i] = sv_dup_inc((SV *)r->data->data[i], param);
8616 d->data[i] = av_dup_inc((AV *)r->data->data[i], param);
8619 /* This is cheating. */
8620 Newx(d->data[i], 1, struct regnode_charclass_class);
8621 StructCopy(r->data->data[i], d->data[i],
8622 struct regnode_charclass_class);
8623 ret->regstclass = (regnode*)d->data[i];
8626 /* Compiled op trees are readonly, and can thus be
8627 shared without duplication. */
8629 d->data[i] = (void*)OpREFCNT_inc((OP*)r->data->data[i]);
8633 d->data[i] = r->data->data[i];
8636 d->data[i] = r->data->data[i];
8638 ((reg_trie_data*)d->data[i])->refcount++;
8642 d->data[i] = r->data->data[i];
8644 ((reg_ac_data*)d->data[i])->refcount++;
8646 /* Trie stclasses are readonly and can thus be shared
8647 * without duplication. We free the stclass in pregfree
8648 * when the corresponding reg_ac_data struct is freed.
8650 ret->regstclass= r->regstclass;
8653 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", r->data->what[i]);
8662 Newx(ret->offsets, 2*len+1, U32);
8663 Copy(r->offsets, ret->offsets, 2*len+1, U32);
8665 ret->precomp = SAVEPVN(r->precomp, r->prelen);
8666 ret->refcnt = r->refcnt;
8667 ret->minlen = r->minlen;
8668 ret->minlenret = r->minlenret;
8669 ret->prelen = r->prelen;
8670 ret->nparens = r->nparens;
8671 ret->lastparen = r->lastparen;
8672 ret->lastcloseparen = r->lastcloseparen;
8673 ret->reganch = r->reganch;
8675 ret->sublen = r->sublen;
8677 ret->engine = r->engine;
8679 ret->paren_names = hv_dup_inc(r->paren_names, param);
8681 if (RX_MATCH_COPIED(ret))
8682 ret->subbeg = SAVEPVN(r->subbeg, r->sublen);
8685 #ifdef PERL_OLD_COPY_ON_WRITE
8686 ret->saved_copy = NULL;
8689 ptr_table_store(PL_ptr_table, r, ret);
8697 converts a regexp embedded in a MAGIC struct to its stringified form,
8698 caching the converted form in the struct and returns the cached
8701 If lp is nonnull then it is used to return the length of the
8704 If flags is nonnull and the returned string contains UTF8 then
8705 (flags & 1) will be true.
8707 If haseval is nonnull then it is used to return whether the pattern
8710 Normally called via macro:
8712 CALLREG_STRINGIFY(mg,0,0);
8716 CALLREG_AS_STR(mg,lp,flags,haseval)
8718 See sv_2pv_flags() in sv.c for an example of internal usage.
8723 Perl_reg_stringify(pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval ) {
8725 const regexp * const re = (regexp *)mg->mg_obj;
8728 const char *fptr = "msix";
8733 bool need_newline = 0;
8734 U16 reganch = (U16)((re->reganch & PMf_COMPILETIME) >> 12);
8736 while((ch = *fptr++)) {
8738 reflags[left++] = ch;
8741 reflags[right--] = ch;
8746 reflags[left] = '-';
8750 mg->mg_len = re->prelen + 4 + left;
8752 * If /x was used, we have to worry about a regex ending with a
8753 * comment later being embedded within another regex. If so, we don't
8754 * want this regex's "commentization" to leak out to the right part of
8755 * the enclosing regex, we must cap it with a newline.
8757 * So, if /x was used, we scan backwards from the end of the regex. If
8758 * we find a '#' before we find a newline, we need to add a newline
8759 * ourself. If we find a '\n' first (or if we don't find '#' or '\n'),
8760 * we don't need to add anything. -jfriedl
8762 if (PMf_EXTENDED & re->reganch) {
8763 const char *endptr = re->precomp + re->prelen;
8764 while (endptr >= re->precomp) {
8765 const char c = *(endptr--);
8767 break; /* don't need another */
8769 /* we end while in a comment, so we need a newline */
8770 mg->mg_len++; /* save space for it */
8771 need_newline = 1; /* note to add it */
8777 Newx(mg->mg_ptr, mg->mg_len + 1 + left, char);
8778 mg->mg_ptr[0] = '(';
8779 mg->mg_ptr[1] = '?';
8780 Copy(reflags, mg->mg_ptr+2, left, char);
8781 *(mg->mg_ptr+left+2) = ':';
8782 Copy(re->precomp, mg->mg_ptr+3+left, re->prelen, char);
8784 mg->mg_ptr[mg->mg_len - 2] = '\n';
8785 mg->mg_ptr[mg->mg_len - 1] = ')';
8786 mg->mg_ptr[mg->mg_len] = 0;
8789 *haseval = re->program[0].next_off;
8791 *flags = ((re->reganch & ROPT_UTF8) ? 1 : 0);
8799 #ifndef PERL_IN_XSUB_RE
8801 - regnext - dig the "next" pointer out of a node
8804 Perl_regnext(pTHX_ register regnode *p)
8807 register I32 offset;
8809 if (p == &PL_regdummy)
8812 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
8821 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
8824 STRLEN l1 = strlen(pat1);
8825 STRLEN l2 = strlen(pat2);
8828 const char *message;
8834 Copy(pat1, buf, l1 , char);
8835 Copy(pat2, buf + l1, l2 , char);
8836 buf[l1 + l2] = '\n';
8837 buf[l1 + l2 + 1] = '\0';
8839 /* ANSI variant takes additional second argument */
8840 va_start(args, pat2);
8844 msv = vmess(buf, &args);
8846 message = SvPV_const(msv,l1);
8849 Copy(message, buf, l1 , char);
8850 buf[l1-1] = '\0'; /* Overwrite \n */
8851 Perl_croak(aTHX_ "%s", buf);
8854 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
8856 #ifndef PERL_IN_XSUB_RE
8858 Perl_save_re_context(pTHX)
8862 struct re_save_state *state;
8864 SAVEVPTR(PL_curcop);
8865 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
8867 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
8868 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
8869 SSPUSHINT(SAVEt_RE_STATE);
8871 Copy(&PL_reg_state, state, 1, struct re_save_state);
8873 PL_reg_start_tmp = 0;
8874 PL_reg_start_tmpl = 0;
8875 PL_reg_oldsaved = NULL;
8876 PL_reg_oldsavedlen = 0;
8878 PL_reg_leftiter = 0;
8879 PL_reg_poscache = NULL;
8880 PL_reg_poscache_size = 0;
8881 #ifdef PERL_OLD_COPY_ON_WRITE
8885 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
8887 const REGEXP * const rx = PM_GETRE(PL_curpm);
8890 for (i = 1; i <= rx->nparens; i++) {
8891 char digits[TYPE_CHARS(long)];
8892 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
8893 GV *const *const gvp
8894 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
8897 GV * const gv = *gvp;
8898 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
8908 clear_re(pTHX_ void *r)
8911 ReREFCNT_dec((regexp *)r);
8917 S_put_byte(pTHX_ SV *sv, int c)
8919 if (isCNTRL(c) || c == 255 || !isPRINT(c))
8920 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
8921 else if (c == '-' || c == ']' || c == '\\' || c == '^')
8922 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
8924 Perl_sv_catpvf(aTHX_ sv, "%c", c);
8928 #define CLEAR_OPTSTART \
8929 if (optstart) STMT_START { \
8930 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%d nodes)\n", node - optstart)); \
8934 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
8936 STATIC const regnode *
8937 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
8938 const regnode *last, const regnode *plast,
8939 SV* sv, I32 indent, U32 depth)
8942 register U8 op = PSEUDO; /* Arbitrary non-END op. */
8943 register const regnode *next;
8944 const regnode *optstart= NULL;
8945 GET_RE_DEBUG_FLAGS_DECL;
8947 #ifdef DEBUG_DUMPUNTIL
8948 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
8949 last ? last-start : 0,plast ? plast-start : 0);
8952 if (plast && plast < last)
8955 while (PL_regkind[op] != END && (!last || node < last)) {
8956 /* While that wasn't END last time... */
8962 next = regnext((regnode *)node);
8965 if (OP(node) == OPTIMIZED) {
8966 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
8973 regprop(r, sv, node);
8974 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
8975 (int)(2*indent + 1), "", SvPVX_const(sv));
8977 if (OP(node) != OPTIMIZED) {
8978 if (next == NULL) /* Next ptr. */
8979 PerlIO_printf(Perl_debug_log, "(0)");
8980 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
8981 PerlIO_printf(Perl_debug_log, "(FAIL)");
8983 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
8985 /*if (PL_regkind[(U8)op] != TRIE)*/
8986 (void)PerlIO_putc(Perl_debug_log, '\n');
8990 if (PL_regkind[(U8)op] == BRANCHJ) {
8993 register const regnode *nnode = (OP(next) == LONGJMP
8994 ? regnext((regnode *)next)
8996 if (last && nnode > last)
8998 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
9001 else if (PL_regkind[(U8)op] == BRANCH) {
9003 DUMPUNTIL(NEXTOPER(node), next);
9005 else if ( PL_regkind[(U8)op] == TRIE ) {
9006 const regnode *this_trie = node;
9007 const char op = OP(node);
9008 const I32 n = ARG(node);
9009 const reg_ac_data * const ac = op>=AHOCORASICK ?
9010 (reg_ac_data *)r->data->data[n] :
9012 const reg_trie_data * const trie = op<AHOCORASICK ?
9013 (reg_trie_data*)r->data->data[n] :
9015 const regnode *nextbranch= NULL;
9017 sv_setpvn(sv, "", 0);
9018 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
9019 SV ** const elem_ptr = av_fetch(trie->words,word_idx,0);
9021 PerlIO_printf(Perl_debug_log, "%*s%s ",
9022 (int)(2*(indent+3)), "",
9023 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
9024 PL_colors[0], PL_colors[1],
9025 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
9026 PERL_PV_PRETTY_ELIPSES |
9032 U16 dist= trie->jump[word_idx+1];
9033 PerlIO_printf(Perl_debug_log, "(%u)\n",
9034 (dist ? this_trie + dist : next) - start);
9037 nextbranch= this_trie + trie->jump[0];
9038 DUMPUNTIL(this_trie + dist, nextbranch);
9040 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
9041 nextbranch= regnext((regnode *)nextbranch);
9043 PerlIO_printf(Perl_debug_log, "\n");
9046 if (last && next > last)
9051 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
9052 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
9053 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
9055 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
9057 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
9059 else if ( op == PLUS || op == STAR) {
9060 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
9062 else if (op == ANYOF) {
9063 /* arglen 1 + class block */
9064 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
9065 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
9066 node = NEXTOPER(node);
9068 else if (PL_regkind[(U8)op] == EXACT) {
9069 /* Literal string, where present. */
9070 node += NODE_SZ_STR(node) - 1;
9071 node = NEXTOPER(node);
9074 node = NEXTOPER(node);
9075 node += regarglen[(U8)op];
9077 if (op == CURLYX || op == OPEN)
9079 else if (op == WHILEM)
9083 #ifdef DEBUG_DUMPUNTIL
9084 PerlIO_printf(Perl_debug_log, "--- %d\n",indent);
9089 #endif /* DEBUGGING */
9093 * c-indentation-style: bsd
9095 * indent-tabs-mode: t
9098 * ex: set ts=8 sts=4 sw=4 noet: