5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
8 /* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
37 * pregcomp and pregexec -- regsub and regerror are not used in perl
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
57 **** Alterations to Henry's code are...
59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
71 #define PERL_IN_REGCOMP_C
74 #ifndef PERL_IN_XSUB_RE
79 #ifdef PERL_IN_XSUB_RE
90 # if defined(BUGGY_MSC6)
91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
92 # pragma optimize("a",off)
93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
94 # pragma optimize("w",on )
95 # endif /* BUGGY_MSC6 */
102 typedef struct RExC_state_t {
103 U32 flags; /* are we folding, multilining? */
104 char *precomp; /* uncompiled string. */
105 regexp *rx; /* perl core regexp structure */
106 regexp_internal *rxi; /* internal data for regexp object pprivate field */
107 char *start; /* Start of input for compile */
108 char *end; /* End of input for compile */
109 char *parse; /* Input-scan pointer. */
110 I32 whilem_seen; /* number of WHILEM in this expr */
111 regnode *emit_start; /* Start of emitted-code area */
112 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
113 I32 naughty; /* How bad is this pattern? */
114 I32 sawback; /* Did we see \1, ...? */
116 I32 size; /* Code size. */
117 I32 npar; /* Capture buffer count, (OPEN). */
118 I32 cpar; /* Capture buffer count, (CLOSE). */
119 I32 nestroot; /* root parens we are in - used by accept */
123 regnode **open_parens; /* pointers to open parens */
124 regnode **close_parens; /* pointers to close parens */
125 regnode *opend; /* END node in program */
127 HV *charnames; /* cache of named sequences */
128 HV *paren_names; /* Paren names */
129 regnode **recurse; /* Recurse regops */
130 I32 recurse_count; /* Number of recurse regops */
132 char *starttry; /* -Dr: where regtry was called. */
133 #define RExC_starttry (pRExC_state->starttry)
136 const char *lastparse;
138 #define RExC_lastparse (pRExC_state->lastparse)
139 #define RExC_lastnum (pRExC_state->lastnum)
143 #define RExC_flags (pRExC_state->flags)
144 #define RExC_precomp (pRExC_state->precomp)
145 #define RExC_rx (pRExC_state->rx)
146 #define RExC_rxi (pRExC_state->rxi)
147 #define RExC_start (pRExC_state->start)
148 #define RExC_end (pRExC_state->end)
149 #define RExC_parse (pRExC_state->parse)
150 #define RExC_whilem_seen (pRExC_state->whilem_seen)
151 #define RExC_offsets (pRExC_state->rxi->offsets) /* I am not like the others */
152 #define RExC_emit (pRExC_state->emit)
153 #define RExC_emit_start (pRExC_state->emit_start)
154 #define RExC_naughty (pRExC_state->naughty)
155 #define RExC_sawback (pRExC_state->sawback)
156 #define RExC_seen (pRExC_state->seen)
157 #define RExC_size (pRExC_state->size)
158 #define RExC_npar (pRExC_state->npar)
159 #define RExC_cpar (pRExC_state->cpar)
160 #define RExC_nestroot (pRExC_state->nestroot)
161 #define RExC_extralen (pRExC_state->extralen)
162 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
163 #define RExC_seen_evals (pRExC_state->seen_evals)
164 #define RExC_utf8 (pRExC_state->utf8)
165 #define RExC_charnames (pRExC_state->charnames)
166 #define RExC_open_parens (pRExC_state->open_parens)
167 #define RExC_close_parens (pRExC_state->close_parens)
168 #define RExC_opend (pRExC_state->opend)
169 #define RExC_paren_names (pRExC_state->paren_names)
170 #define RExC_recurse (pRExC_state->recurse)
171 #define RExC_recurse_count (pRExC_state->recurse_count)
173 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
174 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
175 ((*s) == '{' && regcurly(s)))
178 #undef SPSTART /* dratted cpp namespace... */
181 * Flags to be passed up and down.
183 #define WORST 0 /* Worst case. */
184 #define HASWIDTH 0x1 /* Known to match non-null strings. */
185 #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
186 #define SPSTART 0x4 /* Starts with * or +. */
187 #define TRYAGAIN 0x8 /* Weeded out a declaration. */
189 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
191 /* whether trie related optimizations are enabled */
192 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
193 #define TRIE_STUDY_OPT
194 #define FULL_TRIE_STUDY
200 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
201 #define PBITVAL(paren) (1 << ((paren) & 7))
202 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
203 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
204 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
207 /* About scan_data_t.
209 During optimisation we recurse through the regexp program performing
210 various inplace (keyhole style) optimisations. In addition study_chunk
211 and scan_commit populate this data structure with information about
212 what strings MUST appear in the pattern. We look for the longest
213 string that must appear for at a fixed location, and we look for the
214 longest string that may appear at a floating location. So for instance
219 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
220 strings (because they follow a .* construct). study_chunk will identify
221 both FOO and BAR as being the longest fixed and floating strings respectively.
223 The strings can be composites, for instance
227 will result in a composite fixed substring 'foo'.
229 For each string some basic information is maintained:
231 - offset or min_offset
232 This is the position the string must appear at, or not before.
233 It also implicitly (when combined with minlenp) tells us how many
234 character must match before the string we are searching.
235 Likewise when combined with minlenp and the length of the string
236 tells us how many characters must appear after the string we have
240 Only used for floating strings. This is the rightmost point that
241 the string can appear at. Ifset to I32 max it indicates that the
242 string can occur infinitely far to the right.
245 A pointer to the minimum length of the pattern that the string
246 was found inside. This is important as in the case of positive
247 lookahead or positive lookbehind we can have multiple patterns
252 The minimum length of the pattern overall is 3, the minimum length
253 of the lookahead part is 3, but the minimum length of the part that
254 will actually match is 1. So 'FOO's minimum length is 3, but the
255 minimum length for the F is 1. This is important as the minimum length
256 is used to determine offsets in front of and behind the string being
257 looked for. Since strings can be composites this is the length of the
258 pattern at the time it was commited with a scan_commit. Note that
259 the length is calculated by study_chunk, so that the minimum lengths
260 are not known until the full pattern has been compiled, thus the
261 pointer to the value.
265 In the case of lookbehind the string being searched for can be
266 offset past the start point of the final matching string.
267 If this value was just blithely removed from the min_offset it would
268 invalidate some of the calculations for how many chars must match
269 before or after (as they are derived from min_offset and minlen and
270 the length of the string being searched for).
271 When the final pattern is compiled and the data is moved from the
272 scan_data_t structure into the regexp structure the information
273 about lookbehind is factored in, with the information that would
274 have been lost precalculated in the end_shift field for the
277 The fields pos_min and pos_delta are used to store the minimum offset
278 and the delta to the maximum offset at the current point in the pattern.
282 typedef struct scan_data_t {
283 /*I32 len_min; unused */
284 /*I32 len_delta; unused */
288 I32 last_end; /* min value, <0 unless valid. */
291 SV **longest; /* Either &l_fixed, or &l_float. */
292 SV *longest_fixed; /* longest fixed string found in pattern */
293 I32 offset_fixed; /* offset where it starts */
294 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
295 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
296 SV *longest_float; /* longest floating string found in pattern */
297 I32 offset_float_min; /* earliest point in string it can appear */
298 I32 offset_float_max; /* latest point in string it can appear */
299 I32 *minlen_float; /* pointer to the minlen relevent to the string */
300 I32 lookbehind_float; /* is the position of the string modified by LB */
304 struct regnode_charclass_class *start_class;
308 * Forward declarations for pregcomp()'s friends.
311 static const scan_data_t zero_scan_data =
312 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
314 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
315 #define SF_BEFORE_SEOL 0x0001
316 #define SF_BEFORE_MEOL 0x0002
317 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
318 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
321 # define SF_FIX_SHIFT_EOL (0+2)
322 # define SF_FL_SHIFT_EOL (0+4)
324 # define SF_FIX_SHIFT_EOL (+2)
325 # define SF_FL_SHIFT_EOL (+4)
328 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
329 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
331 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
332 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
333 #define SF_IS_INF 0x0040
334 #define SF_HAS_PAR 0x0080
335 #define SF_IN_PAR 0x0100
336 #define SF_HAS_EVAL 0x0200
337 #define SCF_DO_SUBSTR 0x0400
338 #define SCF_DO_STCLASS_AND 0x0800
339 #define SCF_DO_STCLASS_OR 0x1000
340 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
341 #define SCF_WHILEM_VISITED_POS 0x2000
343 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
344 #define SCF_SEEN_ACCEPT 0x8000
346 #define UTF (RExC_utf8 != 0)
347 #define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0)
348 #define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0)
350 #define OOB_UNICODE 12345678
351 #define OOB_NAMEDCLASS -1
353 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
354 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
357 /* length of regex to show in messages that don't mark a position within */
358 #define RegexLengthToShowInErrorMessages 127
361 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
362 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
363 * op/pragma/warn/regcomp.
365 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
366 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
368 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
371 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
372 * arg. Show regex, up to a maximum length. If it's too long, chop and add
375 #define _FAIL(code) STMT_START { \
376 const char *ellipses = ""; \
377 IV len = RExC_end - RExC_precomp; \
380 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
381 if (len > RegexLengthToShowInErrorMessages) { \
382 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
383 len = RegexLengthToShowInErrorMessages - 10; \
389 #define FAIL(msg) _FAIL( \
390 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
391 msg, (int)len, RExC_precomp, ellipses))
393 #define FAIL2(msg,arg) _FAIL( \
394 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
395 arg, (int)len, RExC_precomp, ellipses))
398 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
400 #define Simple_vFAIL(m) STMT_START { \
401 const IV offset = RExC_parse - RExC_precomp; \
402 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
403 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
407 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
409 #define vFAIL(m) STMT_START { \
411 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
416 * Like Simple_vFAIL(), but accepts two arguments.
418 #define Simple_vFAIL2(m,a1) STMT_START { \
419 const IV offset = RExC_parse - RExC_precomp; \
420 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
421 (int)offset, RExC_precomp, RExC_precomp + offset); \
425 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
427 #define vFAIL2(m,a1) STMT_START { \
429 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
430 Simple_vFAIL2(m, a1); \
435 * Like Simple_vFAIL(), but accepts three arguments.
437 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
438 const IV offset = RExC_parse - RExC_precomp; \
439 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
440 (int)offset, RExC_precomp, RExC_precomp + offset); \
444 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
446 #define vFAIL3(m,a1,a2) STMT_START { \
448 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
449 Simple_vFAIL3(m, a1, a2); \
453 * Like Simple_vFAIL(), but accepts four arguments.
455 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
456 const IV offset = RExC_parse - RExC_precomp; \
457 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
458 (int)offset, RExC_precomp, RExC_precomp + offset); \
461 #define vWARN(loc,m) STMT_START { \
462 const IV offset = loc - RExC_precomp; \
463 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
464 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
467 #define vWARNdep(loc,m) STMT_START { \
468 const IV offset = loc - RExC_precomp; \
469 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
470 "%s" REPORT_LOCATION, \
471 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
475 #define vWARN2(loc, m, a1) STMT_START { \
476 const IV offset = loc - RExC_precomp; \
477 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
478 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
481 #define vWARN3(loc, m, a1, a2) STMT_START { \
482 const IV offset = loc - RExC_precomp; \
483 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
484 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
487 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
488 const IV offset = loc - RExC_precomp; \
489 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
490 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
493 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
494 const IV offset = loc - RExC_precomp; \
495 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
496 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
500 /* Allow for side effects in s */
501 #define REGC(c,s) STMT_START { \
502 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
505 /* Macros for recording node offsets. 20001227 mjd@plover.com
506 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
507 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
508 * Element 0 holds the number n.
509 * Position is 1 indexed.
512 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
514 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
515 __LINE__, (int)(node), (int)(byte))); \
517 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
519 RExC_offsets[2*(node)-1] = (byte); \
524 #define Set_Node_Offset(node,byte) \
525 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
526 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
528 #define Set_Node_Length_To_R(node,len) STMT_START { \
530 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
531 __LINE__, (int)(node), (int)(len))); \
533 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
535 RExC_offsets[2*(node)] = (len); \
540 #define Set_Node_Length(node,len) \
541 Set_Node_Length_To_R((node)-RExC_emit_start, len)
542 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
543 #define Set_Node_Cur_Length(node) \
544 Set_Node_Length(node, RExC_parse - parse_start)
546 /* Get offsets and lengths */
547 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
548 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
550 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
551 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
552 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
556 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
557 #define EXPERIMENTAL_INPLACESCAN
560 #define DEBUG_STUDYDATA(data,depth) \
561 DEBUG_OPTIMISE_MORE_r(if(data){ \
562 PerlIO_printf(Perl_debug_log, \
563 "%*s"/* Len:%"IVdf"/%"IVdf" */"Pos:%"IVdf"/%"IVdf \
564 " Flags: %"IVdf" Whilem_c: %"IVdf" Lcp: %"IVdf" ", \
565 (int)(depth)*2, "", \
566 (IV)((data)->pos_min), \
567 (IV)((data)->pos_delta), \
568 (IV)((data)->flags), \
569 (IV)((data)->whilem_c), \
570 (IV)((data)->last_closep ? *((data)->last_closep) : -1) \
572 if ((data)->last_found) \
573 PerlIO_printf(Perl_debug_log, \
574 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
575 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
576 SvPVX_const((data)->last_found), \
577 (IV)((data)->last_end), \
578 (IV)((data)->last_start_min), \
579 (IV)((data)->last_start_max), \
580 ((data)->longest && \
581 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
582 SvPVX_const((data)->longest_fixed), \
583 (IV)((data)->offset_fixed), \
584 ((data)->longest && \
585 (data)->longest==&((data)->longest_float)) ? "*" : "", \
586 SvPVX_const((data)->longest_float), \
587 (IV)((data)->offset_float_min), \
588 (IV)((data)->offset_float_max) \
590 PerlIO_printf(Perl_debug_log,"\n"); \
593 static void clear_re(pTHX_ void *r);
595 /* Mark that we cannot extend a found fixed substring at this point.
596 Update the longest found anchored substring and the longest found
597 floating substrings if needed. */
600 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp)
602 const STRLEN l = CHR_SVLEN(data->last_found);
603 const STRLEN old_l = CHR_SVLEN(*data->longest);
604 GET_RE_DEBUG_FLAGS_DECL;
606 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
607 SvSetMagicSV(*data->longest, data->last_found);
608 if (*data->longest == data->longest_fixed) {
609 data->offset_fixed = l ? data->last_start_min : data->pos_min;
610 if (data->flags & SF_BEFORE_EOL)
612 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
614 data->flags &= ~SF_FIX_BEFORE_EOL;
615 data->minlen_fixed=minlenp;
616 data->lookbehind_fixed=0;
619 data->offset_float_min = l ? data->last_start_min : data->pos_min;
620 data->offset_float_max = (l
621 ? data->last_start_max
622 : data->pos_min + data->pos_delta);
623 if ((U32)data->offset_float_max > (U32)I32_MAX)
624 data->offset_float_max = I32_MAX;
625 if (data->flags & SF_BEFORE_EOL)
627 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
629 data->flags &= ~SF_FL_BEFORE_EOL;
630 data->minlen_float=minlenp;
631 data->lookbehind_float=0;
634 SvCUR_set(data->last_found, 0);
636 SV * const sv = data->last_found;
637 if (SvUTF8(sv) && SvMAGICAL(sv)) {
638 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
644 data->flags &= ~SF_BEFORE_EOL;
645 DEBUG_STUDYDATA(data,0);
648 /* Can match anything (initialization) */
650 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
652 ANYOF_CLASS_ZERO(cl);
653 ANYOF_BITMAP_SETALL(cl);
654 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
656 cl->flags |= ANYOF_LOCALE;
659 /* Can match anything (initialization) */
661 S_cl_is_anything(const struct regnode_charclass_class *cl)
665 for (value = 0; value <= ANYOF_MAX; value += 2)
666 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
668 if (!(cl->flags & ANYOF_UNICODE_ALL))
670 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
675 /* Can match anything (initialization) */
677 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
679 Zero(cl, 1, struct regnode_charclass_class);
681 cl_anything(pRExC_state, cl);
685 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
687 Zero(cl, 1, struct regnode_charclass_class);
689 cl_anything(pRExC_state, cl);
691 cl->flags |= ANYOF_LOCALE;
694 /* 'And' a given class with another one. Can create false positives */
695 /* We assume that cl is not inverted */
697 S_cl_and(struct regnode_charclass_class *cl,
698 const struct regnode_charclass_class *and_with)
701 assert(and_with->type == ANYOF);
702 if (!(and_with->flags & ANYOF_CLASS)
703 && !(cl->flags & ANYOF_CLASS)
704 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
705 && !(and_with->flags & ANYOF_FOLD)
706 && !(cl->flags & ANYOF_FOLD)) {
709 if (and_with->flags & ANYOF_INVERT)
710 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
711 cl->bitmap[i] &= ~and_with->bitmap[i];
713 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
714 cl->bitmap[i] &= and_with->bitmap[i];
715 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
716 if (!(and_with->flags & ANYOF_EOS))
717 cl->flags &= ~ANYOF_EOS;
719 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
720 !(and_with->flags & ANYOF_INVERT)) {
721 cl->flags &= ~ANYOF_UNICODE_ALL;
722 cl->flags |= ANYOF_UNICODE;
723 ARG_SET(cl, ARG(and_with));
725 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
726 !(and_with->flags & ANYOF_INVERT))
727 cl->flags &= ~ANYOF_UNICODE_ALL;
728 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
729 !(and_with->flags & ANYOF_INVERT))
730 cl->flags &= ~ANYOF_UNICODE;
733 /* 'OR' a given class with another one. Can create false positives */
734 /* We assume that cl is not inverted */
736 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
738 if (or_with->flags & ANYOF_INVERT) {
740 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
741 * <= (B1 | !B2) | (CL1 | !CL2)
742 * which is wasteful if CL2 is small, but we ignore CL2:
743 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
744 * XXXX Can we handle case-fold? Unclear:
745 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
746 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
748 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
749 && !(or_with->flags & ANYOF_FOLD)
750 && !(cl->flags & ANYOF_FOLD) ) {
753 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
754 cl->bitmap[i] |= ~or_with->bitmap[i];
755 } /* XXXX: logic is complicated otherwise */
757 cl_anything(pRExC_state, cl);
760 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
761 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
762 && (!(or_with->flags & ANYOF_FOLD)
763 || (cl->flags & ANYOF_FOLD)) ) {
766 /* OR char bitmap and class bitmap separately */
767 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
768 cl->bitmap[i] |= or_with->bitmap[i];
769 if (or_with->flags & ANYOF_CLASS) {
770 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
771 cl->classflags[i] |= or_with->classflags[i];
772 cl->flags |= ANYOF_CLASS;
775 else { /* XXXX: logic is complicated, leave it along for a moment. */
776 cl_anything(pRExC_state, cl);
779 if (or_with->flags & ANYOF_EOS)
780 cl->flags |= ANYOF_EOS;
782 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
783 ARG(cl) != ARG(or_with)) {
784 cl->flags |= ANYOF_UNICODE_ALL;
785 cl->flags &= ~ANYOF_UNICODE;
787 if (or_with->flags & ANYOF_UNICODE_ALL) {
788 cl->flags |= ANYOF_UNICODE_ALL;
789 cl->flags &= ~ANYOF_UNICODE;
793 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
794 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
795 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
796 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
802 dump_trie_interim_list(trie,next_alloc)
803 dump_trie_interim_table(trie,next_alloc)
805 These routines dump out a trie in a somewhat readable format.
806 The _interim_ variants are used for debugging the interim
807 tables that are used to generate the final compressed
808 representation which is what dump_trie expects.
810 Part of the reason for their existance is to provide a form
811 of documentation as to how the different representations function.
817 Dumps the final compressed table form of the trie to Perl_debug_log.
818 Used for debugging make_trie().
822 S_dump_trie(pTHX_ const struct _reg_trie_data *trie,U32 depth)
825 SV *sv=sv_newmortal();
826 int colwidth= trie->widecharmap ? 6 : 4;
827 GET_RE_DEBUG_FLAGS_DECL;
830 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
831 (int)depth * 2 + 2,"",
832 "Match","Base","Ofs" );
834 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
835 SV ** const tmp = av_fetch( trie->revcharmap, state, 0);
837 PerlIO_printf( Perl_debug_log, "%*s",
839 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
840 PL_colors[0], PL_colors[1],
841 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
842 PERL_PV_ESCAPE_FIRSTCHAR
847 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
848 (int)depth * 2 + 2,"");
850 for( state = 0 ; state < trie->uniquecharcount ; state++ )
851 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
852 PerlIO_printf( Perl_debug_log, "\n");
854 for( state = 1 ; state < trie->statecount ; state++ ) {
855 const U32 base = trie->states[ state ].trans.base;
857 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
859 if ( trie->states[ state ].wordnum ) {
860 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
862 PerlIO_printf( Perl_debug_log, "%6s", "" );
865 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
870 while( ( base + ofs < trie->uniquecharcount ) ||
871 ( base + ofs - trie->uniquecharcount < trie->lasttrans
872 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
875 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
877 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
878 if ( ( base + ofs >= trie->uniquecharcount ) &&
879 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
880 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
882 PerlIO_printf( Perl_debug_log, "%*"UVXf,
884 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
886 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
890 PerlIO_printf( Perl_debug_log, "]");
893 PerlIO_printf( Perl_debug_log, "\n" );
897 dump_trie_interim_list(trie,next_alloc)
898 Dumps a fully constructed but uncompressed trie in list form.
899 List tries normally only are used for construction when the number of
900 possible chars (trie->uniquecharcount) is very high.
901 Used for debugging make_trie().
904 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc,U32 depth)
907 SV *sv=sv_newmortal();
908 int colwidth= trie->widecharmap ? 6 : 4;
909 GET_RE_DEBUG_FLAGS_DECL;
910 /* print out the table precompression. */
911 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
912 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
913 "------:-----+-----------------\n" );
915 for( state=1 ; state < next_alloc ; state ++ ) {
918 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
919 (int)depth * 2 + 2,"", (UV)state );
920 if ( ! trie->states[ state ].wordnum ) {
921 PerlIO_printf( Perl_debug_log, "%5s| ","");
923 PerlIO_printf( Perl_debug_log, "W%4x| ",
924 trie->states[ state ].wordnum
927 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
928 SV ** const tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
930 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
932 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
933 PL_colors[0], PL_colors[1],
934 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
935 PERL_PV_ESCAPE_FIRSTCHAR
937 TRIE_LIST_ITEM(state,charid).forid,
938 (UV)TRIE_LIST_ITEM(state,charid).newstate
941 PerlIO_printf(Perl_debug_log, "\n%*s| ",
942 (int)((depth * 2) + 14), "");
945 PerlIO_printf( Perl_debug_log, "\n");
950 dump_trie_interim_table(trie,next_alloc)
951 Dumps a fully constructed but uncompressed trie in table form.
952 This is the normal DFA style state transition table, with a few
953 twists to facilitate compression later.
954 Used for debugging make_trie().
957 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc, U32 depth)
961 SV *sv=sv_newmortal();
962 int colwidth= trie->widecharmap ? 6 : 4;
963 GET_RE_DEBUG_FLAGS_DECL;
966 print out the table precompression so that we can do a visual check
967 that they are identical.
970 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
972 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
973 SV ** const tmp = av_fetch( trie->revcharmap, charid, 0);
975 PerlIO_printf( Perl_debug_log, "%*s",
977 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
978 PL_colors[0], PL_colors[1],
979 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
980 PERL_PV_ESCAPE_FIRSTCHAR
986 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
988 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
989 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
992 PerlIO_printf( Perl_debug_log, "\n" );
994 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
996 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
997 (int)depth * 2 + 2,"",
998 (UV)TRIE_NODENUM( state ) );
1000 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1001 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1003 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1005 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1007 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1008 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1010 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1011 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1018 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1019 startbranch: the first branch in the whole branch sequence
1020 first : start branch of sequence of branch-exact nodes.
1021 May be the same as startbranch
1022 last : Thing following the last branch.
1023 May be the same as tail.
1024 tail : item following the branch sequence
1025 count : words in the sequence
1026 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1027 depth : indent depth
1029 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1031 A trie is an N'ary tree where the branches are determined by digital
1032 decomposition of the key. IE, at the root node you look up the 1st character and
1033 follow that branch repeat until you find the end of the branches. Nodes can be
1034 marked as "accepting" meaning they represent a complete word. Eg:
1038 would convert into the following structure. Numbers represent states, letters
1039 following numbers represent valid transitions on the letter from that state, if
1040 the number is in square brackets it represents an accepting state, otherwise it
1041 will be in parenthesis.
1043 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1047 (1) +-i->(6)-+-s->[7]
1049 +-s->(3)-+-h->(4)-+-e->[5]
1051 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1053 This shows that when matching against the string 'hers' we will begin at state 1
1054 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1055 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1056 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1057 single traverse. We store a mapping from accepting to state to which word was
1058 matched, and then when we have multiple possibilities we try to complete the
1059 rest of the regex in the order in which they occured in the alternation.
1061 The only prior NFA like behaviour that would be changed by the TRIE support is
1062 the silent ignoring of duplicate alternations which are of the form:
1064 / (DUPE|DUPE) X? (?{ ... }) Y /x
1066 Thus EVAL blocks follwing a trie may be called a different number of times with
1067 and without the optimisation. With the optimisations dupes will be silently
1068 ignored. This inconsistant behaviour of EVAL type nodes is well established as
1069 the following demonstrates:
1071 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1073 which prints out 'word' three times, but
1075 'words'=~/(word|word|word)(?{ print $1 })S/
1077 which doesnt print it out at all. This is due to other optimisations kicking in.
1079 Example of what happens on a structural level:
1081 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1083 1: CURLYM[1] {1,32767}(18)
1094 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1095 and should turn into:
1097 1: CURLYM[1] {1,32767}(18)
1099 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1107 Cases where tail != last would be like /(?foo|bar)baz/:
1117 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1118 and would end up looking like:
1121 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1128 d = uvuni_to_utf8_flags(d, uv, 0);
1130 is the recommended Unicode-aware way of saying
1135 #define TRIE_STORE_REVCHAR \
1137 SV *tmp = newSVpvs(""); \
1138 if (UTF) SvUTF8_on(tmp); \
1139 Perl_sv_catpvf( aTHX_ tmp, "%c", (int)uvc ); \
1140 av_push( TRIE_REVCHARMAP(trie), tmp ); \
1143 #define TRIE_READ_CHAR STMT_START { \
1147 if ( foldlen > 0 ) { \
1148 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1153 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1154 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1155 foldlen -= UNISKIP( uvc ); \
1156 scan = foldbuf + UNISKIP( uvc ); \
1159 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1169 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1170 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1171 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1172 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1174 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1175 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1176 TRIE_LIST_CUR( state )++; \
1179 #define TRIE_LIST_NEW(state) STMT_START { \
1180 Newxz( trie->states[ state ].trans.list, \
1181 4, reg_trie_trans_le ); \
1182 TRIE_LIST_CUR( state ) = 1; \
1183 TRIE_LIST_LEN( state ) = 4; \
1186 #define TRIE_HANDLE_WORD(state) STMT_START { \
1187 U16 dupe= trie->states[ state ].wordnum; \
1188 regnode * const noper_next = regnext( noper ); \
1190 if (trie->wordlen) \
1191 trie->wordlen[ curword ] = wordlen; \
1193 /* store the word for dumping */ \
1195 if (OP(noper) != NOTHING) \
1196 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
1198 tmp = newSVpvn( "", 0 ); \
1199 if ( UTF ) SvUTF8_on( tmp ); \
1200 av_push( trie->words, tmp ); \
1205 if ( noper_next < tail ) { \
1207 Newxz( trie->jump, word_count + 1, U16); \
1208 trie->jump[curword] = (U16)(noper_next - convert); \
1210 jumper = noper_next; \
1212 nextbranch= regnext(cur); \
1216 /* So it's a dupe. This means we need to maintain a */\
1217 /* linked-list from the first to the next. */\
1218 /* we only allocate the nextword buffer when there */\
1219 /* a dupe, so first time we have to do the allocation */\
1220 if (!trie->nextword) \
1221 Newxz( trie->nextword, word_count + 1, U16); \
1222 while ( trie->nextword[dupe] ) \
1223 dupe= trie->nextword[dupe]; \
1224 trie->nextword[dupe]= curword; \
1226 /* we haven't inserted this word yet. */ \
1227 trie->states[ state ].wordnum = curword; \
1232 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1233 ( ( base + charid >= ucharcount \
1234 && base + charid < ubound \
1235 && state == trie->trans[ base - ucharcount + charid ].check \
1236 && trie->trans[ base - ucharcount + charid ].next ) \
1237 ? trie->trans[ base - ucharcount + charid ].next \
1238 : ( state==1 ? special : 0 ) \
1242 #define MADE_JUMP_TRIE 2
1243 #define MADE_EXACT_TRIE 4
1246 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1249 /* first pass, loop through and scan words */
1250 reg_trie_data *trie;
1252 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1257 regnode *jumper = NULL;
1258 regnode *nextbranch = NULL;
1259 regnode *convert = NULL;
1260 /* we just use folder as a flag in utf8 */
1261 const U8 * const folder = ( flags == EXACTF
1263 : ( flags == EXACTFL
1269 const U32 data_slot = add_data( pRExC_state, 1, "t" );
1270 SV *re_trie_maxbuff;
1272 /* these are only used during construction but are useful during
1273 * debugging so we store them in the struct when debugging.
1275 STRLEN trie_charcount=0;
1276 AV *trie_revcharmap;
1278 GET_RE_DEBUG_FLAGS_DECL;
1280 PERL_UNUSED_ARG(depth);
1283 Newxz( trie, 1, reg_trie_data );
1285 trie->startstate = 1;
1286 trie->wordcount = word_count;
1287 RExC_rxi->data->data[ data_slot ] = (void*)trie;
1288 Newxz( trie->charmap, 256, U16 );
1289 if (!(UTF && folder))
1290 Newxz( trie->bitmap, ANYOF_BITMAP_SIZE, char );
1292 trie->words = newAV();
1294 TRIE_REVCHARMAP(trie) = newAV();
1296 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1297 if (!SvIOK(re_trie_maxbuff)) {
1298 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1301 PerlIO_printf( Perl_debug_log,
1302 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1303 (int)depth * 2 + 2, "",
1304 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1305 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1309 /* Find the node we are going to overwrite */
1310 if ( first == startbranch && OP( last ) != BRANCH ) {
1311 /* whole branch chain */
1314 /* branch sub-chain */
1315 convert = NEXTOPER( first );
1318 /* -- First loop and Setup --
1320 We first traverse the branches and scan each word to determine if it
1321 contains widechars, and how many unique chars there are, this is
1322 important as we have to build a table with at least as many columns as we
1325 We use an array of integers to represent the character codes 0..255
1326 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1327 native representation of the character value as the key and IV's for the
1330 *TODO* If we keep track of how many times each character is used we can
1331 remap the columns so that the table compression later on is more
1332 efficient in terms of memory by ensuring most common value is in the
1333 middle and the least common are on the outside. IMO this would be better
1334 than a most to least common mapping as theres a decent chance the most
1335 common letter will share a node with the least common, meaning the node
1336 will not be compressable. With a middle is most common approach the worst
1337 case is when we have the least common nodes twice.
1341 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1342 regnode * const noper = NEXTOPER( cur );
1343 const U8 *uc = (U8*)STRING( noper );
1344 const U8 * const e = uc + STR_LEN( noper );
1346 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1347 const U8 *scan = (U8*)NULL;
1348 U32 wordlen = 0; /* required init */
1351 if (OP(noper) == NOTHING) {
1356 TRIE_BITMAP_SET(trie,*uc);
1357 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1359 for ( ; uc < e ; uc += len ) {
1360 TRIE_CHARCOUNT(trie)++;
1364 if ( !trie->charmap[ uvc ] ) {
1365 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1367 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1372 if ( !trie->widecharmap )
1373 trie->widecharmap = newHV();
1375 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1378 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1380 if ( !SvTRUE( *svpp ) ) {
1381 sv_setiv( *svpp, ++trie->uniquecharcount );
1386 if( cur == first ) {
1389 } else if (chars < trie->minlen) {
1391 } else if (chars > trie->maxlen) {
1395 } /* end first pass */
1396 DEBUG_TRIE_COMPILE_r(
1397 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1398 (int)depth * 2 + 2,"",
1399 ( trie->widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1400 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1401 (int)trie->minlen, (int)trie->maxlen )
1403 Newxz( trie->wordlen, word_count, U32 );
1406 We now know what we are dealing with in terms of unique chars and
1407 string sizes so we can calculate how much memory a naive
1408 representation using a flat table will take. If it's over a reasonable
1409 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1410 conservative but potentially much slower representation using an array
1413 At the end we convert both representations into the same compressed
1414 form that will be used in regexec.c for matching with. The latter
1415 is a form that cannot be used to construct with but has memory
1416 properties similar to the list form and access properties similar
1417 to the table form making it both suitable for fast searches and
1418 small enough that its feasable to store for the duration of a program.
1420 See the comment in the code where the compressed table is produced
1421 inplace from the flat tabe representation for an explanation of how
1422 the compression works.
1427 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1429 Second Pass -- Array Of Lists Representation
1431 Each state will be represented by a list of charid:state records
1432 (reg_trie_trans_le) the first such element holds the CUR and LEN
1433 points of the allocated array. (See defines above).
1435 We build the initial structure using the lists, and then convert
1436 it into the compressed table form which allows faster lookups
1437 (but cant be modified once converted).
1440 STRLEN transcount = 1;
1442 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1443 "%*sCompiling trie using list compiler\n",
1444 (int)depth * 2 + 2, ""));
1446 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
1450 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1452 regnode * const noper = NEXTOPER( cur );
1453 U8 *uc = (U8*)STRING( noper );
1454 const U8 * const e = uc + STR_LEN( noper );
1455 U32 state = 1; /* required init */
1456 U16 charid = 0; /* sanity init */
1457 U8 *scan = (U8*)NULL; /* sanity init */
1458 STRLEN foldlen = 0; /* required init */
1459 U32 wordlen = 0; /* required init */
1460 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1462 if (OP(noper) != NOTHING) {
1463 for ( ; uc < e ; uc += len ) {
1468 charid = trie->charmap[ uvc ];
1470 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1474 charid=(U16)SvIV( *svpp );
1477 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1484 if ( !trie->states[ state ].trans.list ) {
1485 TRIE_LIST_NEW( state );
1487 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1488 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1489 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1494 newstate = next_alloc++;
1495 TRIE_LIST_PUSH( state, charid, newstate );
1500 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1504 TRIE_HANDLE_WORD(state);
1506 } /* end second pass */
1508 /* next alloc is the NEXT state to be allocated */
1509 trie->statecount = next_alloc;
1510 Renew( trie->states, next_alloc, reg_trie_state );
1512 /* and now dump it out before we compress it */
1513 DEBUG_TRIE_COMPILE_MORE_r(
1514 dump_trie_interim_list(trie,next_alloc,depth+1)
1517 Newxz( trie->trans, transcount ,reg_trie_trans );
1524 for( state=1 ; state < next_alloc ; state ++ ) {
1528 DEBUG_TRIE_COMPILE_MORE_r(
1529 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1533 if (trie->states[state].trans.list) {
1534 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1538 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1539 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1540 if ( forid < minid ) {
1542 } else if ( forid > maxid ) {
1546 if ( transcount < tp + maxid - minid + 1) {
1548 Renew( trie->trans, transcount, reg_trie_trans );
1549 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1551 base = trie->uniquecharcount + tp - minid;
1552 if ( maxid == minid ) {
1554 for ( ; zp < tp ; zp++ ) {
1555 if ( ! trie->trans[ zp ].next ) {
1556 base = trie->uniquecharcount + zp - minid;
1557 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1558 trie->trans[ zp ].check = state;
1564 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1565 trie->trans[ tp ].check = state;
1570 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1571 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1572 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1573 trie->trans[ tid ].check = state;
1575 tp += ( maxid - minid + 1 );
1577 Safefree(trie->states[ state ].trans.list);
1580 DEBUG_TRIE_COMPILE_MORE_r(
1581 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1584 trie->states[ state ].trans.base=base;
1586 trie->lasttrans = tp + 1;
1590 Second Pass -- Flat Table Representation.
1592 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1593 We know that we will need Charcount+1 trans at most to store the data
1594 (one row per char at worst case) So we preallocate both structures
1595 assuming worst case.
1597 We then construct the trie using only the .next slots of the entry
1600 We use the .check field of the first entry of the node temporarily to
1601 make compression both faster and easier by keeping track of how many non
1602 zero fields are in the node.
1604 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1607 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1608 number representing the first entry of the node, and state as a
1609 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1610 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1611 are 2 entrys per node. eg:
1619 The table is internally in the right hand, idx form. However as we also
1620 have to deal with the states array which is indexed by nodenum we have to
1621 use TRIE_NODENUM() to convert.
1624 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1625 "%*sCompiling trie using table compiler\n",
1626 (int)depth * 2 + 2, ""));
1628 Newxz( trie->trans, ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1,
1630 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
1631 next_alloc = trie->uniquecharcount + 1;
1634 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1636 regnode * const noper = NEXTOPER( cur );
1637 const U8 *uc = (U8*)STRING( noper );
1638 const U8 * const e = uc + STR_LEN( noper );
1640 U32 state = 1; /* required init */
1642 U16 charid = 0; /* sanity init */
1643 U32 accept_state = 0; /* sanity init */
1644 U8 *scan = (U8*)NULL; /* sanity init */
1646 STRLEN foldlen = 0; /* required init */
1647 U32 wordlen = 0; /* required init */
1648 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1650 if ( OP(noper) != NOTHING ) {
1651 for ( ; uc < e ; uc += len ) {
1656 charid = trie->charmap[ uvc ];
1658 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1659 charid = svpp ? (U16)SvIV(*svpp) : 0;
1663 if ( !trie->trans[ state + charid ].next ) {
1664 trie->trans[ state + charid ].next = next_alloc;
1665 trie->trans[ state ].check++;
1666 next_alloc += trie->uniquecharcount;
1668 state = trie->trans[ state + charid ].next;
1670 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1672 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1675 accept_state = TRIE_NODENUM( state );
1676 TRIE_HANDLE_WORD(accept_state);
1678 } /* end second pass */
1680 /* and now dump it out before we compress it */
1681 DEBUG_TRIE_COMPILE_MORE_r(
1682 dump_trie_interim_table(trie,next_alloc,depth+1)
1687 * Inplace compress the table.*
1689 For sparse data sets the table constructed by the trie algorithm will
1690 be mostly 0/FAIL transitions or to put it another way mostly empty.
1691 (Note that leaf nodes will not contain any transitions.)
1693 This algorithm compresses the tables by eliminating most such
1694 transitions, at the cost of a modest bit of extra work during lookup:
1696 - Each states[] entry contains a .base field which indicates the
1697 index in the state[] array wheres its transition data is stored.
1699 - If .base is 0 there are no valid transitions from that node.
1701 - If .base is nonzero then charid is added to it to find an entry in
1704 -If trans[states[state].base+charid].check!=state then the
1705 transition is taken to be a 0/Fail transition. Thus if there are fail
1706 transitions at the front of the node then the .base offset will point
1707 somewhere inside the previous nodes data (or maybe even into a node
1708 even earlier), but the .check field determines if the transition is
1712 The following process inplace converts the table to the compressed
1713 table: We first do not compress the root node 1,and mark its all its
1714 .check pointers as 1 and set its .base pointer as 1 as well. This
1715 allows to do a DFA construction from the compressed table later, and
1716 ensures that any .base pointers we calculate later are greater than
1719 - We set 'pos' to indicate the first entry of the second node.
1721 - We then iterate over the columns of the node, finding the first and
1722 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1723 and set the .check pointers accordingly, and advance pos
1724 appropriately and repreat for the next node. Note that when we copy
1725 the next pointers we have to convert them from the original
1726 NODEIDX form to NODENUM form as the former is not valid post
1729 - If a node has no transitions used we mark its base as 0 and do not
1730 advance the pos pointer.
1732 - If a node only has one transition we use a second pointer into the
1733 structure to fill in allocated fail transitions from other states.
1734 This pointer is independent of the main pointer and scans forward
1735 looking for null transitions that are allocated to a state. When it
1736 finds one it writes the single transition into the "hole". If the
1737 pointer doesnt find one the single transition is appended as normal.
1739 - Once compressed we can Renew/realloc the structures to release the
1742 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1743 specifically Fig 3.47 and the associated pseudocode.
1747 const U32 laststate = TRIE_NODENUM( next_alloc );
1750 trie->statecount = laststate;
1752 for ( state = 1 ; state < laststate ; state++ ) {
1754 const U32 stateidx = TRIE_NODEIDX( state );
1755 const U32 o_used = trie->trans[ stateidx ].check;
1756 U32 used = trie->trans[ stateidx ].check;
1757 trie->trans[ stateidx ].check = 0;
1759 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1760 if ( flag || trie->trans[ stateidx + charid ].next ) {
1761 if ( trie->trans[ stateidx + charid ].next ) {
1763 for ( ; zp < pos ; zp++ ) {
1764 if ( ! trie->trans[ zp ].next ) {
1768 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1769 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1770 trie->trans[ zp ].check = state;
1771 if ( ++zp > pos ) pos = zp;
1778 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1780 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1781 trie->trans[ pos ].check = state;
1786 trie->lasttrans = pos + 1;
1787 Renew( trie->states, laststate, reg_trie_state);
1788 DEBUG_TRIE_COMPILE_MORE_r(
1789 PerlIO_printf( Perl_debug_log,
1790 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1791 (int)depth * 2 + 2,"",
1792 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1795 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1798 } /* end table compress */
1800 DEBUG_TRIE_COMPILE_MORE_r(
1801 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1802 (int)depth * 2 + 2, "",
1803 (UV)trie->statecount,
1804 (UV)trie->lasttrans)
1806 /* resize the trans array to remove unused space */
1807 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
1809 /* and now dump out the compressed format */
1810 DEBUG_TRIE_COMPILE_r(
1811 dump_trie(trie,depth+1)
1814 { /* Modify the program and insert the new TRIE node*/
1815 U8 nodetype =(U8)(flags & 0xFF);
1819 regnode *optimize = NULL;
1821 U32 mjd_nodelen = 0;
1824 This means we convert either the first branch or the first Exact,
1825 depending on whether the thing following (in 'last') is a branch
1826 or not and whther first is the startbranch (ie is it a sub part of
1827 the alternation or is it the whole thing.)
1828 Assuming its a sub part we conver the EXACT otherwise we convert
1829 the whole branch sequence, including the first.
1831 /* Find the node we are going to overwrite */
1832 if ( first != startbranch || OP( last ) == BRANCH ) {
1833 /* branch sub-chain */
1834 NEXT_OFF( first ) = (U16)(last - first);
1836 mjd_offset= Node_Offset((convert));
1837 mjd_nodelen= Node_Length((convert));
1839 /* whole branch chain */
1842 const regnode *nop = NEXTOPER( convert );
1843 mjd_offset= Node_Offset((nop));
1844 mjd_nodelen= Node_Length((nop));
1849 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1850 (int)depth * 2 + 2, "",
1851 (UV)mjd_offset, (UV)mjd_nodelen)
1854 /* But first we check to see if there is a common prefix we can
1855 split out as an EXACT and put in front of the TRIE node. */
1856 trie->startstate= 1;
1857 if ( trie->bitmap && !trie->widecharmap && !trie->jump ) {
1859 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
1863 const U32 base = trie->states[ state ].trans.base;
1865 if ( trie->states[state].wordnum )
1868 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1869 if ( ( base + ofs >= trie->uniquecharcount ) &&
1870 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1871 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1873 if ( ++count > 1 ) {
1874 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), ofs, 0);
1875 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
1876 if ( state == 1 ) break;
1878 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1880 PerlIO_printf(Perl_debug_log,
1881 "%*sNew Start State=%"UVuf" Class: [",
1882 (int)depth * 2 + 2, "",
1885 SV ** const tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1886 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
1888 TRIE_BITMAP_SET(trie,*ch);
1890 TRIE_BITMAP_SET(trie, folder[ *ch ]);
1892 PerlIO_printf(Perl_debug_log, (char*)ch)
1896 TRIE_BITMAP_SET(trie,*ch);
1898 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1899 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1905 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1906 char *ch = SvPV_nolen( *tmp );
1908 SV *sv=sv_newmortal();
1909 PerlIO_printf( Perl_debug_log,
1910 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1911 (int)depth * 2 + 2, "",
1913 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
1914 PL_colors[0], PL_colors[1],
1915 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1916 PERL_PV_ESCAPE_FIRSTCHAR
1921 OP( convert ) = nodetype;
1922 str=STRING(convert);
1933 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
1939 regnode *n = convert+NODE_SZ_STR(convert);
1940 NEXT_OFF(convert) = NODE_SZ_STR(convert);
1941 trie->startstate = state;
1942 trie->minlen -= (state - 1);
1943 trie->maxlen -= (state - 1);
1945 regnode *fix = convert;
1946 U32 word = trie->wordcount;
1948 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1949 while( ++fix < n ) {
1950 Set_Node_Offset_Length(fix, 0, 0);
1953 SV ** const tmp = av_fetch( trie->words, word, 0 );
1955 if ( STR_LEN(convert) <= SvCUR(*tmp) )
1956 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
1958 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
1965 NEXT_OFF(convert) = (U16)(tail - convert);
1966 DEBUG_r(optimize= n);
1972 if ( trie->maxlen ) {
1973 NEXT_OFF( convert ) = (U16)(tail - convert);
1974 ARG_SET( convert, data_slot );
1975 /* Store the offset to the first unabsorbed branch in
1976 jump[0], which is otherwise unused by the jump logic.
1977 We use this when dumping a trie and during optimisation. */
1979 trie->jump[0] = (U16)(nextbranch - convert);
1982 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1983 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
1985 OP( convert ) = TRIEC;
1986 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
1987 Safefree(trie->bitmap);
1990 OP( convert ) = TRIE;
1992 /* store the type in the flags */
1993 convert->flags = nodetype;
1997 + regarglen[ OP( convert ) ];
1999 /* XXX We really should free up the resource in trie now,
2000 as we won't use them - (which resources?) dmq */
2002 /* needed for dumping*/
2003 DEBUG_r(if (optimize) {
2004 regnode *opt = convert;
2005 while ( ++opt < optimize) {
2006 Set_Node_Offset_Length(opt,0,0);
2009 Try to clean up some of the debris left after the
2012 while( optimize < jumper ) {
2013 mjd_nodelen += Node_Length((optimize));
2014 OP( optimize ) = OPTIMIZED;
2015 Set_Node_Offset_Length(optimize,0,0);
2018 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2020 } /* end node insert */
2022 SvREFCNT_dec(TRIE_REVCHARMAP(trie));
2026 : trie->startstate>1
2032 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2034 /* The Trie is constructed and compressed now so we can build a fail array now if its needed
2036 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2037 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2040 We find the fail state for each state in the trie, this state is the longest proper
2041 suffix of the current states 'word' that is also a proper prefix of another word in our
2042 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2043 the DFA not to have to restart after its tried and failed a word at a given point, it
2044 simply continues as though it had been matching the other word in the first place.
2046 'abcdgu'=~/abcdefg|cdgu/
2047 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2048 fail, which would bring use to the state representing 'd' in the second word where we would
2049 try 'g' and succeed, prodceding to match 'cdgu'.
2051 /* add a fail transition */
2052 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[ARG(source)];
2054 const U32 ucharcount = trie->uniquecharcount;
2055 const U32 numstates = trie->statecount;
2056 const U32 ubound = trie->lasttrans + ucharcount;
2060 U32 base = trie->states[ 1 ].trans.base;
2063 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2064 GET_RE_DEBUG_FLAGS_DECL;
2066 PERL_UNUSED_ARG(depth);
2070 ARG_SET( stclass, data_slot );
2071 Newxz( aho, 1, reg_ac_data );
2072 RExC_rxi->data->data[ data_slot ] = (void*)aho;
2074 aho->states=(reg_trie_state *)savepvn((const char*)trie->states,
2075 numstates * sizeof(reg_trie_state));
2076 Newxz( q, numstates, U32);
2077 Newxz( aho->fail, numstates, U32 );
2080 /* initialize fail[0..1] to be 1 so that we always have
2081 a valid final fail state */
2082 fail[ 0 ] = fail[ 1 ] = 1;
2084 for ( charid = 0; charid < ucharcount ; charid++ ) {
2085 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2087 q[ q_write ] = newstate;
2088 /* set to point at the root */
2089 fail[ q[ q_write++ ] ]=1;
2092 while ( q_read < q_write) {
2093 const U32 cur = q[ q_read++ % numstates ];
2094 base = trie->states[ cur ].trans.base;
2096 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2097 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2099 U32 fail_state = cur;
2102 fail_state = fail[ fail_state ];
2103 fail_base = aho->states[ fail_state ].trans.base;
2104 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2106 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2107 fail[ ch_state ] = fail_state;
2108 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2110 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2112 q[ q_write++ % numstates] = ch_state;
2116 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2117 when we fail in state 1, this allows us to use the
2118 charclass scan to find a valid start char. This is based on the principle
2119 that theres a good chance the string being searched contains lots of stuff
2120 that cant be a start char.
2122 fail[ 0 ] = fail[ 1 ] = 0;
2123 DEBUG_TRIE_COMPILE_r({
2124 PerlIO_printf(Perl_debug_log,
2125 "%*sStclass Failtable (%"UVuf" states): 0",
2126 (int)(depth * 2), "", (UV)numstates
2128 for( q_read=1; q_read<numstates; q_read++ ) {
2129 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2131 PerlIO_printf(Perl_debug_log, "\n");
2134 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2139 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2140 * These need to be revisited when a newer toolchain becomes available.
2142 #if defined(__sparc64__) && defined(__GNUC__)
2143 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2144 # undef SPARC64_GCC_WORKAROUND
2145 # define SPARC64_GCC_WORKAROUND 1
2149 #define DEBUG_PEEP(str,scan,depth) \
2150 DEBUG_OPTIMISE_r({if (scan){ \
2151 SV * const mysv=sv_newmortal(); \
2152 regnode *Next = regnext(scan); \
2153 regprop(RExC_rx, mysv, scan); \
2154 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2155 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2156 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2163 #define JOIN_EXACT(scan,min,flags) \
2164 if (PL_regkind[OP(scan)] == EXACT) \
2165 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2168 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2169 /* Merge several consecutive EXACTish nodes into one. */
2170 regnode *n = regnext(scan);
2172 regnode *next = scan + NODE_SZ_STR(scan);
2176 regnode *stop = scan;
2177 GET_RE_DEBUG_FLAGS_DECL;
2179 PERL_UNUSED_ARG(depth);
2181 #ifndef EXPERIMENTAL_INPLACESCAN
2182 PERL_UNUSED_ARG(flags);
2183 PERL_UNUSED_ARG(val);
2185 DEBUG_PEEP("join",scan,depth);
2187 /* Skip NOTHING, merge EXACT*. */
2189 ( PL_regkind[OP(n)] == NOTHING ||
2190 (stringok && (OP(n) == OP(scan))))
2192 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2194 if (OP(n) == TAIL || n > next)
2196 if (PL_regkind[OP(n)] == NOTHING) {
2197 DEBUG_PEEP("skip:",n,depth);
2198 NEXT_OFF(scan) += NEXT_OFF(n);
2199 next = n + NODE_STEP_REGNODE;
2206 else if (stringok) {
2207 const unsigned int oldl = STR_LEN(scan);
2208 regnode * const nnext = regnext(n);
2210 DEBUG_PEEP("merg",n,depth);
2213 if (oldl + STR_LEN(n) > U8_MAX)
2215 NEXT_OFF(scan) += NEXT_OFF(n);
2216 STR_LEN(scan) += STR_LEN(n);
2217 next = n + NODE_SZ_STR(n);
2218 /* Now we can overwrite *n : */
2219 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2227 #ifdef EXPERIMENTAL_INPLACESCAN
2228 if (flags && !NEXT_OFF(n)) {
2229 DEBUG_PEEP("atch", val, depth);
2230 if (reg_off_by_arg[OP(n)]) {
2231 ARG_SET(n, val - n);
2234 NEXT_OFF(n) = val - n;
2241 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2243 Two problematic code points in Unicode casefolding of EXACT nodes:
2245 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2246 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2252 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2253 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2255 This means that in case-insensitive matching (or "loose matching",
2256 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2257 length of the above casefolded versions) can match a target string
2258 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2259 This would rather mess up the minimum length computation.
2261 What we'll do is to look for the tail four bytes, and then peek
2262 at the preceding two bytes to see whether we need to decrease
2263 the minimum length by four (six minus two).
2265 Thanks to the design of UTF-8, there cannot be false matches:
2266 A sequence of valid UTF-8 bytes cannot be a subsequence of
2267 another valid sequence of UTF-8 bytes.
2270 char * const s0 = STRING(scan), *s, *t;
2271 char * const s1 = s0 + STR_LEN(scan) - 1;
2272 char * const s2 = s1 - 4;
2273 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2274 const char t0[] = "\xaf\x49\xaf\x42";
2276 const char t0[] = "\xcc\x88\xcc\x81";
2278 const char * const t1 = t0 + 3;
2281 s < s2 && (t = ninstr(s, s1, t0, t1));
2284 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2285 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2287 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2288 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2296 n = scan + NODE_SZ_STR(scan);
2298 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2305 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2309 /* REx optimizer. Converts nodes into quickier variants "in place".
2310 Finds fixed substrings. */
2312 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2313 to the position after last scanned or to NULL. */
2315 #define INIT_AND_WITHP \
2316 assert(!and_withp); \
2317 Newx(and_withp,1,struct regnode_charclass_class); \
2318 SAVEFREEPV(and_withp)
2320 /* this is a chain of data about sub patterns we are processing that
2321 need to be handled seperately/specially in study_chunk. Its so
2322 we can simulate recursion without losing state. */
2324 typedef struct scan_frame {
2325 regnode *last; /* last node to process in this frame */
2326 regnode *next; /* next node to process when last is reached */
2327 struct scan_frame *prev; /*previous frame*/
2328 I32 stop; /* what stopparen do we use */
2332 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2333 I32 *minlenp, I32 *deltap,
2338 struct regnode_charclass_class *and_withp,
2339 U32 flags, U32 depth)
2340 /* scanp: Start here (read-write). */
2341 /* deltap: Write maxlen-minlen here. */
2342 /* last: Stop before this one. */
2343 /* data: string data about the pattern */
2344 /* stopparen: treat close N as END */
2345 /* recursed: which subroutines have we recursed into */
2346 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2349 I32 min = 0, pars = 0, code;
2350 regnode *scan = *scanp, *next;
2352 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2353 int is_inf_internal = 0; /* The studied chunk is infinite */
2354 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2355 scan_data_t data_fake;
2356 SV *re_trie_maxbuff = NULL;
2357 regnode *first_non_open = scan;
2358 I32 stopmin = I32_MAX;
2359 scan_frame *frame = NULL;
2361 GET_RE_DEBUG_FLAGS_DECL;
2364 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2368 while (first_non_open && OP(first_non_open) == OPEN)
2369 first_non_open=regnext(first_non_open);
2374 while ( scan && OP(scan) != END && scan < last ){
2375 /* Peephole optimizer: */
2376 DEBUG_STUDYDATA(data,depth);
2377 DEBUG_PEEP("Peep",scan,depth);
2378 JOIN_EXACT(scan,&min,0);
2380 /* Follow the next-chain of the current node and optimize
2381 away all the NOTHINGs from it. */
2382 if (OP(scan) != CURLYX) {
2383 const int max = (reg_off_by_arg[OP(scan)]
2385 /* I32 may be smaller than U16 on CRAYs! */
2386 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2387 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2391 /* Skip NOTHING and LONGJMP. */
2392 while ((n = regnext(n))
2393 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2394 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2395 && off + noff < max)
2397 if (reg_off_by_arg[OP(scan)])
2400 NEXT_OFF(scan) = off;
2405 /* The principal pseudo-switch. Cannot be a switch, since we
2406 look into several different things. */
2407 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2408 || OP(scan) == IFTHEN) {
2409 next = regnext(scan);
2411 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2413 if (OP(next) == code || code == IFTHEN) {
2414 /* NOTE - There is similar code to this block below for handling
2415 TRIE nodes on a re-study. If you change stuff here check there
2417 I32 max1 = 0, min1 = I32_MAX, num = 0;
2418 struct regnode_charclass_class accum;
2419 regnode * const startbranch=scan;
2421 if (flags & SCF_DO_SUBSTR)
2422 scan_commit(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2423 if (flags & SCF_DO_STCLASS)
2424 cl_init_zero(pRExC_state, &accum);
2426 while (OP(scan) == code) {
2427 I32 deltanext, minnext, f = 0, fake;
2428 struct regnode_charclass_class this_class;
2431 data_fake.flags = 0;
2433 data_fake.whilem_c = data->whilem_c;
2434 data_fake.last_closep = data->last_closep;
2437 data_fake.last_closep = &fake;
2439 data_fake.pos_delta = delta;
2440 next = regnext(scan);
2441 scan = NEXTOPER(scan);
2443 scan = NEXTOPER(scan);
2444 if (flags & SCF_DO_STCLASS) {
2445 cl_init(pRExC_state, &this_class);
2446 data_fake.start_class = &this_class;
2447 f = SCF_DO_STCLASS_AND;
2449 if (flags & SCF_WHILEM_VISITED_POS)
2450 f |= SCF_WHILEM_VISITED_POS;
2452 /* we suppose the run is continuous, last=next...*/
2453 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2455 stopparen, recursed, NULL, f,depth+1);
2458 if (max1 < minnext + deltanext)
2459 max1 = minnext + deltanext;
2460 if (deltanext == I32_MAX)
2461 is_inf = is_inf_internal = 1;
2463 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2465 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2466 if ( stopmin > minnext)
2467 stopmin = min + min1;
2468 flags &= ~SCF_DO_SUBSTR;
2470 data->flags |= SCF_SEEN_ACCEPT;
2473 if (data_fake.flags & SF_HAS_EVAL)
2474 data->flags |= SF_HAS_EVAL;
2475 data->whilem_c = data_fake.whilem_c;
2477 if (flags & SCF_DO_STCLASS)
2478 cl_or(pRExC_state, &accum, &this_class);
2480 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2482 if (flags & SCF_DO_SUBSTR) {
2483 data->pos_min += min1;
2484 data->pos_delta += max1 - min1;
2485 if (max1 != min1 || is_inf)
2486 data->longest = &(data->longest_float);
2489 delta += max1 - min1;
2490 if (flags & SCF_DO_STCLASS_OR) {
2491 cl_or(pRExC_state, data->start_class, &accum);
2493 cl_and(data->start_class, and_withp);
2494 flags &= ~SCF_DO_STCLASS;
2497 else if (flags & SCF_DO_STCLASS_AND) {
2499 cl_and(data->start_class, &accum);
2500 flags &= ~SCF_DO_STCLASS;
2503 /* Switch to OR mode: cache the old value of
2504 * data->start_class */
2506 StructCopy(data->start_class, and_withp,
2507 struct regnode_charclass_class);
2508 flags &= ~SCF_DO_STCLASS_AND;
2509 StructCopy(&accum, data->start_class,
2510 struct regnode_charclass_class);
2511 flags |= SCF_DO_STCLASS_OR;
2512 data->start_class->flags |= ANYOF_EOS;
2516 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2519 Assuming this was/is a branch we are dealing with: 'scan' now
2520 points at the item that follows the branch sequence, whatever
2521 it is. We now start at the beginning of the sequence and look
2528 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2530 If we can find such a subseqence we need to turn the first
2531 element into a trie and then add the subsequent branch exact
2532 strings to the trie.
2536 1. patterns where the whole set of branch can be converted.
2538 2. patterns where only a subset can be converted.
2540 In case 1 we can replace the whole set with a single regop
2541 for the trie. In case 2 we need to keep the start and end
2544 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2545 becomes BRANCH TRIE; BRANCH X;
2547 There is an additional case, that being where there is a
2548 common prefix, which gets split out into an EXACT like node
2549 preceding the TRIE node.
2551 If x(1..n)==tail then we can do a simple trie, if not we make
2552 a "jump" trie, such that when we match the appropriate word
2553 we "jump" to the appopriate tail node. Essentailly we turn
2554 a nested if into a case structure of sorts.
2559 if (!re_trie_maxbuff) {
2560 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2561 if (!SvIOK(re_trie_maxbuff))
2562 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2564 if ( SvIV(re_trie_maxbuff)>=0 ) {
2566 regnode *first = (regnode *)NULL;
2567 regnode *last = (regnode *)NULL;
2568 regnode *tail = scan;
2573 SV * const mysv = sv_newmortal(); /* for dumping */
2575 /* var tail is used because there may be a TAIL
2576 regop in the way. Ie, the exacts will point to the
2577 thing following the TAIL, but the last branch will
2578 point at the TAIL. So we advance tail. If we
2579 have nested (?:) we may have to move through several
2583 while ( OP( tail ) == TAIL ) {
2584 /* this is the TAIL generated by (?:) */
2585 tail = regnext( tail );
2590 regprop(RExC_rx, mysv, tail );
2591 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2592 (int)depth * 2 + 2, "",
2593 "Looking for TRIE'able sequences. Tail node is: ",
2594 SvPV_nolen_const( mysv )
2600 step through the branches, cur represents each
2601 branch, noper is the first thing to be matched
2602 as part of that branch and noper_next is the
2603 regnext() of that node. if noper is an EXACT
2604 and noper_next is the same as scan (our current
2605 position in the regex) then the EXACT branch is
2606 a possible optimization target. Once we have
2607 two or more consequetive such branches we can
2608 create a trie of the EXACT's contents and stich
2609 it in place. If the sequence represents all of
2610 the branches we eliminate the whole thing and
2611 replace it with a single TRIE. If it is a
2612 subsequence then we need to stitch it in. This
2613 means the first branch has to remain, and needs
2614 to be repointed at the item on the branch chain
2615 following the last branch optimized. This could
2616 be either a BRANCH, in which case the
2617 subsequence is internal, or it could be the
2618 item following the branch sequence in which
2619 case the subsequence is at the end.
2623 /* dont use tail as the end marker for this traverse */
2624 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2625 regnode * const noper = NEXTOPER( cur );
2626 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
2627 regnode * const noper_next = regnext( noper );
2631 regprop(RExC_rx, mysv, cur);
2632 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2633 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2635 regprop(RExC_rx, mysv, noper);
2636 PerlIO_printf( Perl_debug_log, " -> %s",
2637 SvPV_nolen_const(mysv));
2640 regprop(RExC_rx, mysv, noper_next );
2641 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2642 SvPV_nolen_const(mysv));
2644 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2645 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2647 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2648 : PL_regkind[ OP( noper ) ] == EXACT )
2649 || OP(noper) == NOTHING )
2651 && noper_next == tail
2656 if ( !first || optype == NOTHING ) {
2657 if (!first) first = cur;
2658 optype = OP( noper );
2664 make_trie( pRExC_state,
2665 startbranch, first, cur, tail, count,
2668 if ( PL_regkind[ OP( noper ) ] == EXACT
2670 && noper_next == tail
2675 optype = OP( noper );
2685 regprop(RExC_rx, mysv, cur);
2686 PerlIO_printf( Perl_debug_log,
2687 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2688 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2692 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
2693 #ifdef TRIE_STUDY_OPT
2694 if ( ((made == MADE_EXACT_TRIE &&
2695 startbranch == first)
2696 || ( first_non_open == first )) &&
2698 flags |= SCF_TRIE_RESTUDY;
2699 if ( startbranch == first
2702 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2712 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2713 scan = NEXTOPER(NEXTOPER(scan));
2714 } else /* single branch is optimized. */
2715 scan = NEXTOPER(scan);
2717 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2718 scan_frame *newframe = NULL;
2723 if (OP(scan) != SUSPEND) {
2724 /* set the pointer */
2725 if (OP(scan) == GOSUB) {
2727 RExC_recurse[ARG2L(scan)] = scan;
2728 start = RExC_open_parens[paren-1];
2729 end = RExC_close_parens[paren-1];
2732 start = RExC_rxi->program + 1;
2736 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2737 SAVEFREEPV(recursed);
2739 if (!PAREN_TEST(recursed,paren+1)) {
2740 PAREN_SET(recursed,paren+1);
2741 Newx(newframe,1,scan_frame);
2743 if (flags & SCF_DO_SUBSTR) {
2744 scan_commit(pRExC_state,data,minlenp);
2745 data->longest = &(data->longest_float);
2747 is_inf = is_inf_internal = 1;
2748 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2749 cl_anything(pRExC_state, data->start_class);
2750 flags &= ~SCF_DO_STCLASS;
2753 Newx(newframe,1,scan_frame);
2756 end = regnext(scan);
2761 SAVEFREEPV(newframe);
2762 newframe->next = regnext(scan);
2763 newframe->last = last;
2764 newframe->stop = stopparen;
2765 newframe->prev = frame;
2775 else if (OP(scan) == EXACT) {
2776 I32 l = STR_LEN(scan);
2779 const U8 * const s = (U8*)STRING(scan);
2780 l = utf8_length(s, s + l);
2781 uc = utf8_to_uvchr(s, NULL);
2783 uc = *((U8*)STRING(scan));
2786 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2787 /* The code below prefers earlier match for fixed
2788 offset, later match for variable offset. */
2789 if (data->last_end == -1) { /* Update the start info. */
2790 data->last_start_min = data->pos_min;
2791 data->last_start_max = is_inf
2792 ? I32_MAX : data->pos_min + data->pos_delta;
2794 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2796 SvUTF8_on(data->last_found);
2798 SV * const sv = data->last_found;
2799 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2800 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2801 if (mg && mg->mg_len >= 0)
2802 mg->mg_len += utf8_length((U8*)STRING(scan),
2803 (U8*)STRING(scan)+STR_LEN(scan));
2805 data->last_end = data->pos_min + l;
2806 data->pos_min += l; /* As in the first entry. */
2807 data->flags &= ~SF_BEFORE_EOL;
2809 if (flags & SCF_DO_STCLASS_AND) {
2810 /* Check whether it is compatible with what we know already! */
2814 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2815 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2816 && (!(data->start_class->flags & ANYOF_FOLD)
2817 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2820 ANYOF_CLASS_ZERO(data->start_class);
2821 ANYOF_BITMAP_ZERO(data->start_class);
2823 ANYOF_BITMAP_SET(data->start_class, uc);
2824 data->start_class->flags &= ~ANYOF_EOS;
2826 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2828 else if (flags & SCF_DO_STCLASS_OR) {
2829 /* false positive possible if the class is case-folded */
2831 ANYOF_BITMAP_SET(data->start_class, uc);
2833 data->start_class->flags |= ANYOF_UNICODE_ALL;
2834 data->start_class->flags &= ~ANYOF_EOS;
2835 cl_and(data->start_class, and_withp);
2837 flags &= ~SCF_DO_STCLASS;
2839 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
2840 I32 l = STR_LEN(scan);
2841 UV uc = *((U8*)STRING(scan));
2843 /* Search for fixed substrings supports EXACT only. */
2844 if (flags & SCF_DO_SUBSTR) {
2846 scan_commit(pRExC_state, data, minlenp);
2849 const U8 * const s = (U8 *)STRING(scan);
2850 l = utf8_length(s, s + l);
2851 uc = utf8_to_uvchr(s, NULL);
2854 if (flags & SCF_DO_SUBSTR)
2856 if (flags & SCF_DO_STCLASS_AND) {
2857 /* Check whether it is compatible with what we know already! */
2861 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2862 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2863 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2865 ANYOF_CLASS_ZERO(data->start_class);
2866 ANYOF_BITMAP_ZERO(data->start_class);
2868 ANYOF_BITMAP_SET(data->start_class, uc);
2869 data->start_class->flags &= ~ANYOF_EOS;
2870 data->start_class->flags |= ANYOF_FOLD;
2871 if (OP(scan) == EXACTFL)
2872 data->start_class->flags |= ANYOF_LOCALE;
2875 else if (flags & SCF_DO_STCLASS_OR) {
2876 if (data->start_class->flags & ANYOF_FOLD) {
2877 /* false positive possible if the class is case-folded.
2878 Assume that the locale settings are the same... */
2880 ANYOF_BITMAP_SET(data->start_class, uc);
2881 data->start_class->flags &= ~ANYOF_EOS;
2883 cl_and(data->start_class, and_withp);
2885 flags &= ~SCF_DO_STCLASS;
2887 else if (strchr((const char*)PL_varies,OP(scan))) {
2888 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2889 I32 f = flags, pos_before = 0;
2890 regnode * const oscan = scan;
2891 struct regnode_charclass_class this_class;
2892 struct regnode_charclass_class *oclass = NULL;
2893 I32 next_is_eval = 0;
2895 switch (PL_regkind[OP(scan)]) {
2896 case WHILEM: /* End of (?:...)* . */
2897 scan = NEXTOPER(scan);
2900 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2901 next = NEXTOPER(scan);
2902 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2904 maxcount = REG_INFTY;
2905 next = regnext(scan);
2906 scan = NEXTOPER(scan);
2910 if (flags & SCF_DO_SUBSTR)
2915 if (flags & SCF_DO_STCLASS) {
2917 maxcount = REG_INFTY;
2918 next = regnext(scan);
2919 scan = NEXTOPER(scan);
2922 is_inf = is_inf_internal = 1;
2923 scan = regnext(scan);
2924 if (flags & SCF_DO_SUBSTR) {
2925 scan_commit(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
2926 data->longest = &(data->longest_float);
2928 goto optimize_curly_tail;
2930 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
2931 && (scan->flags == stopparen))
2936 mincount = ARG1(scan);
2937 maxcount = ARG2(scan);
2939 next = regnext(scan);
2940 if (OP(scan) == CURLYX) {
2941 I32 lp = (data ? *(data->last_closep) : 0);
2942 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
2944 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2945 next_is_eval = (OP(scan) == EVAL);
2947 if (flags & SCF_DO_SUBSTR) {
2948 if (mincount == 0) scan_commit(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
2949 pos_before = data->pos_min;
2953 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2955 data->flags |= SF_IS_INF;
2957 if (flags & SCF_DO_STCLASS) {
2958 cl_init(pRExC_state, &this_class);
2959 oclass = data->start_class;
2960 data->start_class = &this_class;
2961 f |= SCF_DO_STCLASS_AND;
2962 f &= ~SCF_DO_STCLASS_OR;
2964 /* These are the cases when once a subexpression
2965 fails at a particular position, it cannot succeed
2966 even after backtracking at the enclosing scope.
2968 XXXX what if minimal match and we are at the
2969 initial run of {n,m}? */
2970 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2971 f &= ~SCF_WHILEM_VISITED_POS;
2973 /* This will finish on WHILEM, setting scan, or on NULL: */
2974 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2975 last, data, stopparen, recursed, NULL,
2977 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
2979 if (flags & SCF_DO_STCLASS)
2980 data->start_class = oclass;
2981 if (mincount == 0 || minnext == 0) {
2982 if (flags & SCF_DO_STCLASS_OR) {
2983 cl_or(pRExC_state, data->start_class, &this_class);
2985 else if (flags & SCF_DO_STCLASS_AND) {
2986 /* Switch to OR mode: cache the old value of
2987 * data->start_class */
2989 StructCopy(data->start_class, and_withp,
2990 struct regnode_charclass_class);
2991 flags &= ~SCF_DO_STCLASS_AND;
2992 StructCopy(&this_class, data->start_class,
2993 struct regnode_charclass_class);
2994 flags |= SCF_DO_STCLASS_OR;
2995 data->start_class->flags |= ANYOF_EOS;
2997 } else { /* Non-zero len */
2998 if (flags & SCF_DO_STCLASS_OR) {
2999 cl_or(pRExC_state, data->start_class, &this_class);
3000 cl_and(data->start_class, and_withp);
3002 else if (flags & SCF_DO_STCLASS_AND)
3003 cl_and(data->start_class, &this_class);
3004 flags &= ~SCF_DO_STCLASS;
3006 if (!scan) /* It was not CURLYX, but CURLY. */
3008 if ( /* ? quantifier ok, except for (?{ ... }) */
3009 (next_is_eval || !(mincount == 0 && maxcount == 1))
3010 && (minnext == 0) && (deltanext == 0)
3011 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3012 && maxcount <= REG_INFTY/3 /* Complement check for big count */
3013 && ckWARN(WARN_REGEXP))
3016 "Quantifier unexpected on zero-length expression");
3019 min += minnext * mincount;
3020 is_inf_internal |= ((maxcount == REG_INFTY
3021 && (minnext + deltanext) > 0)
3022 || deltanext == I32_MAX);
3023 is_inf |= is_inf_internal;
3024 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3026 /* Try powerful optimization CURLYX => CURLYN. */
3027 if ( OP(oscan) == CURLYX && data
3028 && data->flags & SF_IN_PAR
3029 && !(data->flags & SF_HAS_EVAL)
3030 && !deltanext && minnext == 1 ) {
3031 /* Try to optimize to CURLYN. */
3032 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3033 regnode * const nxt1 = nxt;
3040 if (!strchr((const char*)PL_simple,OP(nxt))
3041 && !(PL_regkind[OP(nxt)] == EXACT
3042 && STR_LEN(nxt) == 1))
3048 if (OP(nxt) != CLOSE)
3050 if (RExC_open_parens) {
3051 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3052 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3054 /* Now we know that nxt2 is the only contents: */
3055 oscan->flags = (U8)ARG(nxt);
3057 OP(nxt1) = NOTHING; /* was OPEN. */
3060 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3061 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
3062 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
3063 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3064 OP(nxt + 1) = OPTIMIZED; /* was count. */
3065 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
3070 /* Try optimization CURLYX => CURLYM. */
3071 if ( OP(oscan) == CURLYX && data
3072 && !(data->flags & SF_HAS_PAR)
3073 && !(data->flags & SF_HAS_EVAL)
3074 && !deltanext /* atom is fixed width */
3075 && minnext != 0 /* CURLYM can't handle zero width */
3077 /* XXXX How to optimize if data == 0? */
3078 /* Optimize to a simpler form. */
3079 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3083 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3084 && (OP(nxt2) != WHILEM))
3086 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3087 /* Need to optimize away parenths. */
3088 if (data->flags & SF_IN_PAR) {
3089 /* Set the parenth number. */
3090 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3092 if (OP(nxt) != CLOSE)
3093 FAIL("Panic opt close");
3094 oscan->flags = (U8)ARG(nxt);
3095 if (RExC_open_parens) {
3096 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3097 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3099 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3100 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3103 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3104 OP(nxt + 1) = OPTIMIZED; /* was count. */
3105 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3106 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
3109 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3110 regnode *nnxt = regnext(nxt1);
3113 if (reg_off_by_arg[OP(nxt1)])
3114 ARG_SET(nxt1, nxt2 - nxt1);
3115 else if (nxt2 - nxt1 < U16_MAX)
3116 NEXT_OFF(nxt1) = nxt2 - nxt1;
3118 OP(nxt) = NOTHING; /* Cannot beautify */
3123 /* Optimize again: */
3124 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3125 NULL, stopparen, recursed, NULL, 0,depth+1);
3130 else if ((OP(oscan) == CURLYX)
3131 && (flags & SCF_WHILEM_VISITED_POS)
3132 /* See the comment on a similar expression above.
3133 However, this time it not a subexpression
3134 we care about, but the expression itself. */
3135 && (maxcount == REG_INFTY)
3136 && data && ++data->whilem_c < 16) {
3137 /* This stays as CURLYX, we can put the count/of pair. */
3138 /* Find WHILEM (as in regexec.c) */
3139 regnode *nxt = oscan + NEXT_OFF(oscan);
3141 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3143 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3144 | (RExC_whilem_seen << 4)); /* On WHILEM */
3146 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3148 if (flags & SCF_DO_SUBSTR) {
3149 SV *last_str = NULL;
3150 int counted = mincount != 0;
3152 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3153 #if defined(SPARC64_GCC_WORKAROUND)
3156 const char *s = NULL;
3159 if (pos_before >= data->last_start_min)
3162 b = data->last_start_min;
3165 s = SvPV_const(data->last_found, l);
3166 old = b - data->last_start_min;
3169 I32 b = pos_before >= data->last_start_min
3170 ? pos_before : data->last_start_min;
3172 const char * const s = SvPV_const(data->last_found, l);
3173 I32 old = b - data->last_start_min;
3177 old = utf8_hop((U8*)s, old) - (U8*)s;
3180 /* Get the added string: */
3181 last_str = newSVpvn(s + old, l);
3183 SvUTF8_on(last_str);
3184 if (deltanext == 0 && pos_before == b) {
3185 /* What was added is a constant string */
3187 SvGROW(last_str, (mincount * l) + 1);
3188 repeatcpy(SvPVX(last_str) + l,
3189 SvPVX_const(last_str), l, mincount - 1);
3190 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3191 /* Add additional parts. */
3192 SvCUR_set(data->last_found,
3193 SvCUR(data->last_found) - l);
3194 sv_catsv(data->last_found, last_str);
3196 SV * sv = data->last_found;
3198 SvUTF8(sv) && SvMAGICAL(sv) ?
3199 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3200 if (mg && mg->mg_len >= 0)
3201 mg->mg_len += CHR_SVLEN(last_str);
3203 data->last_end += l * (mincount - 1);
3206 /* start offset must point into the last copy */
3207 data->last_start_min += minnext * (mincount - 1);
3208 data->last_start_max += is_inf ? I32_MAX
3209 : (maxcount - 1) * (minnext + data->pos_delta);
3212 /* It is counted once already... */
3213 data->pos_min += minnext * (mincount - counted);
3214 data->pos_delta += - counted * deltanext +
3215 (minnext + deltanext) * maxcount - minnext * mincount;
3216 if (mincount != maxcount) {
3217 /* Cannot extend fixed substrings found inside
3219 scan_commit(pRExC_state,data,minlenp);
3220 if (mincount && last_str) {
3221 SV * const sv = data->last_found;
3222 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3223 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3227 sv_setsv(sv, last_str);
3228 data->last_end = data->pos_min;
3229 data->last_start_min =
3230 data->pos_min - CHR_SVLEN(last_str);
3231 data->last_start_max = is_inf
3233 : data->pos_min + data->pos_delta
3234 - CHR_SVLEN(last_str);
3236 data->longest = &(data->longest_float);
3238 SvREFCNT_dec(last_str);
3240 if (data && (fl & SF_HAS_EVAL))
3241 data->flags |= SF_HAS_EVAL;
3242 optimize_curly_tail:
3243 if (OP(oscan) != CURLYX) {
3244 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3246 NEXT_OFF(oscan) += NEXT_OFF(next);
3249 default: /* REF and CLUMP only? */
3250 if (flags & SCF_DO_SUBSTR) {
3251 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
3252 data->longest = &(data->longest_float);
3254 is_inf = is_inf_internal = 1;
3255 if (flags & SCF_DO_STCLASS_OR)
3256 cl_anything(pRExC_state, data->start_class);
3257 flags &= ~SCF_DO_STCLASS;
3261 else if (strchr((const char*)PL_simple,OP(scan))) {
3264 if (flags & SCF_DO_SUBSTR) {
3265 scan_commit(pRExC_state,data,minlenp);
3269 if (flags & SCF_DO_STCLASS) {
3270 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3272 /* Some of the logic below assumes that switching
3273 locale on will only add false positives. */
3274 switch (PL_regkind[OP(scan)]) {
3278 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3279 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3280 cl_anything(pRExC_state, data->start_class);
3283 if (OP(scan) == SANY)
3285 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3286 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3287 || (data->start_class->flags & ANYOF_CLASS));
3288 cl_anything(pRExC_state, data->start_class);
3290 if (flags & SCF_DO_STCLASS_AND || !value)
3291 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3294 if (flags & SCF_DO_STCLASS_AND)
3295 cl_and(data->start_class,
3296 (struct regnode_charclass_class*)scan);
3298 cl_or(pRExC_state, data->start_class,
3299 (struct regnode_charclass_class*)scan);
3302 if (flags & SCF_DO_STCLASS_AND) {
3303 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3304 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3305 for (value = 0; value < 256; value++)
3306 if (!isALNUM(value))
3307 ANYOF_BITMAP_CLEAR(data->start_class, value);
3311 if (data->start_class->flags & ANYOF_LOCALE)
3312 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3314 for (value = 0; value < 256; value++)
3316 ANYOF_BITMAP_SET(data->start_class, value);
3321 if (flags & SCF_DO_STCLASS_AND) {
3322 if (data->start_class->flags & ANYOF_LOCALE)
3323 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3326 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3327 data->start_class->flags |= ANYOF_LOCALE;
3331 if (flags & SCF_DO_STCLASS_AND) {
3332 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3333 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3334 for (value = 0; value < 256; value++)
3336 ANYOF_BITMAP_CLEAR(data->start_class, value);
3340 if (data->start_class->flags & ANYOF_LOCALE)
3341 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3343 for (value = 0; value < 256; value++)
3344 if (!isALNUM(value))
3345 ANYOF_BITMAP_SET(data->start_class, value);
3350 if (flags & SCF_DO_STCLASS_AND) {
3351 if (data->start_class->flags & ANYOF_LOCALE)
3352 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3355 data->start_class->flags |= ANYOF_LOCALE;
3356 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3360 if (flags & SCF_DO_STCLASS_AND) {
3361 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3362 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3363 for (value = 0; value < 256; value++)
3364 if (!isSPACE(value))
3365 ANYOF_BITMAP_CLEAR(data->start_class, value);
3369 if (data->start_class->flags & ANYOF_LOCALE)
3370 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3372 for (value = 0; value < 256; value++)
3374 ANYOF_BITMAP_SET(data->start_class, value);
3379 if (flags & SCF_DO_STCLASS_AND) {
3380 if (data->start_class->flags & ANYOF_LOCALE)
3381 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3384 data->start_class->flags |= ANYOF_LOCALE;
3385 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3389 if (flags & SCF_DO_STCLASS_AND) {
3390 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3391 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3392 for (value = 0; value < 256; value++)
3394 ANYOF_BITMAP_CLEAR(data->start_class, value);
3398 if (data->start_class->flags & ANYOF_LOCALE)
3399 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3401 for (value = 0; value < 256; value++)
3402 if (!isSPACE(value))
3403 ANYOF_BITMAP_SET(data->start_class, value);
3408 if (flags & SCF_DO_STCLASS_AND) {
3409 if (data->start_class->flags & ANYOF_LOCALE) {
3410 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3411 for (value = 0; value < 256; value++)
3412 if (!isSPACE(value))
3413 ANYOF_BITMAP_CLEAR(data->start_class, value);
3417 data->start_class->flags |= ANYOF_LOCALE;
3418 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3422 if (flags & SCF_DO_STCLASS_AND) {
3423 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3424 for (value = 0; value < 256; value++)
3425 if (!isDIGIT(value))
3426 ANYOF_BITMAP_CLEAR(data->start_class, value);
3429 if (data->start_class->flags & ANYOF_LOCALE)
3430 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3432 for (value = 0; value < 256; value++)
3434 ANYOF_BITMAP_SET(data->start_class, value);
3439 if (flags & SCF_DO_STCLASS_AND) {
3440 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3441 for (value = 0; value < 256; value++)
3443 ANYOF_BITMAP_CLEAR(data->start_class, value);
3446 if (data->start_class->flags & ANYOF_LOCALE)
3447 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3449 for (value = 0; value < 256; value++)
3450 if (!isDIGIT(value))
3451 ANYOF_BITMAP_SET(data->start_class, value);
3456 if (flags & SCF_DO_STCLASS_OR)
3457 cl_and(data->start_class, and_withp);
3458 flags &= ~SCF_DO_STCLASS;
3461 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3462 data->flags |= (OP(scan) == MEOL
3466 else if ( PL_regkind[OP(scan)] == BRANCHJ
3467 /* Lookbehind, or need to calculate parens/evals/stclass: */
3468 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3469 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3470 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3471 || OP(scan) == UNLESSM )
3473 /* Negative Lookahead/lookbehind
3474 In this case we can't do fixed string optimisation.
3477 I32 deltanext, minnext, fake = 0;
3479 struct regnode_charclass_class intrnl;
3482 data_fake.flags = 0;
3484 data_fake.whilem_c = data->whilem_c;
3485 data_fake.last_closep = data->last_closep;
3488 data_fake.last_closep = &fake;
3489 data_fake.pos_delta = delta;
3490 if ( flags & SCF_DO_STCLASS && !scan->flags
3491 && OP(scan) == IFMATCH ) { /* Lookahead */
3492 cl_init(pRExC_state, &intrnl);
3493 data_fake.start_class = &intrnl;
3494 f |= SCF_DO_STCLASS_AND;
3496 if (flags & SCF_WHILEM_VISITED_POS)
3497 f |= SCF_WHILEM_VISITED_POS;
3498 next = regnext(scan);
3499 nscan = NEXTOPER(NEXTOPER(scan));
3500 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3501 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3504 FAIL("Variable length lookbehind not implemented");
3506 else if (minnext > (I32)U8_MAX) {
3507 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3509 scan->flags = (U8)minnext;
3512 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3514 if (data_fake.flags & SF_HAS_EVAL)
3515 data->flags |= SF_HAS_EVAL;
3516 data->whilem_c = data_fake.whilem_c;
3518 if (f & SCF_DO_STCLASS_AND) {
3519 const int was = (data->start_class->flags & ANYOF_EOS);
3521 cl_and(data->start_class, &intrnl);
3523 data->start_class->flags |= ANYOF_EOS;
3526 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3528 /* Positive Lookahead/lookbehind
3529 In this case we can do fixed string optimisation,
3530 but we must be careful about it. Note in the case of
3531 lookbehind the positions will be offset by the minimum
3532 length of the pattern, something we won't know about
3533 until after the recurse.
3535 I32 deltanext, fake = 0;
3537 struct regnode_charclass_class intrnl;
3539 /* We use SAVEFREEPV so that when the full compile
3540 is finished perl will clean up the allocated
3541 minlens when its all done. This was we don't
3542 have to worry about freeing them when we know
3543 they wont be used, which would be a pain.
3546 Newx( minnextp, 1, I32 );
3547 SAVEFREEPV(minnextp);
3550 StructCopy(data, &data_fake, scan_data_t);
3551 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3554 scan_commit(pRExC_state, &data_fake,minlenp);
3555 data_fake.last_found=newSVsv(data->last_found);
3559 data_fake.last_closep = &fake;
3560 data_fake.flags = 0;
3561 data_fake.pos_delta = delta;
3563 data_fake.flags |= SF_IS_INF;
3564 if ( flags & SCF_DO_STCLASS && !scan->flags
3565 && OP(scan) == IFMATCH ) { /* Lookahead */
3566 cl_init(pRExC_state, &intrnl);
3567 data_fake.start_class = &intrnl;
3568 f |= SCF_DO_STCLASS_AND;
3570 if (flags & SCF_WHILEM_VISITED_POS)
3571 f |= SCF_WHILEM_VISITED_POS;
3572 next = regnext(scan);
3573 nscan = NEXTOPER(NEXTOPER(scan));
3575 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3576 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3579 FAIL("Variable length lookbehind not implemented");
3581 else if (*minnextp > (I32)U8_MAX) {
3582 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3584 scan->flags = (U8)*minnextp;
3589 if (f & SCF_DO_STCLASS_AND) {
3590 const int was = (data->start_class->flags & ANYOF_EOS);
3592 cl_and(data->start_class, &intrnl);
3594 data->start_class->flags |= ANYOF_EOS;
3597 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3599 if (data_fake.flags & SF_HAS_EVAL)
3600 data->flags |= SF_HAS_EVAL;
3601 data->whilem_c = data_fake.whilem_c;
3602 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3603 if (RExC_rx->minlen<*minnextp)
3604 RExC_rx->minlen=*minnextp;
3605 scan_commit(pRExC_state, &data_fake, minnextp);
3606 SvREFCNT_dec(data_fake.last_found);
3608 if ( data_fake.minlen_fixed != minlenp )
3610 data->offset_fixed= data_fake.offset_fixed;
3611 data->minlen_fixed= data_fake.minlen_fixed;
3612 data->lookbehind_fixed+= scan->flags;
3614 if ( data_fake.minlen_float != minlenp )
3616 data->minlen_float= data_fake.minlen_float;
3617 data->offset_float_min=data_fake.offset_float_min;
3618 data->offset_float_max=data_fake.offset_float_max;
3619 data->lookbehind_float+= scan->flags;
3628 else if (OP(scan) == OPEN) {
3629 if (stopparen != (I32)ARG(scan))
3632 else if (OP(scan) == CLOSE) {
3633 if (stopparen == (I32)ARG(scan)) {
3636 if ((I32)ARG(scan) == is_par) {
3637 next = regnext(scan);
3639 if ( next && (OP(next) != WHILEM) && next < last)
3640 is_par = 0; /* Disable optimization */
3643 *(data->last_closep) = ARG(scan);
3645 else if (OP(scan) == EVAL) {
3647 data->flags |= SF_HAS_EVAL;
3649 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3650 if (flags & SCF_DO_SUBSTR) {
3651 scan_commit(pRExC_state,data,minlenp);
3652 flags &= ~SCF_DO_SUBSTR;
3654 if (data && OP(scan)==ACCEPT) {
3655 data->flags |= SCF_SEEN_ACCEPT;
3660 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3662 if (flags & SCF_DO_SUBSTR) {
3663 scan_commit(pRExC_state,data,minlenp);
3664 data->longest = &(data->longest_float);
3666 is_inf = is_inf_internal = 1;
3667 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3668 cl_anything(pRExC_state, data->start_class);
3669 flags &= ~SCF_DO_STCLASS;
3671 else if (OP(scan) == GPOS) {
3672 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
3673 !(delta || is_inf || (data && data->pos_delta)))
3675 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
3676 RExC_rx->extflags |= RXf_ANCH_GPOS;
3677 if (RExC_rx->gofs < (U32)min)
3678 RExC_rx->gofs = min;
3680 RExC_rx->extflags |= RXf_GPOS_FLOAT;
3684 #ifdef TRIE_STUDY_OPT
3685 #ifdef FULL_TRIE_STUDY
3686 else if (PL_regkind[OP(scan)] == TRIE) {
3687 /* NOTE - There is similar code to this block above for handling
3688 BRANCH nodes on the initial study. If you change stuff here
3690 regnode *trie_node= scan;
3691 regnode *tail= regnext(scan);
3692 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
3693 I32 max1 = 0, min1 = I32_MAX;
3694 struct regnode_charclass_class accum;
3696 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
3697 scan_commit(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
3698 if (flags & SCF_DO_STCLASS)
3699 cl_init_zero(pRExC_state, &accum);
3705 const regnode *nextbranch= NULL;
3708 for ( word=1 ; word <= trie->wordcount ; word++)
3710 I32 deltanext=0, minnext=0, f = 0, fake;
3711 struct regnode_charclass_class this_class;
3713 data_fake.flags = 0;
3715 data_fake.whilem_c = data->whilem_c;
3716 data_fake.last_closep = data->last_closep;
3719 data_fake.last_closep = &fake;
3720 data_fake.pos_delta = delta;
3721 if (flags & SCF_DO_STCLASS) {
3722 cl_init(pRExC_state, &this_class);
3723 data_fake.start_class = &this_class;
3724 f = SCF_DO_STCLASS_AND;
3726 if (flags & SCF_WHILEM_VISITED_POS)
3727 f |= SCF_WHILEM_VISITED_POS;
3729 if (trie->jump[word]) {
3731 nextbranch = trie_node + trie->jump[0];
3732 scan= trie_node + trie->jump[word];
3733 /* We go from the jump point to the branch that follows
3734 it. Note this means we need the vestigal unused branches
3735 even though they arent otherwise used.
3737 minnext = study_chunk(pRExC_state, &scan, minlenp,
3738 &deltanext, (regnode *)nextbranch, &data_fake,
3739 stopparen, recursed, NULL, f,depth+1);
3741 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3742 nextbranch= regnext((regnode*)nextbranch);
3744 if (min1 > (I32)(minnext + trie->minlen))
3745 min1 = minnext + trie->minlen;
3746 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3747 max1 = minnext + deltanext + trie->maxlen;
3748 if (deltanext == I32_MAX)
3749 is_inf = is_inf_internal = 1;
3751 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3753 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3754 if ( stopmin > min + min1)
3755 stopmin = min + min1;
3756 flags &= ~SCF_DO_SUBSTR;
3758 data->flags |= SCF_SEEN_ACCEPT;
3761 if (data_fake.flags & SF_HAS_EVAL)
3762 data->flags |= SF_HAS_EVAL;
3763 data->whilem_c = data_fake.whilem_c;
3765 if (flags & SCF_DO_STCLASS)
3766 cl_or(pRExC_state, &accum, &this_class);
3769 if (flags & SCF_DO_SUBSTR) {
3770 data->pos_min += min1;
3771 data->pos_delta += max1 - min1;
3772 if (max1 != min1 || is_inf)
3773 data->longest = &(data->longest_float);
3776 delta += max1 - min1;
3777 if (flags & SCF_DO_STCLASS_OR) {
3778 cl_or(pRExC_state, data->start_class, &accum);
3780 cl_and(data->start_class, and_withp);
3781 flags &= ~SCF_DO_STCLASS;
3784 else if (flags & SCF_DO_STCLASS_AND) {
3786 cl_and(data->start_class, &accum);
3787 flags &= ~SCF_DO_STCLASS;
3790 /* Switch to OR mode: cache the old value of
3791 * data->start_class */
3793 StructCopy(data->start_class, and_withp,
3794 struct regnode_charclass_class);
3795 flags &= ~SCF_DO_STCLASS_AND;
3796 StructCopy(&accum, data->start_class,
3797 struct regnode_charclass_class);
3798 flags |= SCF_DO_STCLASS_OR;
3799 data->start_class->flags |= ANYOF_EOS;
3806 else if (PL_regkind[OP(scan)] == TRIE) {
3807 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
3810 min += trie->minlen;
3811 delta += (trie->maxlen - trie->minlen);
3812 flags &= ~SCF_DO_STCLASS; /* xxx */
3813 if (flags & SCF_DO_SUBSTR) {
3814 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
3815 data->pos_min += trie->minlen;
3816 data->pos_delta += (trie->maxlen - trie->minlen);
3817 if (trie->maxlen != trie->minlen)
3818 data->longest = &(data->longest_float);
3820 if (trie->jump) /* no more substrings -- for now /grr*/
3821 flags &= ~SCF_DO_SUBSTR;
3823 #endif /* old or new */
3824 #endif /* TRIE_STUDY_OPT */
3825 /* Else: zero-length, ignore. */
3826 scan = regnext(scan);
3831 stopparen = frame->stop;
3832 frame = frame->prev;
3833 goto fake_study_recurse;
3840 *deltap = is_inf_internal ? I32_MAX : delta;
3841 if (flags & SCF_DO_SUBSTR && is_inf)
3842 data->pos_delta = I32_MAX - data->pos_min;
3843 if (is_par > (I32)U8_MAX)
3845 if (is_par && pars==1 && data) {
3846 data->flags |= SF_IN_PAR;
3847 data->flags &= ~SF_HAS_PAR;
3849 else if (pars && data) {
3850 data->flags |= SF_HAS_PAR;
3851 data->flags &= ~SF_IN_PAR;
3853 if (flags & SCF_DO_STCLASS_OR)
3854 cl_and(data->start_class, and_withp);
3855 if (flags & SCF_TRIE_RESTUDY)
3856 data->flags |= SCF_TRIE_RESTUDY;
3858 DEBUG_STUDYDATA(data,depth);
3860 return min < stopmin ? min : stopmin;
3864 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
3866 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
3868 Renewc(RExC_rxi->data,
3869 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
3870 char, struct reg_data);
3872 Renew(RExC_rxi->data->what, count + n, U8);
3874 Newx(RExC_rxi->data->what, n, U8);
3875 RExC_rxi->data->count = count + n;
3876 Copy(s, RExC_rxi->data->what + count, n, U8);
3880 #ifndef PERL_IN_XSUB_RE
3882 Perl_reginitcolors(pTHX)
3885 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
3887 char *t = savepv(s);
3891 t = strchr(t, '\t');
3897 PL_colors[i] = t = (char *)"";
3902 PL_colors[i++] = (char *)"";
3909 #ifdef TRIE_STUDY_OPT
3910 #define CHECK_RESTUDY_GOTO \
3912 (data.flags & SCF_TRIE_RESTUDY) \
3916 #define CHECK_RESTUDY_GOTO
3920 - pregcomp - compile a regular expression into internal code
3922 * We can't allocate space until we know how big the compiled form will be,
3923 * but we can't compile it (and thus know how big it is) until we've got a
3924 * place to put the code. So we cheat: we compile it twice, once with code
3925 * generation turned off and size counting turned on, and once "for real".
3926 * This also means that we don't allocate space until we are sure that the
3927 * thing really will compile successfully, and we never have to move the
3928 * code and thus invalidate pointers into it. (Note that it has to be in
3929 * one piece because free() must be able to free it all.) [NB: not true in perl]
3931 * Beware that the optimization-preparation code in here knows about some
3932 * of the structure of the compiled regexp. [I'll say.]
3937 #ifndef PERL_IN_XSUB_RE
3938 #define RE_ENGINE_PTR &PL_core_reg_engine
3940 extern const struct regexp_engine my_reg_engine;
3941 #define RE_ENGINE_PTR &my_reg_engine
3943 /* these make a few things look better, to avoid indentation */
3944 #define BEGIN_BLOCK {
3948 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
3951 GET_RE_DEBUG_FLAGS_DECL;
3952 DEBUG_r(if (!PL_colorset) reginitcolors());
3953 #ifndef PERL_IN_XSUB_RE
3955 /* Dispatch a request to compile a regexp to correct
3957 HV * const table = GvHV(PL_hintgv);
3959 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
3960 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
3961 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
3963 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
3966 return CALLREGCOMP_ENG(eng, exp, xend, pm);
3973 register regexp_internal *ri;
3981 RExC_state_t RExC_state;
3982 RExC_state_t * const pRExC_state = &RExC_state;
3983 #ifdef TRIE_STUDY_OPT
3985 RExC_state_t copyRExC_state;
3988 FAIL("NULL regexp argument");
3990 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
3994 SV *dsv= sv_newmortal();
3995 RE_PV_QUOTED_DECL(s, RExC_utf8,
3996 dsv, RExC_precomp, (xend - exp), 60);
3997 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
3998 PL_colors[4],PL_colors[5],s);
4000 RExC_flags = pm->op_pmflags;
4004 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4005 RExC_seen_evals = 0;
4008 /* First pass: determine size, legality. */
4017 RExC_emit = &PL_regdummy;
4018 RExC_whilem_seen = 0;
4019 RExC_charnames = NULL;
4020 RExC_open_parens = NULL;
4021 RExC_close_parens = NULL;
4023 RExC_paren_names = NULL;
4024 RExC_recurse = NULL;
4025 RExC_recurse_count = 0;
4027 #if 0 /* REGC() is (currently) a NOP at the first pass.
4028 * Clever compilers notice this and complain. --jhi */
4029 REGC((U8)REG_MAGIC, (char*)RExC_emit);
4031 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4032 if (reg(pRExC_state, 0, &flags,1) == NULL) {
4033 RExC_precomp = NULL;
4037 PerlIO_printf(Perl_debug_log,
4038 "Required size %"IVdf" nodes\n"
4039 "Starting second pass (creation)\n",
4042 RExC_lastparse=NULL;
4044 /* Small enough for pointer-storage convention?
4045 If extralen==0, this means that we will not need long jumps. */
4046 if (RExC_size >= 0x10000L && RExC_extralen)
4047 RExC_size += RExC_extralen;
4050 if (RExC_whilem_seen > 15)
4051 RExC_whilem_seen = 15;
4054 /* Make room for a sentinel value at the end of the program */
4058 /* Allocate space and zero-initialize. Note, the two step process
4059 of zeroing when in debug mode, thus anything assigned has to
4060 happen after that */
4061 Newxz(r, 1, regexp);
4062 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4063 char, regexp_internal);
4064 if ( r == NULL || ri == NULL )
4065 FAIL("Regexp out of space");
4067 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
4068 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
4070 /* bulk initialize base fields with 0. */
4071 Zero(ri, sizeof(regexp_internal), char);
4074 /* non-zero initialization begins here */
4076 r->engine= RE_ENGINE_PTR;
4078 r->prelen = xend - exp;
4079 r->precomp = savepvn(RExC_precomp, r->prelen);
4080 r->extflags = pm->op_pmflags & RXf_PMf_COMPILETIME;
4082 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4084 if (RExC_seen & REG_SEEN_RECURSE) {
4085 Newxz(RExC_open_parens, RExC_npar,regnode *);
4086 SAVEFREEPV(RExC_open_parens);
4087 Newxz(RExC_close_parens,RExC_npar,regnode *);
4088 SAVEFREEPV(RExC_close_parens);
4091 /* Useful during FAIL. */
4092 Newxz(ri->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4094 ri->offsets[0] = RExC_size;
4096 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
4097 "%s %"UVuf" bytes for offset annotations.\n",
4098 ri->offsets ? "Got" : "Couldn't get",
4099 (UV)((2*RExC_size+1) * sizeof(U32))));
4104 /* Second pass: emit code. */
4105 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
4111 RExC_emit_start = ri->program;
4112 RExC_emit = ri->program;
4114 /* put a sentinal on the end of the program so we can check for
4116 ri->program[RExC_size].type = 255;
4118 /* Store the count of eval-groups for security checks: */
4119 RExC_emit->next_off = (RExC_seen_evals > (I32)U16_MAX) ? U16_MAX : (U16)RExC_seen_evals;
4120 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
4121 if (reg(pRExC_state, 0, &flags,1) == NULL)
4124 /* XXXX To minimize changes to RE engine we always allocate
4125 3-units-long substrs field. */
4126 Newx(r->substrs, 1, struct reg_substr_data);
4127 if (RExC_recurse_count) {
4128 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4129 SAVEFREEPV(RExC_recurse);
4133 r->minlen = minlen = sawplus = sawopen = 0;
4134 Zero(r->substrs, 1, struct reg_substr_data);
4136 #ifdef TRIE_STUDY_OPT
4139 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
4141 RExC_state = copyRExC_state;
4142 if (seen & REG_TOP_LEVEL_BRANCHES)
4143 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4145 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
4146 if (data.last_found) {
4147 SvREFCNT_dec(data.longest_fixed);
4148 SvREFCNT_dec(data.longest_float);
4149 SvREFCNT_dec(data.last_found);
4151 StructCopy(&zero_scan_data, &data, scan_data_t);
4153 StructCopy(&zero_scan_data, &data, scan_data_t);
4154 copyRExC_state = RExC_state;
4157 StructCopy(&zero_scan_data, &data, scan_data_t);
4160 /* Dig out information for optimizations. */
4161 r->extflags = pm->op_pmflags & RXf_PMf_COMPILETIME; /* Again? */
4162 pm->op_pmflags = RExC_flags;
4164 r->extflags |= RXf_UTF8; /* Unicode in it? */
4165 ri->regstclass = NULL;
4166 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
4167 r->intflags |= PREGf_NAUGHTY;
4168 scan = ri->program + 1; /* First BRANCH. */
4170 /* testing for BRANCH here tells us whether there is "must appear"
4171 data in the pattern. If there is then we can use it for optimisations */
4172 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
4174 STRLEN longest_float_length, longest_fixed_length;
4175 struct regnode_charclass_class ch_class; /* pointed to by data */
4177 I32 last_close = 0; /* pointed to by data */
4180 /* Skip introductions and multiplicators >= 1. */
4181 while ((OP(first) == OPEN && (sawopen = 1)) ||
4182 /* An OR of *one* alternative - should not happen now. */
4183 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
4184 /* for now we can't handle lookbehind IFMATCH*/
4185 (OP(first) == IFMATCH && !first->flags) ||
4186 (OP(first) == PLUS) ||
4187 (OP(first) == MINMOD) ||
4188 /* An {n,m} with n>0 */
4189 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) )
4192 if (OP(first) == PLUS)
4195 first += regarglen[OP(first)];
4196 if (OP(first) == IFMATCH) {
4197 first = NEXTOPER(first);
4198 first += EXTRA_STEP_2ARGS;
4199 } else /* XXX possible optimisation for /(?=)/ */
4200 first = NEXTOPER(first);
4203 /* Starting-point info. */
4205 DEBUG_PEEP("first:",first,0);
4206 /* Ignore EXACT as we deal with it later. */
4207 if (PL_regkind[OP(first)] == EXACT) {
4208 if (OP(first) == EXACT)
4209 NOOP; /* Empty, get anchored substr later. */
4210 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
4211 ri->regstclass = first;
4214 else if (PL_regkind[OP(first)] == TRIE &&
4215 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
4218 /* this can happen only on restudy */
4219 if ( OP(first) == TRIE ) {
4220 struct regnode_1 *trieop;
4221 Newxz(trieop,1,struct regnode_1);
4222 StructCopy(first,trieop,struct regnode_1);
4223 trie_op=(regnode *)trieop;
4225 struct regnode_charclass *trieop;
4226 Newxz(trieop,1,struct regnode_charclass);
4227 StructCopy(first,trieop,struct regnode_charclass);
4228 trie_op=(regnode *)trieop;
4231 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4232 ri->regstclass = trie_op;
4235 else if (strchr((const char*)PL_simple,OP(first)))
4236 ri->regstclass = first;
4237 else if (PL_regkind[OP(first)] == BOUND ||
4238 PL_regkind[OP(first)] == NBOUND)
4239 ri->regstclass = first;
4240 else if (PL_regkind[OP(first)] == BOL) {
4241 r->extflags |= (OP(first) == MBOL
4243 : (OP(first) == SBOL
4246 first = NEXTOPER(first);
4249 else if (OP(first) == GPOS) {
4250 r->extflags |= RXf_ANCH_GPOS;
4251 first = NEXTOPER(first);
4254 else if ((!sawopen || !RExC_sawback) &&
4255 (OP(first) == STAR &&
4256 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
4257 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
4259 /* turn .* into ^.* with an implied $*=1 */
4261 (OP(NEXTOPER(first)) == REG_ANY)
4264 r->extflags |= type;
4265 r->intflags |= PREGf_IMPLICIT;
4266 first = NEXTOPER(first);
4269 if (sawplus && (!sawopen || !RExC_sawback)
4270 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
4271 /* x+ must match at the 1st pos of run of x's */
4272 r->intflags |= PREGf_SKIP;
4274 /* Scan is after the zeroth branch, first is atomic matcher. */
4275 #ifdef TRIE_STUDY_OPT
4278 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4279 (IV)(first - scan + 1))
4283 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4284 (IV)(first - scan + 1))
4290 * If there's something expensive in the r.e., find the
4291 * longest literal string that must appear and make it the
4292 * regmust. Resolve ties in favor of later strings, since
4293 * the regstart check works with the beginning of the r.e.
4294 * and avoiding duplication strengthens checking. Not a
4295 * strong reason, but sufficient in the absence of others.
4296 * [Now we resolve ties in favor of the earlier string if
4297 * it happens that c_offset_min has been invalidated, since the
4298 * earlier string may buy us something the later one won't.]
4301 data.longest_fixed = newSVpvs("");
4302 data.longest_float = newSVpvs("");
4303 data.last_found = newSVpvs("");
4304 data.longest = &(data.longest_fixed);
4306 if (!ri->regstclass) {
4307 cl_init(pRExC_state, &ch_class);
4308 data.start_class = &ch_class;
4309 stclass_flag = SCF_DO_STCLASS_AND;
4310 } else /* XXXX Check for BOUND? */
4312 data.last_closep = &last_close;
4314 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
4315 &data, -1, NULL, NULL,
4316 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
4322 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
4323 && data.last_start_min == 0 && data.last_end > 0
4324 && !RExC_seen_zerolen
4325 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4326 r->extflags |= RXf_CHECK_ALL;
4327 scan_commit(pRExC_state, &data,&minlen);
4328 SvREFCNT_dec(data.last_found);
4330 /* Note that code very similar to this but for anchored string
4331 follows immediately below, changes may need to be made to both.
4334 longest_float_length = CHR_SVLEN(data.longest_float);
4335 if (longest_float_length
4336 || (data.flags & SF_FL_BEFORE_EOL
4337 && (!(data.flags & SF_FL_BEFORE_MEOL)
4338 || (RExC_flags & RXf_PMf_MULTILINE))))
4342 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
4343 && data.offset_fixed == data.offset_float_min
4344 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4345 goto remove_float; /* As in (a)+. */
4347 /* copy the information about the longest float from the reg_scan_data
4348 over to the program. */
4349 if (SvUTF8(data.longest_float)) {
4350 r->float_utf8 = data.longest_float;
4351 r->float_substr = NULL;
4353 r->float_substr = data.longest_float;
4354 r->float_utf8 = NULL;
4356 /* float_end_shift is how many chars that must be matched that
4357 follow this item. We calculate it ahead of time as once the
4358 lookbehind offset is added in we lose the ability to correctly
4360 ml = data.minlen_float ? *(data.minlen_float)
4361 : (I32)longest_float_length;
4362 r->float_end_shift = ml - data.offset_float_min
4363 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4364 + data.lookbehind_float;
4365 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
4366 r->float_max_offset = data.offset_float_max;
4367 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
4368 r->float_max_offset -= data.lookbehind_float;
4370 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4371 && (!(data.flags & SF_FL_BEFORE_MEOL)
4372 || (RExC_flags & RXf_PMf_MULTILINE)));
4373 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
4377 r->float_substr = r->float_utf8 = NULL;
4378 SvREFCNT_dec(data.longest_float);
4379 longest_float_length = 0;
4382 /* Note that code very similar to this but for floating string
4383 is immediately above, changes may need to be made to both.
4386 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
4387 if (longest_fixed_length
4388 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4389 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4390 || (RExC_flags & RXf_PMf_MULTILINE))))
4394 /* copy the information about the longest fixed
4395 from the reg_scan_data over to the program. */
4396 if (SvUTF8(data.longest_fixed)) {
4397 r->anchored_utf8 = data.longest_fixed;
4398 r->anchored_substr = NULL;
4400 r->anchored_substr = data.longest_fixed;
4401 r->anchored_utf8 = NULL;
4403 /* fixed_end_shift is how many chars that must be matched that
4404 follow this item. We calculate it ahead of time as once the
4405 lookbehind offset is added in we lose the ability to correctly
4407 ml = data.minlen_fixed ? *(data.minlen_fixed)
4408 : (I32)longest_fixed_length;
4409 r->anchored_end_shift = ml - data.offset_fixed
4410 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4411 + data.lookbehind_fixed;
4412 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4414 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4415 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4416 || (RExC_flags & RXf_PMf_MULTILINE)));
4417 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
4420 r->anchored_substr = r->anchored_utf8 = NULL;
4421 SvREFCNT_dec(data.longest_fixed);
4422 longest_fixed_length = 0;
4425 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4426 ri->regstclass = NULL;
4427 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4429 && !(data.start_class->flags & ANYOF_EOS)
4430 && !cl_is_anything(data.start_class))
4432 const U32 n = add_data(pRExC_state, 1, "f");
4434 Newx(RExC_rxi->data->data[n], 1,
4435 struct regnode_charclass_class);
4436 StructCopy(data.start_class,
4437 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4438 struct regnode_charclass_class);
4439 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
4440 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
4441 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
4442 regprop(r, sv, (regnode*)data.start_class);
4443 PerlIO_printf(Perl_debug_log,
4444 "synthetic stclass \"%s\".\n",
4445 SvPVX_const(sv));});
4448 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
4449 if (longest_fixed_length > longest_float_length) {
4450 r->check_end_shift = r->anchored_end_shift;
4451 r->check_substr = r->anchored_substr;
4452 r->check_utf8 = r->anchored_utf8;
4453 r->check_offset_min = r->check_offset_max = r->anchored_offset;
4454 if (r->extflags & RXf_ANCH_SINGLE)
4455 r->extflags |= RXf_NOSCAN;
4458 r->check_end_shift = r->float_end_shift;
4459 r->check_substr = r->float_substr;
4460 r->check_utf8 = r->float_utf8;
4461 r->check_offset_min = r->float_min_offset;
4462 r->check_offset_max = r->float_max_offset;
4464 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4465 This should be changed ASAP! */
4466 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
4467 r->extflags |= RXf_USE_INTUIT;
4468 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
4469 r->extflags |= RXf_INTUIT_TAIL;
4471 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4472 if ( (STRLEN)minlen < longest_float_length )
4473 minlen= longest_float_length;
4474 if ( (STRLEN)minlen < longest_fixed_length )
4475 minlen= longest_fixed_length;
4479 /* Several toplevels. Best we can is to set minlen. */
4481 struct regnode_charclass_class ch_class;
4484 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
4486 scan = ri->program + 1;
4487 cl_init(pRExC_state, &ch_class);
4488 data.start_class = &ch_class;
4489 data.last_closep = &last_close;
4492 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
4493 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
4497 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
4498 = r->float_substr = r->float_utf8 = NULL;
4499 if (!(data.start_class->flags & ANYOF_EOS)
4500 && !cl_is_anything(data.start_class))
4502 const U32 n = add_data(pRExC_state, 1, "f");
4504 Newx(RExC_rxi->data->data[n], 1,
4505 struct regnode_charclass_class);
4506 StructCopy(data.start_class,
4507 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4508 struct regnode_charclass_class);
4509 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
4510 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
4511 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
4512 regprop(r, sv, (regnode*)data.start_class);
4513 PerlIO_printf(Perl_debug_log,
4514 "synthetic stclass \"%s\".\n",
4515 SvPVX_const(sv));});
4519 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4520 the "real" pattern. */
4522 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
4523 (IV)minlen, (IV)r->minlen);
4525 r->minlenret = minlen;
4526 if (r->minlen < minlen)
4529 if (RExC_seen & REG_SEEN_GPOS)
4530 r->extflags |= RXf_GPOS_SEEN;
4531 if (RExC_seen & REG_SEEN_LOOKBEHIND)
4532 r->extflags |= RXf_LOOKBEHIND_SEEN;
4533 if (RExC_seen & REG_SEEN_EVAL)
4534 r->extflags |= RXf_EVAL_SEEN;
4535 if (RExC_seen & REG_SEEN_CANY)
4536 r->extflags |= RXf_CANY_SEEN;
4537 if (RExC_seen & REG_SEEN_VERBARG)
4538 r->intflags |= PREGf_VERBARG_SEEN;
4539 if (RExC_seen & REG_SEEN_CUTGROUP)
4540 r->intflags |= PREGf_CUTGROUP_SEEN;
4541 if (RExC_paren_names)
4542 r->paren_names = (HV*)SvREFCNT_inc(RExC_paren_names);
4544 r->paren_names = NULL;
4546 if (RExC_recurse_count) {
4547 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
4548 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
4549 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
4552 Newxz(r->startp, RExC_npar, I32);
4553 Newxz(r->endp, RExC_npar, I32);
4554 /* assume we don't need to swap parens around before we match */
4557 PerlIO_printf(Perl_debug_log,"Final program:\n");
4560 DEBUG_OFFSETS_r(if (ri->offsets) {
4561 const U32 len = ri->offsets[0];
4563 GET_RE_DEBUG_FLAGS_DECL;
4564 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->offsets[0]);
4565 for (i = 1; i <= len; i++) {
4566 if (ri->offsets[i*2-1] || ri->offsets[i*2])
4567 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
4568 (UV)i, (UV)ri->offsets[i*2-1], (UV)ri->offsets[i*2]);
4570 PerlIO_printf(Perl_debug_log, "\n");
4576 #undef CORE_ONLY_BLOCK
4578 #undef RE_ENGINE_PTR
4580 #ifndef PERL_IN_XSUB_RE
4582 Perl_reg_named_buff_sv(pTHX_ SV* namesv)
4584 I32 parno = 0; /* no match */
4586 const REGEXP * const rx = PM_GETRE(PL_curpm);
4587 if (rx && rx->paren_names) {
4588 HE *he_str = hv_fetch_ent( rx->paren_names, namesv, 0, 0 );
4591 SV* sv_dat=HeVAL(he_str);
4592 I32 *nums=(I32*)SvPVX(sv_dat);
4593 for ( i=0; i<SvIVX(sv_dat); i++ ) {
4594 if ((I32)(rx->lastparen) >= nums[i] &&
4595 rx->endp[nums[i]] != -1)
4608 SV *sv= sv_newmortal();
4609 Perl_sv_setpvf(aTHX_ sv, "%"IVdf,(IV)parno);
4610 gv_paren= Perl_gv_fetchsv(aTHX_ sv, GV_ADD, SVt_PVGV);
4611 return GvSVn(gv_paren);
4616 /* Scans the name of a named buffer from the pattern.
4617 * If flags is REG_RSN_RETURN_NULL returns null.
4618 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
4619 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
4620 * to the parsed name as looked up in the RExC_paren_names hash.
4621 * If there is an error throws a vFAIL().. type exception.
4624 #define REG_RSN_RETURN_NULL 0
4625 #define REG_RSN_RETURN_NAME 1
4626 #define REG_RSN_RETURN_DATA 2
4629 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) {
4630 char *name_start = RExC_parse;
4633 while( isIDFIRST_uni(utf8n_to_uvchr((U8*)RExC_parse,
4634 RExC_end - RExC_parse, &numlen, UTF8_ALLOW_DEFAULT)))
4636 RExC_parse += numlen;
4639 while( isIDFIRST(*RExC_parse) )
4643 SV* sv_name = sv_2mortal(Perl_newSVpvn(aTHX_ name_start,
4644 (int)(RExC_parse - name_start)));
4647 if ( flags == REG_RSN_RETURN_NAME)
4649 else if (flags==REG_RSN_RETURN_DATA) {
4652 if ( ! sv_name ) /* should not happen*/
4653 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
4654 if (RExC_paren_names)
4655 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
4657 sv_dat = HeVAL(he_str);
4659 vFAIL("Reference to nonexistent named group");
4663 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
4670 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
4671 int rem=(int)(RExC_end - RExC_parse); \
4680 if (RExC_lastparse!=RExC_parse) \
4681 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
4684 iscut ? "..." : "<" \
4687 PerlIO_printf(Perl_debug_log,"%16s",""); \
4692 num=REG_NODE_NUM(RExC_emit); \
4693 if (RExC_lastnum!=num) \
4694 PerlIO_printf(Perl_debug_log,"|%4d",num); \
4696 PerlIO_printf(Perl_debug_log,"|%4s",""); \
4697 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
4698 (int)((depth*2)), "", \
4702 RExC_lastparse=RExC_parse; \
4707 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
4708 DEBUG_PARSE_MSG((funcname)); \
4709 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
4711 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
4712 DEBUG_PARSE_MSG((funcname)); \
4713 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
4716 - reg - regular expression, i.e. main body or parenthesized thing
4718 * Caller must absorb opening parenthesis.
4720 * Combining parenthesis handling with the base level of regular expression
4721 * is a trifle forced, but the need to tie the tails of the branches to what
4722 * follows makes it hard to avoid.
4724 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
4726 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
4728 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
4731 /* this idea is borrowed from STR_WITH_LEN in handy.h */
4732 #define CHECK_WORD(s,v,l) \
4733 (((sizeof(s)-1)==(l)) && (strnEQ(start_verb, (s ""), (sizeof(s)-1))))
4736 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
4737 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
4740 register regnode *ret; /* Will be the head of the group. */
4741 register regnode *br;
4742 register regnode *lastbr;
4743 register regnode *ender = NULL;
4744 register I32 parno = 0;
4746 const I32 oregflags = RExC_flags;
4747 bool have_branch = 0;
4750 /* for (?g), (?gc), and (?o) warnings; warning
4751 about (?c) will warn about (?g) -- japhy */
4753 #define WASTED_O 0x01
4754 #define WASTED_G 0x02
4755 #define WASTED_C 0x04
4756 #define WASTED_GC (0x02|0x04)
4757 I32 wastedflags = 0x00;
4759 char * parse_start = RExC_parse; /* MJD */
4760 char * const oregcomp_parse = RExC_parse;
4762 GET_RE_DEBUG_FLAGS_DECL;
4763 DEBUG_PARSE("reg ");
4766 *flagp = 0; /* Tentatively. */
4769 /* Make an OPEN node, if parenthesized. */
4771 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
4772 char *start_verb = RExC_parse;
4773 STRLEN verb_len = 0;
4774 char *start_arg = NULL;
4775 unsigned char op = 0;
4777 int internal_argval = 0; /* internal_argval is only useful if !argok */
4778 while ( *RExC_parse && *RExC_parse != ')' ) {
4779 if ( *RExC_parse == ':' ) {
4780 start_arg = RExC_parse + 1;
4786 verb_len = RExC_parse - start_verb;
4789 while ( *RExC_parse && *RExC_parse != ')' )
4791 if ( *RExC_parse != ')' )
4792 vFAIL("Unterminated verb pattern argument");
4793 if ( RExC_parse == start_arg )
4796 if ( *RExC_parse != ')' )
4797 vFAIL("Unterminated verb pattern");
4800 switch ( *start_verb ) {
4801 case 'A': /* (*ACCEPT) */
4802 if ( CHECK_WORD("ACCEPT",start_verb,verb_len) ) {
4804 internal_argval = RExC_nestroot;
4807 case 'C': /* (*COMMIT) */
4808 if ( CHECK_WORD("COMMIT",start_verb,verb_len) )
4811 case 'F': /* (*FAIL) */
4812 if ( verb_len==1 || CHECK_WORD("FAIL",start_verb,verb_len) ) {
4817 case ':': /* (*:NAME) */
4818 case 'M': /* (*MARK:NAME) */
4819 if ( verb_len==0 || CHECK_WORD("MARK",start_verb,verb_len) ) {
4824 case 'P': /* (*PRUNE) */
4825 if ( CHECK_WORD("PRUNE",start_verb,verb_len) )
4828 case 'S': /* (*SKIP) */
4829 if ( CHECK_WORD("SKIP",start_verb,verb_len) )
4832 case 'T': /* (*THEN) */
4833 /* [19:06] <TimToady> :: is then */
4834 if ( CHECK_WORD("THEN",start_verb,verb_len) ) {
4836 RExC_seen |= REG_SEEN_CUTGROUP;
4842 vFAIL3("Unknown verb pattern '%.*s'",
4843 verb_len, start_verb);
4846 if ( start_arg && internal_argval ) {
4847 vFAIL3("Verb pattern '%.*s' may not have an argument",
4848 verb_len, start_verb);
4849 } else if ( argok < 0 && !start_arg ) {
4850 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
4851 verb_len, start_verb);
4853 ret = reganode(pRExC_state, op, internal_argval);
4854 if ( ! internal_argval && ! SIZE_ONLY ) {
4856 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
4857 ARG(ret) = add_data( pRExC_state, 1, "S" );
4858 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
4865 if (!internal_argval)
4866 RExC_seen |= REG_SEEN_VERBARG;
4867 } else if ( start_arg ) {
4868 vFAIL3("Verb pattern '%.*s' may not have an argument",
4869 verb_len, start_verb);
4871 ret = reg_node(pRExC_state, op);
4873 nextchar(pRExC_state);
4876 if (*RExC_parse == '?') { /* (?...) */
4877 U32 posflags = 0, negflags = 0;
4878 U32 *flagsp = &posflags;
4879 bool is_logical = 0;
4880 const char * const seqstart = RExC_parse;
4883 paren = *RExC_parse++;
4884 ret = NULL; /* For look-ahead/behind. */
4887 case '<': /* (?<...) */
4888 if (*RExC_parse == '!')
4890 else if (*RExC_parse != '=')
4895 case '\'': /* (?'...') */
4896 name_start= RExC_parse;
4897 svname = reg_scan_name(pRExC_state,
4898 SIZE_ONLY ? /* reverse test from the others */
4899 REG_RSN_RETURN_NAME :
4900 REG_RSN_RETURN_NULL);
4901 if (RExC_parse == name_start)
4903 if (*RExC_parse != paren)
4904 vFAIL2("Sequence (?%c... not terminated",
4905 paren=='>' ? '<' : paren);
4909 if (!svname) /* shouldnt happen */
4911 "panic: reg_scan_name returned NULL");
4912 if (!RExC_paren_names) {
4913 RExC_paren_names= newHV();
4914 sv_2mortal((SV*)RExC_paren_names);
4916 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
4918 sv_dat = HeVAL(he_str);
4920 /* croak baby croak */
4922 "panic: paren_name hash element allocation failed");
4923 } else if ( SvPOK(sv_dat) ) {
4924 IV count=SvIV(sv_dat);
4925 I32 *pv=(I32*)SvGROW(sv_dat,SvCUR(sv_dat)+sizeof(I32)+1);
4926 SvCUR_set(sv_dat,SvCUR(sv_dat)+sizeof(I32));
4927 pv[count]=RExC_npar;
4930 (void)SvUPGRADE(sv_dat,SVt_PVNV);
4931 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
4936 /*sv_dump(sv_dat);*/
4938 nextchar(pRExC_state);
4940 goto capturing_parens;
4942 RExC_seen |= REG_SEEN_LOOKBEHIND;
4944 case '=': /* (?=...) */
4945 case '!': /* (?!...) */
4946 RExC_seen_zerolen++;
4947 if (*RExC_parse == ')') {
4948 ret=reg_node(pRExC_state, OPFAIL);
4949 nextchar(pRExC_state);
4952 case ':': /* (?:...) */
4953 case '>': /* (?>...) */
4955 case '$': /* (?$...) */
4956 case '@': /* (?@...) */
4957 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
4959 case '#': /* (?#...) */
4960 while (*RExC_parse && *RExC_parse != ')')
4962 if (*RExC_parse != ')')
4963 FAIL("Sequence (?#... not terminated");
4964 nextchar(pRExC_state);
4967 case '0' : /* (?0) */
4968 case 'R' : /* (?R) */
4969 if (*RExC_parse != ')')
4970 FAIL("Sequence (?R) not terminated");
4971 ret = reg_node(pRExC_state, GOSTART);
4972 nextchar(pRExC_state);
4975 { /* named and numeric backreferences */
4978 case '&': /* (?&NAME) */
4979 parse_start = RExC_parse - 1;
4981 SV *sv_dat = reg_scan_name(pRExC_state,
4982 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
4983 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
4985 goto gen_recurse_regop;
4988 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
4990 vFAIL("Illegal pattern");
4992 goto parse_recursion;
4994 case '-': /* (?-1) */
4995 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
4996 RExC_parse--; /* rewind to let it be handled later */
5000 case '1': case '2': case '3': case '4': /* (?1) */
5001 case '5': case '6': case '7': case '8': case '9':
5004 num = atoi(RExC_parse);
5005 parse_start = RExC_parse - 1; /* MJD */
5006 if (*RExC_parse == '-')
5008 while (isDIGIT(*RExC_parse))
5010 if (*RExC_parse!=')')
5011 vFAIL("Expecting close bracket");
5014 if ( paren == '-' ) {
5016 Diagram of capture buffer numbering.
5017 Top line is the normal capture buffer numbers
5018 Botton line is the negative indexing as from
5022 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
5026 num = RExC_npar + num;
5029 vFAIL("Reference to nonexistent group");
5031 } else if ( paren == '+' ) {
5032 num = RExC_npar + num - 1;
5035 ret = reganode(pRExC_state, GOSUB, num);
5037 if (num > (I32)RExC_rx->nparens) {
5039 vFAIL("Reference to nonexistent group");
5041 ARG2L_SET( ret, RExC_recurse_count++);
5043 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5044 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
5048 RExC_seen |= REG_SEEN_RECURSE;
5049 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
5050 Set_Node_Offset(ret, parse_start); /* MJD */
5052 nextchar(pRExC_state);
5054 } /* named and numeric backreferences */
5057 case 'p': /* (?p...) */
5058 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
5059 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
5061 case '?': /* (??...) */
5063 if (*RExC_parse != '{')
5065 paren = *RExC_parse++;
5067 case '{': /* (?{...}) */
5072 char *s = RExC_parse;
5074 RExC_seen_zerolen++;
5075 RExC_seen |= REG_SEEN_EVAL;
5076 while (count && (c = *RExC_parse)) {
5087 if (*RExC_parse != ')') {
5089 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
5093 OP_4tree *sop, *rop;
5094 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
5097 Perl_save_re_context(aTHX);
5098 rop = sv_compile_2op(sv, &sop, "re", &pad);
5099 sop->op_private |= OPpREFCOUNTED;
5100 /* re_dup will OpREFCNT_inc */
5101 OpREFCNT_set(sop, 1);
5104 n = add_data(pRExC_state, 3, "nop");
5105 RExC_rxi->data->data[n] = (void*)rop;
5106 RExC_rxi->data->data[n+1] = (void*)sop;
5107 RExC_rxi->data->data[n+2] = (void*)pad;
5110 else { /* First pass */
5111 if (PL_reginterp_cnt < ++RExC_seen_evals
5113 /* No compiled RE interpolated, has runtime
5114 components ===> unsafe. */
5115 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5116 if (PL_tainting && PL_tainted)
5117 FAIL("Eval-group in insecure regular expression");
5118 #if PERL_VERSION > 8
5119 if (IN_PERL_COMPILETIME)
5124 nextchar(pRExC_state);
5126 ret = reg_node(pRExC_state, LOGICAL);
5129 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
5130 /* deal with the length of this later - MJD */
5133 ret = reganode(pRExC_state, EVAL, n);
5134 Set_Node_Length(ret, RExC_parse - parse_start + 1);
5135 Set_Node_Offset(ret, parse_start);
5138 case '(': /* (?(?{...})...) and (?(?=...)...) */
5141 if (RExC_parse[0] == '?') { /* (?(?...)) */
5142 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
5143 || RExC_parse[1] == '<'
5144 || RExC_parse[1] == '{') { /* Lookahead or eval. */
5147 ret = reg_node(pRExC_state, LOGICAL);
5150 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
5154 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
5155 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
5157 char ch = RExC_parse[0] == '<' ? '>' : '\'';
5158 char *name_start= RExC_parse++;
5160 SV *sv_dat=reg_scan_name(pRExC_state,
5161 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5162 if (RExC_parse == name_start || *RExC_parse != ch)
5163 vFAIL2("Sequence (?(%c... not terminated",
5164 (ch == '>' ? '<' : ch));
5167 num = add_data( pRExC_state, 1, "S" );
5168 RExC_rxi->data->data[num]=(void*)sv_dat;
5169 SvREFCNT_inc(sv_dat);
5171 ret = reganode(pRExC_state,NGROUPP,num);
5172 goto insert_if_check_paren;
5174 else if (RExC_parse[0] == 'D' &&
5175 RExC_parse[1] == 'E' &&
5176 RExC_parse[2] == 'F' &&
5177 RExC_parse[3] == 'I' &&
5178 RExC_parse[4] == 'N' &&
5179 RExC_parse[5] == 'E')
5181 ret = reganode(pRExC_state,DEFINEP,0);
5184 goto insert_if_check_paren;
5186 else if (RExC_parse[0] == 'R') {
5189 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5190 parno = atoi(RExC_parse++);
5191 while (isDIGIT(*RExC_parse))
5193 } else if (RExC_parse[0] == '&') {
5196 sv_dat = reg_scan_name(pRExC_state,
5197 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5198 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5200 ret = reganode(pRExC_state,INSUBP,parno);
5201 goto insert_if_check_paren;
5203 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5206 parno = atoi(RExC_parse++);
5208 while (isDIGIT(*RExC_parse))
5210 ret = reganode(pRExC_state, GROUPP, parno);
5212 insert_if_check_paren:
5213 if ((c = *nextchar(pRExC_state)) != ')')
5214 vFAIL("Switch condition not recognized");
5216 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
5217 br = regbranch(pRExC_state, &flags, 1,depth+1);
5219 br = reganode(pRExC_state, LONGJMP, 0);
5221 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
5222 c = *nextchar(pRExC_state);
5227 vFAIL("(?(DEFINE)....) does not allow branches");
5228 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
5229 regbranch(pRExC_state, &flags, 1,depth+1);
5230 REGTAIL(pRExC_state, ret, lastbr);
5233 c = *nextchar(pRExC_state);
5238 vFAIL("Switch (?(condition)... contains too many branches");
5239 ender = reg_node(pRExC_state, TAIL);
5240 REGTAIL(pRExC_state, br, ender);
5242 REGTAIL(pRExC_state, lastbr, ender);
5243 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
5246 REGTAIL(pRExC_state, ret, ender);
5250 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
5254 RExC_parse--; /* for vFAIL to print correctly */
5255 vFAIL("Sequence (? incomplete");
5259 parse_flags: /* (?i) */
5260 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
5261 /* (?g), (?gc) and (?o) are useless here
5262 and must be globally applied -- japhy */
5264 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
5265 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5266 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
5267 if (! (wastedflags & wflagbit) ) {
5268 wastedflags |= wflagbit;
5271 "Useless (%s%c) - %suse /%c modifier",
5272 flagsp == &negflags ? "?-" : "?",
5274 flagsp == &negflags ? "don't " : "",
5280 else if (*RExC_parse == 'c') {
5281 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5282 if (! (wastedflags & WASTED_C) ) {
5283 wastedflags |= WASTED_GC;
5286 "Useless (%sc) - %suse /gc modifier",
5287 flagsp == &negflags ? "?-" : "?",
5288 flagsp == &negflags ? "don't " : ""
5293 else { pmflag(flagsp, *RExC_parse); }
5297 if (*RExC_parse == '-') {
5299 wastedflags = 0; /* reset so (?g-c) warns twice */
5303 RExC_flags |= posflags;
5304 RExC_flags &= ~negflags;
5305 if (*RExC_parse == ':') {
5311 if (*RExC_parse != ')') {
5313 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5315 nextchar(pRExC_state);
5325 ret = reganode(pRExC_state, OPEN, parno);
5328 RExC_nestroot = parno;
5329 if (RExC_seen & REG_SEEN_RECURSE) {
5330 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5331 "Setting open paren #%"IVdf" to %d\n",
5332 (IV)parno, REG_NODE_NUM(ret)));
5333 RExC_open_parens[parno-1]= ret;
5336 Set_Node_Length(ret, 1); /* MJD */
5337 Set_Node_Offset(ret, RExC_parse); /* MJD */
5344 /* Pick up the branches, linking them together. */
5345 parse_start = RExC_parse; /* MJD */
5346 br = regbranch(pRExC_state, &flags, 1,depth+1);
5347 /* branch_len = (paren != 0); */
5351 if (*RExC_parse == '|') {
5352 if (!SIZE_ONLY && RExC_extralen) {
5353 reginsert(pRExC_state, BRANCHJ, br, depth+1);
5356 reginsert(pRExC_state, BRANCH, br, depth+1);
5357 Set_Node_Length(br, paren != 0);
5358 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
5362 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
5364 else if (paren == ':') {
5365 *flagp |= flags&SIMPLE;
5367 if (is_open) { /* Starts with OPEN. */
5368 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
5370 else if (paren != '?') /* Not Conditional */
5372 *flagp |= flags & (SPSTART | HASWIDTH);
5374 while (*RExC_parse == '|') {
5375 if (!SIZE_ONLY && RExC_extralen) {
5376 ender = reganode(pRExC_state, LONGJMP,0);
5377 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
5380 RExC_extralen += 2; /* Account for LONGJMP. */
5381 nextchar(pRExC_state);
5382 br = regbranch(pRExC_state, &flags, 0, depth+1);
5386 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
5390 *flagp |= flags&SPSTART;
5393 if (have_branch || paren != ':') {
5394 /* Make a closing node, and hook it on the end. */
5397 ender = reg_node(pRExC_state, TAIL);
5401 ender = reganode(pRExC_state, CLOSE, parno);
5402 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
5403 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5404 "Setting close paren #%"IVdf" to %d\n",
5405 (IV)parno, REG_NODE_NUM(ender)));
5406 RExC_close_parens[parno-1]= ender;
5407 if (RExC_nestroot == parno)
5410 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
5411 Set_Node_Length(ender,1); /* MJD */
5417 *flagp &= ~HASWIDTH;
5420 ender = reg_node(pRExC_state, SUCCEED);
5423 ender = reg_node(pRExC_state, END);
5425 assert(!RExC_opend); /* there can only be one! */
5430 REGTAIL(pRExC_state, lastbr, ender);
5432 if (have_branch && !SIZE_ONLY) {
5434 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
5436 /* Hook the tails of the branches to the closing node. */
5437 for (br = ret; br; br = regnext(br)) {
5438 const U8 op = PL_regkind[OP(br)];
5440 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
5442 else if (op == BRANCHJ) {
5443 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
5451 static const char parens[] = "=!<,>";
5453 if (paren && (p = strchr(parens, paren))) {
5454 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
5455 int flag = (p - parens) > 1;
5458 node = SUSPEND, flag = 0;
5459 reginsert(pRExC_state, node,ret, depth+1);
5460 Set_Node_Cur_Length(ret);
5461 Set_Node_Offset(ret, parse_start + 1);
5463 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
5467 /* Check for proper termination. */
5469 RExC_flags = oregflags;
5470 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
5471 RExC_parse = oregcomp_parse;
5472 vFAIL("Unmatched (");
5475 else if (!paren && RExC_parse < RExC_end) {
5476 if (*RExC_parse == ')') {
5478 vFAIL("Unmatched )");
5481 FAIL("Junk on end of regexp"); /* "Can't happen". */
5489 - regbranch - one alternative of an | operator
5491 * Implements the concatenation operator.
5494 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
5497 register regnode *ret;
5498 register regnode *chain = NULL;
5499 register regnode *latest;
5500 I32 flags = 0, c = 0;
5501 GET_RE_DEBUG_FLAGS_DECL;
5502 DEBUG_PARSE("brnc");
5506 if (!SIZE_ONLY && RExC_extralen)
5507 ret = reganode(pRExC_state, BRANCHJ,0);
5509 ret = reg_node(pRExC_state, BRANCH);
5510 Set_Node_Length(ret, 1);
5514 if (!first && SIZE_ONLY)
5515 RExC_extralen += 1; /* BRANCHJ */
5517 *flagp = WORST; /* Tentatively. */
5520 nextchar(pRExC_state);
5521 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
5523 latest = regpiece(pRExC_state, &flags,depth+1);
5524 if (latest == NULL) {
5525 if (flags & TRYAGAIN)
5529 else if (ret == NULL)
5531 *flagp |= flags&HASWIDTH;
5532 if (chain == NULL) /* First piece. */
5533 *flagp |= flags&SPSTART;
5536 REGTAIL(pRExC_state, chain, latest);
5541 if (chain == NULL) { /* Loop ran zero times. */
5542 chain = reg_node(pRExC_state, NOTHING);
5547 *flagp |= flags&SIMPLE;
5554 - regpiece - something followed by possible [*+?]
5556 * Note that the branching code sequences used for ? and the general cases
5557 * of * and + are somewhat optimized: they use the same NOTHING node as
5558 * both the endmarker for their branch list and the body of the last branch.
5559 * It might seem that this node could be dispensed with entirely, but the
5560 * endmarker role is not redundant.
5563 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
5566 register regnode *ret;
5568 register char *next;
5570 const char * const origparse = RExC_parse;
5572 I32 max = REG_INFTY;
5574 const char *maxpos = NULL;
5575 GET_RE_DEBUG_FLAGS_DECL;
5576 DEBUG_PARSE("piec");
5578 ret = regatom(pRExC_state, &flags,depth+1);
5580 if (flags & TRYAGAIN)
5587 if (op == '{' && regcurly(RExC_parse)) {
5589 parse_start = RExC_parse; /* MJD */
5590 next = RExC_parse + 1;
5591 while (isDIGIT(*next) || *next == ',') {
5600 if (*next == '}') { /* got one */
5604 min = atoi(RExC_parse);
5608 maxpos = RExC_parse;
5610 if (!max && *maxpos != '0')
5611 max = REG_INFTY; /* meaning "infinity" */
5612 else if (max >= REG_INFTY)
5613 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
5615 nextchar(pRExC_state);
5618 if ((flags&SIMPLE)) {
5619 RExC_naughty += 2 + RExC_naughty / 2;
5620 reginsert(pRExC_state, CURLY, ret, depth+1);
5621 Set_Node_Offset(ret, parse_start+1); /* MJD */
5622 Set_Node_Cur_Length(ret);
5625 regnode * const w = reg_node(pRExC_state, WHILEM);
5628 REGTAIL(pRExC_state, ret, w);
5629 if (!SIZE_ONLY && RExC_extralen) {
5630 reginsert(pRExC_state, LONGJMP,ret, depth+1);
5631 reginsert(pRExC_state, NOTHING,ret, depth+1);
5632 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
5634 reginsert(pRExC_state, CURLYX,ret, depth+1);
5636 Set_Node_Offset(ret, parse_start+1);
5637 Set_Node_Length(ret,
5638 op == '{' ? (RExC_parse - parse_start) : 1);
5640 if (!SIZE_ONLY && RExC_extralen)
5641 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
5642 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
5644 RExC_whilem_seen++, RExC_extralen += 3;
5645 RExC_naughty += 4 + RExC_naughty; /* compound interest */
5653 if (max && max < min)
5654 vFAIL("Can't do {n,m} with n > m");
5656 ARG1_SET(ret, (U16)min);
5657 ARG2_SET(ret, (U16)max);
5669 #if 0 /* Now runtime fix should be reliable. */
5671 /* if this is reinstated, don't forget to put this back into perldiag:
5673 =item Regexp *+ operand could be empty at {#} in regex m/%s/
5675 (F) The part of the regexp subject to either the * or + quantifier
5676 could match an empty string. The {#} shows in the regular
5677 expression about where the problem was discovered.
5681 if (!(flags&HASWIDTH) && op != '?')
5682 vFAIL("Regexp *+ operand could be empty");
5685 parse_start = RExC_parse;
5686 nextchar(pRExC_state);
5688 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
5690 if (op == '*' && (flags&SIMPLE)) {
5691 reginsert(pRExC_state, STAR, ret, depth+1);
5695 else if (op == '*') {
5699 else if (op == '+' && (flags&SIMPLE)) {
5700 reginsert(pRExC_state, PLUS, ret, depth+1);
5704 else if (op == '+') {
5708 else if (op == '?') {
5713 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
5715 "%.*s matches null string many times",
5716 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
5720 if (RExC_parse < RExC_end && *RExC_parse == '?') {
5721 nextchar(pRExC_state);
5722 reginsert(pRExC_state, MINMOD, ret, depth+1);
5723 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
5725 #ifndef REG_ALLOW_MINMOD_SUSPEND
5728 if (RExC_parse < RExC_end && *RExC_parse == '+') {
5730 nextchar(pRExC_state);
5731 ender = reg_node(pRExC_state, SUCCEED);
5732 REGTAIL(pRExC_state, ret, ender);
5733 reginsert(pRExC_state, SUSPEND, ret, depth+1);
5735 ender = reg_node(pRExC_state, TAIL);
5736 REGTAIL(pRExC_state, ret, ender);
5740 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
5742 vFAIL("Nested quantifiers");
5749 /* reg_namedseq(pRExC_state,UVp)
5751 This is expected to be called by a parser routine that has
5752 recognized'\N' and needs to handle the rest. RExC_parse is
5753 expected to point at the first char following the N at the time
5756 If valuep is non-null then it is assumed that we are parsing inside
5757 of a charclass definition and the first codepoint in the resolved
5758 string is returned via *valuep and the routine will return NULL.
5759 In this mode if a multichar string is returned from the charnames
5760 handler a warning will be issued, and only the first char in the
5761 sequence will be examined. If the string returned is zero length
5762 then the value of *valuep is undefined and NON-NULL will
5763 be returned to indicate failure. (This will NOT be a valid pointer
5766 If value is null then it is assumed that we are parsing normal text
5767 and inserts a new EXACT node into the program containing the resolved
5768 string and returns a pointer to the new node. If the string is
5769 zerolength a NOTHING node is emitted.
5771 On success RExC_parse is set to the char following the endbrace.
5772 Parsing failures will generate a fatal errorvia vFAIL(...)
5774 NOTE: We cache all results from the charnames handler locally in
5775 the RExC_charnames hash (created on first use) to prevent a charnames
5776 handler from playing silly-buggers and returning a short string and
5777 then a long string for a given pattern. Since the regexp program
5778 size is calculated during an initial parse this would result
5779 in a buffer overrun so we cache to prevent the charname result from
5780 changing during the course of the parse.
5784 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep)
5786 char * name; /* start of the content of the name */
5787 char * endbrace; /* endbrace following the name */
5790 STRLEN len; /* this has various purposes throughout the code */
5791 bool cached = 0; /* if this is true then we shouldn't refcount dev sv_str */
5792 regnode *ret = NULL;
5794 if (*RExC_parse != '{') {
5795 vFAIL("Missing braces on \\N{}");
5797 name = RExC_parse+1;
5798 endbrace = strchr(RExC_parse, '}');
5801 vFAIL("Missing right brace on \\N{}");
5803 RExC_parse = endbrace + 1;
5806 /* RExC_parse points at the beginning brace,
5807 endbrace points at the last */
5808 if ( name[0]=='U' && name[1]=='+' ) {
5809 /* its a "unicode hex" notation {U+89AB} */
5810 I32 fl = PERL_SCAN_ALLOW_UNDERSCORES
5811 | PERL_SCAN_DISALLOW_PREFIX
5812 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
5814 len = (STRLEN)(endbrace - name - 2);
5815 cp = grok_hex(name + 2, &len, &fl, NULL);
5816 if ( len != (STRLEN)(endbrace - name - 2) ) {
5825 sv_str= Perl_newSVpvf_nocontext("%c",(int)cp);
5827 /* fetch the charnames handler for this scope */
5828 HV * const table = GvHV(PL_hintgv);
5830 hv_fetchs(table, "charnames", FALSE) :
5832 SV *cv= cvp ? *cvp : NULL;
5835 /* create an SV with the name as argument */
5836 sv_name = newSVpvn(name, endbrace - name);
5838 if (!table || !(PL_hints & HINT_LOCALIZE_HH)) {
5839 vFAIL2("Constant(\\N{%s}) unknown: "
5840 "(possibly a missing \"use charnames ...\")",
5843 if (!cvp || !SvOK(*cvp)) { /* when $^H{charnames} = undef; */
5844 vFAIL2("Constant(\\N{%s}): "
5845 "$^H{charnames} is not defined",SvPVX(sv_name));
5850 if (!RExC_charnames) {
5851 /* make sure our cache is allocated */
5852 RExC_charnames = newHV();
5853 sv_2mortal((SV*)RExC_charnames);
5855 /* see if we have looked this one up before */
5856 he_str = hv_fetch_ent( RExC_charnames, sv_name, 0, 0 );
5858 sv_str = HeVAL(he_str);
5871 count= call_sv(cv, G_SCALAR);
5873 if (count == 1) { /* XXXX is this right? dmq */
5875 SvREFCNT_inc_simple_void(sv_str);
5883 if ( !sv_str || !SvOK(sv_str) ) {
5884 vFAIL2("Constant(\\N{%s}): Call to &{$^H{charnames}} "
5885 "did not return a defined value",SvPVX(sv_name));
5887 if (hv_store_ent( RExC_charnames, sv_name, sv_str, 0))
5892 char *p = SvPV(sv_str, len);
5895 if ( SvUTF8(sv_str) ) {
5896 *valuep = utf8_to_uvchr((U8*)p, &numlen);
5900 We have to turn on utf8 for high bit chars otherwise
5901 we get failures with
5903 "ss" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
5904 "SS" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
5906 This is different from what \x{} would do with the same
5907 codepoint, where the condition is > 0xFF.
5914 /* warn if we havent used the whole string? */
5916 if (numlen<len && SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5918 "Ignoring excess chars from \\N{%s} in character class",
5922 } else if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5924 "Ignoring zero length \\N{%s} in character class",
5929 SvREFCNT_dec(sv_name);
5931 SvREFCNT_dec(sv_str);
5932 return len ? NULL : (regnode *)&len;
5933 } else if(SvCUR(sv_str)) {
5938 char * parse_start = name-3; /* needed for the offsets */
5939 GET_RE_DEBUG_FLAGS_DECL; /* needed for the offsets */
5941 ret = reg_node(pRExC_state,
5942 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
5945 if ( RExC_utf8 && !SvUTF8(sv_str) ) {
5946 sv_utf8_upgrade(sv_str);
5947 } else if ( !RExC_utf8 && SvUTF8(sv_str) ) {
5951 p = SvPV(sv_str, len);
5953 /* len is the length written, charlen is the size the char read */
5954 for ( len = 0; p < pend; p += charlen ) {
5956 UV uvc = utf8_to_uvchr((U8*)p, &charlen);
5958 STRLEN foldlen,numlen;
5959 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
5960 uvc = toFOLD_uni(uvc, tmpbuf, &foldlen);
5961 /* Emit all the Unicode characters. */
5963 for (foldbuf = tmpbuf;
5967 uvc = utf8_to_uvchr(foldbuf, &numlen);
5969 const STRLEN unilen = reguni(pRExC_state, uvc, s);
5972 /* In EBCDIC the numlen
5973 * and unilen can differ. */
5975 if (numlen >= foldlen)
5979 break; /* "Can't happen." */
5982 const STRLEN unilen = reguni(pRExC_state, uvc, s);
5994 RExC_size += STR_SZ(len);
5997 RExC_emit += STR_SZ(len);
5999 Set_Node_Cur_Length(ret); /* MJD */
6001 nextchar(pRExC_state);
6003 ret = reg_node(pRExC_state,NOTHING);
6006 SvREFCNT_dec(sv_str);
6009 SvREFCNT_dec(sv_name);
6019 * It returns the code point in utf8 for the value in *encp.
6020 * value: a code value in the source encoding
6021 * encp: a pointer to an Encode object
6023 * If the result from Encode is not a single character,
6024 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
6027 S_reg_recode(pTHX_ const char value, SV **encp)
6030 SV * const sv = sv_2mortal(newSVpvn(&value, numlen));
6031 const char * const s = encp && *encp ? sv_recode_to_utf8(sv, *encp)
6033 const STRLEN newlen = SvCUR(sv);
6034 UV uv = UNICODE_REPLACEMENT;
6038 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
6041 if (!newlen || numlen != newlen) {
6042 uv = UNICODE_REPLACEMENT;
6051 - regatom - the lowest level
6053 * Optimization: gobbles an entire sequence of ordinary characters so that
6054 * it can turn them into a single node, which is smaller to store and
6055 * faster to run. Backslashed characters are exceptions, each becoming a
6056 * separate node; the code is simpler that way and it's not worth fixing.
6058 * [Yes, it is worth fixing, some scripts can run twice the speed.]
6059 * [It looks like its ok, as in S_study_chunk we merge adjacent EXACT nodes]
6062 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6065 register regnode *ret = NULL;
6067 char *parse_start = RExC_parse;
6068 GET_RE_DEBUG_FLAGS_DECL;
6069 DEBUG_PARSE("atom");
6070 *flagp = WORST; /* Tentatively. */
6073 switch (*RExC_parse) {
6075 RExC_seen_zerolen++;
6076 nextchar(pRExC_state);
6077 if (RExC_flags & RXf_PMf_MULTILINE)
6078 ret = reg_node(pRExC_state, MBOL);
6079 else if (RExC_flags & RXf_PMf_SINGLELINE)
6080 ret = reg_node(pRExC_state, SBOL);
6082 ret = reg_node(pRExC_state, BOL);
6083 Set_Node_Length(ret, 1); /* MJD */
6086 nextchar(pRExC_state);
6088 RExC_seen_zerolen++;
6089 if (RExC_flags & RXf_PMf_MULTILINE)
6090 ret = reg_node(pRExC_state, MEOL);
6091 else if (RExC_flags & RXf_PMf_SINGLELINE)
6092 ret = reg_node(pRExC_state, SEOL);
6094 ret = reg_node(pRExC_state, EOL);
6095 Set_Node_Length(ret, 1); /* MJD */
6098 nextchar(pRExC_state);
6099 if (RExC_flags & RXf_PMf_SINGLELINE)
6100 ret = reg_node(pRExC_state, SANY);
6102 ret = reg_node(pRExC_state, REG_ANY);
6103 *flagp |= HASWIDTH|SIMPLE;
6105 Set_Node_Length(ret, 1); /* MJD */
6109 char * const oregcomp_parse = ++RExC_parse;
6110 ret = regclass(pRExC_state,depth+1);
6111 if (*RExC_parse != ']') {
6112 RExC_parse = oregcomp_parse;
6113 vFAIL("Unmatched [");
6115 nextchar(pRExC_state);
6116 *flagp |= HASWIDTH|SIMPLE;
6117 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
6121 nextchar(pRExC_state);
6122 ret = reg(pRExC_state, 1, &flags,depth+1);
6124 if (flags & TRYAGAIN) {
6125 if (RExC_parse == RExC_end) {
6126 /* Make parent create an empty node if needed. */
6134 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
6138 if (flags & TRYAGAIN) {
6142 vFAIL("Internal urp");
6143 /* Supposed to be caught earlier. */
6146 if (!regcurly(RExC_parse)) {
6155 vFAIL("Quantifier follows nothing");
6158 switch (*++RExC_parse) {
6160 RExC_seen_zerolen++;
6161 ret = reg_node(pRExC_state, SBOL);
6163 nextchar(pRExC_state);
6164 Set_Node_Length(ret, 2); /* MJD */
6167 ret = reg_node(pRExC_state, GPOS);
6168 RExC_seen |= REG_SEEN_GPOS;
6170 nextchar(pRExC_state);
6171 Set_Node_Length(ret, 2); /* MJD */
6174 ret = reg_node(pRExC_state, SEOL);
6176 RExC_seen_zerolen++; /* Do not optimize RE away */
6177 nextchar(pRExC_state);
6180 ret = reg_node(pRExC_state, EOS);
6182 RExC_seen_zerolen++; /* Do not optimize RE away */
6183 nextchar(pRExC_state);
6184 Set_Node_Length(ret, 2); /* MJD */
6187 ret = reg_node(pRExC_state, CANY);
6188 RExC_seen |= REG_SEEN_CANY;
6189 *flagp |= HASWIDTH|SIMPLE;
6190 nextchar(pRExC_state);
6191 Set_Node_Length(ret, 2); /* MJD */
6194 ret = reg_node(pRExC_state, CLUMP);
6196 nextchar(pRExC_state);
6197 Set_Node_Length(ret, 2); /* MJD */
6200 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
6201 *flagp |= HASWIDTH|SIMPLE;
6202 nextchar(pRExC_state);
6203 Set_Node_Length(ret, 2); /* MJD */
6206 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
6207 *flagp |= HASWIDTH|SIMPLE;
6208 nextchar(pRExC_state);
6209 Set_Node_Length(ret, 2); /* MJD */
6212 RExC_seen_zerolen++;
6213 RExC_seen |= REG_SEEN_LOOKBEHIND;
6214 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
6216 nextchar(pRExC_state);
6217 Set_Node_Length(ret, 2); /* MJD */
6220 RExC_seen_zerolen++;
6221 RExC_seen |= REG_SEEN_LOOKBEHIND;
6222 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
6224 nextchar(pRExC_state);
6225 Set_Node_Length(ret, 2); /* MJD */
6228 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
6229 *flagp |= HASWIDTH|SIMPLE;
6230 nextchar(pRExC_state);
6231 Set_Node_Length(ret, 2); /* MJD */
6234 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
6235 *flagp |= HASWIDTH|SIMPLE;
6236 nextchar(pRExC_state);
6237 Set_Node_Length(ret, 2); /* MJD */
6240 ret = reg_node(pRExC_state, DIGIT);
6241 *flagp |= HASWIDTH|SIMPLE;
6242 nextchar(pRExC_state);
6243 Set_Node_Length(ret, 2); /* MJD */
6246 ret = reg_node(pRExC_state, NDIGIT);
6247 *flagp |= HASWIDTH|SIMPLE;
6248 nextchar(pRExC_state);
6249 Set_Node_Length(ret, 2); /* MJD */
6254 char* const oldregxend = RExC_end;
6255 char* parse_start = RExC_parse - 2;
6257 if (RExC_parse[1] == '{') {
6258 /* a lovely hack--pretend we saw [\pX] instead */
6259 RExC_end = strchr(RExC_parse, '}');
6261 const U8 c = (U8)*RExC_parse;
6263 RExC_end = oldregxend;
6264 vFAIL2("Missing right brace on \\%c{}", c);
6269 RExC_end = RExC_parse + 2;
6270 if (RExC_end > oldregxend)
6271 RExC_end = oldregxend;
6275 ret = regclass(pRExC_state,depth+1);
6277 RExC_end = oldregxend;
6280 Set_Node_Offset(ret, parse_start + 2);
6281 Set_Node_Cur_Length(ret);
6282 nextchar(pRExC_state);
6283 *flagp |= HASWIDTH|SIMPLE;
6287 /* Handle \N{NAME} here and not below because it can be
6288 multicharacter. join_exact() will join them up later on.
6289 Also this makes sure that things like /\N{BLAH}+/ and
6290 \N{BLAH} being multi char Just Happen. dmq*/
6292 ret= reg_namedseq(pRExC_state, NULL);
6294 case 'k': /* Handle \k<NAME> and \k'NAME' */
6296 char ch= RExC_parse[1];
6297 if (ch != '<' && ch != '\'') {
6299 vWARN( RExC_parse + 1,
6300 "Possible broken named back reference treated as literal k");
6304 char* name_start = (RExC_parse += 2);
6306 SV *sv_dat = reg_scan_name(pRExC_state,
6307 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6308 ch= (ch == '<') ? '>' : '\'';
6310 if (RExC_parse == name_start || *RExC_parse != ch)
6311 vFAIL2("Sequence \\k%c... not terminated",
6312 (ch == '>' ? '<' : ch));
6315 ret = reganode(pRExC_state,
6316 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
6322 num = add_data( pRExC_state, 1, "S" );
6324 RExC_rxi->data->data[num]=(void*)sv_dat;
6325 SvREFCNT_inc(sv_dat);
6327 /* override incorrect value set in reganode MJD */
6328 Set_Node_Offset(ret, parse_start+1);
6329 Set_Node_Cur_Length(ret); /* MJD */
6330 nextchar(pRExC_state);
6346 case '1': case '2': case '3': case '4':
6347 case '5': case '6': case '7': case '8': case '9':
6350 bool isrel=(*RExC_parse=='R');
6353 num = atoi(RExC_parse);
6355 num = RExC_cpar - num;
6357 vFAIL("Reference to nonexistent or unclosed group");
6359 if (num > 9 && num >= RExC_npar)
6362 char * const parse_start = RExC_parse - 1; /* MJD */
6363 while (isDIGIT(*RExC_parse))
6367 if (num > (I32)RExC_rx->nparens)
6368 vFAIL("Reference to nonexistent group");
6369 /* People make this error all the time apparently.
6370 So we cant fail on it, even though we should
6372 else if (num >= RExC_cpar)
6373 vFAIL("Reference to unclosed group will always match");
6377 ret = reganode(pRExC_state,
6378 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
6382 /* override incorrect value set in reganode MJD */
6383 Set_Node_Offset(ret, parse_start+1);
6384 Set_Node_Cur_Length(ret); /* MJD */
6386 nextchar(pRExC_state);
6391 if (RExC_parse >= RExC_end)
6392 FAIL("Trailing \\");
6395 /* Do not generate "unrecognized" warnings here, we fall
6396 back into the quick-grab loop below */
6403 if (RExC_flags & RXf_PMf_EXTENDED) {
6404 while (RExC_parse < RExC_end && *RExC_parse != '\n')
6406 if (RExC_parse < RExC_end)
6412 register STRLEN len;
6417 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
6419 parse_start = RExC_parse - 1;
6425 ret = reg_node(pRExC_state,
6426 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
6428 for (len = 0, p = RExC_parse - 1;
6429 len < 127 && p < RExC_end;
6432 char * const oldp = p;
6434 if (RExC_flags & RXf_PMf_EXTENDED)
6435 p = regwhite(p, RExC_end);
6484 ender = ASCII_TO_NATIVE('\033');
6488 ender = ASCII_TO_NATIVE('\007');
6493 char* const e = strchr(p, '}');
6497 vFAIL("Missing right brace on \\x{}");
6500 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6501 | PERL_SCAN_DISALLOW_PREFIX;
6502 STRLEN numlen = e - p - 1;
6503 ender = grok_hex(p + 1, &numlen, &flags, NULL);
6510 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
6512 ender = grok_hex(p, &numlen, &flags, NULL);
6515 if (PL_encoding && ender < 0x100)
6516 goto recode_encoding;
6520 ender = UCHARAT(p++);
6521 ender = toCTRL(ender);
6523 case '0': case '1': case '2': case '3':case '4':
6524 case '5': case '6': case '7': case '8':case '9':
6526 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
6529 ender = grok_oct(p, &numlen, &flags, NULL);
6536 if (PL_encoding && ender < 0x100)
6537 goto recode_encoding;
6541 SV* enc = PL_encoding;
6542 ender = reg_recode((const char)(U8)ender, &enc);
6543 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
6544 vWARN(p, "Invalid escape in the specified encoding");
6550 FAIL("Trailing \\");
6553 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
6554 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
6555 goto normal_default;
6560 if (UTF8_IS_START(*p) && UTF) {
6562 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
6563 &numlen, UTF8_ALLOW_DEFAULT);
6570 if (RExC_flags & RXf_PMf_EXTENDED)
6571 p = regwhite(p, RExC_end);
6573 /* Prime the casefolded buffer. */
6574 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
6576 if (ISMULT2(p)) { /* Back off on ?+*. */
6581 /* Emit all the Unicode characters. */
6583 for (foldbuf = tmpbuf;
6585 foldlen -= numlen) {
6586 ender = utf8_to_uvchr(foldbuf, &numlen);
6588 const STRLEN unilen = reguni(pRExC_state, ender, s);
6591 /* In EBCDIC the numlen
6592 * and unilen can differ. */
6594 if (numlen >= foldlen)
6598 break; /* "Can't happen." */
6602 const STRLEN unilen = reguni(pRExC_state, ender, s);
6611 REGC((char)ender, s++);
6617 /* Emit all the Unicode characters. */
6619 for (foldbuf = tmpbuf;
6621 foldlen -= numlen) {
6622 ender = utf8_to_uvchr(foldbuf, &numlen);
6624 const STRLEN unilen = reguni(pRExC_state, ender, s);
6627 /* In EBCDIC the numlen
6628 * and unilen can differ. */
6630 if (numlen >= foldlen)
6638 const STRLEN unilen = reguni(pRExC_state, ender, s);
6647 REGC((char)ender, s++);
6651 Set_Node_Cur_Length(ret); /* MJD */
6652 nextchar(pRExC_state);
6654 /* len is STRLEN which is unsigned, need to copy to signed */
6657 vFAIL("Internal disaster");
6661 if (len == 1 && UNI_IS_INVARIANT(ender))
6665 RExC_size += STR_SZ(len);
6668 RExC_emit += STR_SZ(len);
6678 S_regwhite(char *p, const char *e)
6683 else if (*p == '#') {
6686 } while (p < e && *p != '\n');
6694 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
6695 Character classes ([:foo:]) can also be negated ([:^foo:]).
6696 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
6697 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
6698 but trigger failures because they are currently unimplemented. */
6700 #define POSIXCC_DONE(c) ((c) == ':')
6701 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
6702 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
6705 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
6708 I32 namedclass = OOB_NAMEDCLASS;
6710 if (value == '[' && RExC_parse + 1 < RExC_end &&
6711 /* I smell either [: or [= or [. -- POSIX has been here, right? */
6712 POSIXCC(UCHARAT(RExC_parse))) {
6713 const char c = UCHARAT(RExC_parse);
6714 char* const s = RExC_parse++;
6716 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
6718 if (RExC_parse == RExC_end)
6719 /* Grandfather lone [:, [=, [. */
6722 const char* const t = RExC_parse++; /* skip over the c */
6725 if (UCHARAT(RExC_parse) == ']') {
6726 const char *posixcc = s + 1;
6727 RExC_parse++; /* skip over the ending ] */
6730 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
6731 const I32 skip = t - posixcc;
6733 /* Initially switch on the length of the name. */
6736 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
6737 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
6740 /* Names all of length 5. */
6741 /* alnum alpha ascii blank cntrl digit graph lower
6742 print punct space upper */
6743 /* Offset 4 gives the best switch position. */
6744 switch (posixcc[4]) {
6746 if (memEQ(posixcc, "alph", 4)) /* alpha */
6747 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
6750 if (memEQ(posixcc, "spac", 4)) /* space */
6751 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
6754 if (memEQ(posixcc, "grap", 4)) /* graph */
6755 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
6758 if (memEQ(posixcc, "asci", 4)) /* ascii */
6759 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
6762 if (memEQ(posixcc, "blan", 4)) /* blank */
6763 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
6766 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
6767 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
6770 if (memEQ(posixcc, "alnu", 4)) /* alnum */
6771 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
6774 if (memEQ(posixcc, "lowe", 4)) /* lower */
6775 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
6776 else if (memEQ(posixcc, "uppe", 4)) /* upper */
6777 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
6780 if (memEQ(posixcc, "digi", 4)) /* digit */
6781 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
6782 else if (memEQ(posixcc, "prin", 4)) /* print */
6783 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
6784 else if (memEQ(posixcc, "punc", 4)) /* punct */
6785 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
6790 if (memEQ(posixcc, "xdigit", 6))
6791 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
6795 if (namedclass == OOB_NAMEDCLASS)
6796 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
6798 assert (posixcc[skip] == ':');
6799 assert (posixcc[skip+1] == ']');
6800 } else if (!SIZE_ONLY) {
6801 /* [[=foo=]] and [[.foo.]] are still future. */
6803 /* adjust RExC_parse so the warning shows after
6805 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
6807 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
6810 /* Maternal grandfather:
6811 * "[:" ending in ":" but not in ":]" */
6821 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
6824 if (POSIXCC(UCHARAT(RExC_parse))) {
6825 const char *s = RExC_parse;
6826 const char c = *s++;
6830 if (*s && c == *s && s[1] == ']') {
6831 if (ckWARN(WARN_REGEXP))
6833 "POSIX syntax [%c %c] belongs inside character classes",
6836 /* [[=foo=]] and [[.foo.]] are still future. */
6837 if (POSIXCC_NOTYET(c)) {
6838 /* adjust RExC_parse so the error shows after
6840 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
6842 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
6850 parse a class specification and produce either an ANYOF node that
6851 matches the pattern. If the pattern matches a single char only and
6852 that char is < 256 then we produce an EXACT node instead.
6855 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
6858 register UV value = 0;
6859 register UV nextvalue;
6860 register IV prevvalue = OOB_UNICODE;
6861 register IV range = 0;
6862 register regnode *ret;
6865 char *rangebegin = NULL;
6866 bool need_class = 0;
6869 bool optimize_invert = TRUE;
6870 AV* unicode_alternate = NULL;
6872 UV literal_endpoint = 0;
6874 UV stored = 0; /* number of chars stored in the class */
6876 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
6877 case we need to change the emitted regop to an EXACT. */
6878 const char * orig_parse = RExC_parse;
6879 GET_RE_DEBUG_FLAGS_DECL;
6881 PERL_UNUSED_ARG(depth);
6884 DEBUG_PARSE("clas");
6886 /* Assume we are going to generate an ANYOF node. */
6887 ret = reganode(pRExC_state, ANYOF, 0);
6890 ANYOF_FLAGS(ret) = 0;
6892 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
6896 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
6900 RExC_size += ANYOF_SKIP;
6901 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
6904 RExC_emit += ANYOF_SKIP;
6906 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
6908 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
6909 ANYOF_BITMAP_ZERO(ret);
6910 listsv = newSVpvs("# comment\n");
6913 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
6915 if (!SIZE_ONLY && POSIXCC(nextvalue))
6916 checkposixcc(pRExC_state);
6918 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
6919 if (UCHARAT(RExC_parse) == ']')
6923 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
6927 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
6930 rangebegin = RExC_parse;
6932 value = utf8n_to_uvchr((U8*)RExC_parse,
6933 RExC_end - RExC_parse,
6934 &numlen, UTF8_ALLOW_DEFAULT);
6935 RExC_parse += numlen;
6938 value = UCHARAT(RExC_parse++);
6940 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
6941 if (value == '[' && POSIXCC(nextvalue))
6942 namedclass = regpposixcc(pRExC_state, value);
6943 else if (value == '\\') {
6945 value = utf8n_to_uvchr((U8*)RExC_parse,
6946 RExC_end - RExC_parse,
6947 &numlen, UTF8_ALLOW_DEFAULT);
6948 RExC_parse += numlen;
6951 value = UCHARAT(RExC_parse++);
6952 /* Some compilers cannot handle switching on 64-bit integer
6953 * values, therefore value cannot be an UV. Yes, this will
6954 * be a problem later if we want switch on Unicode.
6955 * A similar issue a little bit later when switching on
6956 * namedclass. --jhi */
6957 switch ((I32)value) {
6958 case 'w': namedclass = ANYOF_ALNUM; break;
6959 case 'W': namedclass = ANYOF_NALNUM; break;
6960 case 's': namedclass = ANYOF_SPACE; break;
6961 case 'S': namedclass = ANYOF_NSPACE; break;
6962 case 'd': namedclass = ANYOF_DIGIT; break;
6963 case 'D': namedclass = ANYOF_NDIGIT; break;
6964 case 'N': /* Handle \N{NAME} in class */
6966 /* We only pay attention to the first char of
6967 multichar strings being returned. I kinda wonder
6968 if this makes sense as it does change the behaviour
6969 from earlier versions, OTOH that behaviour was broken
6971 UV v; /* value is register so we cant & it /grrr */
6972 if (reg_namedseq(pRExC_state, &v)) {
6982 if (RExC_parse >= RExC_end)
6983 vFAIL2("Empty \\%c{}", (U8)value);
6984 if (*RExC_parse == '{') {
6985 const U8 c = (U8)value;
6986 e = strchr(RExC_parse++, '}');
6988 vFAIL2("Missing right brace on \\%c{}", c);
6989 while (isSPACE(UCHARAT(RExC_parse)))
6991 if (e == RExC_parse)
6992 vFAIL2("Empty \\%c{}", c);
6994 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
7002 if (UCHARAT(RExC_parse) == '^') {
7005 value = value == 'p' ? 'P' : 'p'; /* toggle */
7006 while (isSPACE(UCHARAT(RExC_parse))) {
7011 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
7012 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
7015 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
7016 namedclass = ANYOF_MAX; /* no official name, but it's named */
7019 case 'n': value = '\n'; break;
7020 case 'r': value = '\r'; break;
7021 case 't': value = '\t'; break;
7022 case 'f': value = '\f'; break;
7023 case 'b': value = '\b'; break;
7024 case 'e': value = ASCII_TO_NATIVE('\033');break;
7025 case 'a': value = ASCII_TO_NATIVE('\007');break;
7027 if (*RExC_parse == '{') {
7028 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7029 | PERL_SCAN_DISALLOW_PREFIX;
7030 char * const e = strchr(RExC_parse++, '}');
7032 vFAIL("Missing right brace on \\x{}");
7034 numlen = e - RExC_parse;
7035 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
7039 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
7041 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
7042 RExC_parse += numlen;
7044 if (PL_encoding && value < 0x100)
7045 goto recode_encoding;
7048 value = UCHARAT(RExC_parse++);
7049 value = toCTRL(value);
7051 case '0': case '1': case '2': case '3': case '4':
7052 case '5': case '6': case '7': case '8': case '9':
7056 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
7057 RExC_parse += numlen;
7058 if (PL_encoding && value < 0x100)
7059 goto recode_encoding;
7064 SV* enc = PL_encoding;
7065 value = reg_recode((const char)(U8)value, &enc);
7066 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
7068 "Invalid escape in the specified encoding");
7072 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
7074 "Unrecognized escape \\%c in character class passed through",
7078 } /* end of \blah */
7084 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
7086 if (!SIZE_ONLY && !need_class)
7087 ANYOF_CLASS_ZERO(ret);
7091 /* a bad range like a-\d, a-[:digit:] ? */
7094 if (ckWARN(WARN_REGEXP)) {
7096 RExC_parse >= rangebegin ?
7097 RExC_parse - rangebegin : 0;
7099 "False [] range \"%*.*s\"",
7102 if (prevvalue < 256) {
7103 ANYOF_BITMAP_SET(ret, prevvalue);
7104 ANYOF_BITMAP_SET(ret, '-');
7107 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
7108 Perl_sv_catpvf(aTHX_ listsv,
7109 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
7113 range = 0; /* this was not a true range */
7117 const char *what = NULL;
7120 if (namedclass > OOB_NAMEDCLASS)
7121 optimize_invert = FALSE;
7122 /* Possible truncation here but in some 64-bit environments
7123 * the compiler gets heartburn about switch on 64-bit values.
7124 * A similar issue a little earlier when switching on value.
7126 switch ((I32)namedclass) {
7129 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
7131 for (value = 0; value < 256; value++)
7133 ANYOF_BITMAP_SET(ret, value);
7140 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
7142 for (value = 0; value < 256; value++)
7143 if (!isALNUM(value))
7144 ANYOF_BITMAP_SET(ret, value);
7151 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
7153 for (value = 0; value < 256; value++)
7154 if (isALNUMC(value))
7155 ANYOF_BITMAP_SET(ret, value);
7162 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
7164 for (value = 0; value < 256; value++)
7165 if (!isALNUMC(value))
7166 ANYOF_BITMAP_SET(ret, value);
7173 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
7175 for (value = 0; value < 256; value++)
7177 ANYOF_BITMAP_SET(ret, value);
7184 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
7186 for (value = 0; value < 256; value++)
7187 if (!isALPHA(value))
7188 ANYOF_BITMAP_SET(ret, value);
7195 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
7198 for (value = 0; value < 128; value++)
7199 ANYOF_BITMAP_SET(ret, value);
7201 for (value = 0; value < 256; value++) {
7203 ANYOF_BITMAP_SET(ret, value);
7212 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
7215 for (value = 128; value < 256; value++)
7216 ANYOF_BITMAP_SET(ret, value);
7218 for (value = 0; value < 256; value++) {
7219 if (!isASCII(value))
7220 ANYOF_BITMAP_SET(ret, value);
7229 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
7231 for (value = 0; value < 256; value++)
7233 ANYOF_BITMAP_SET(ret, value);
7240 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
7242 for (value = 0; value < 256; value++)
7243 if (!isBLANK(value))
7244 ANYOF_BITMAP_SET(ret, value);
7251 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
7253 for (value = 0; value < 256; value++)
7255 ANYOF_BITMAP_SET(ret, value);
7262 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
7264 for (value = 0; value < 256; value++)
7265 if (!isCNTRL(value))
7266 ANYOF_BITMAP_SET(ret, value);
7273 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
7275 /* consecutive digits assumed */
7276 for (value = '0'; value <= '9'; value++)
7277 ANYOF_BITMAP_SET(ret, value);
7284 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
7286 /* consecutive digits assumed */
7287 for (value = 0; value < '0'; value++)
7288 ANYOF_BITMAP_SET(ret, value);
7289 for (value = '9' + 1; value < 256; value++)
7290 ANYOF_BITMAP_SET(ret, value);
7297 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
7299 for (value = 0; value < 256; value++)
7301 ANYOF_BITMAP_SET(ret, value);
7308 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
7310 for (value = 0; value < 256; value++)
7311 if (!isGRAPH(value))
7312 ANYOF_BITMAP_SET(ret, value);
7319 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
7321 for (value = 0; value < 256; value++)
7323 ANYOF_BITMAP_SET(ret, value);
7330 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
7332 for (value = 0; value < 256; value++)
7333 if (!isLOWER(value))
7334 ANYOF_BITMAP_SET(ret, value);
7341 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
7343 for (value = 0; value < 256; value++)
7345 ANYOF_BITMAP_SET(ret, value);
7352 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
7354 for (value = 0; value < 256; value++)
7355 if (!isPRINT(value))
7356 ANYOF_BITMAP_SET(ret, value);
7363 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
7365 for (value = 0; value < 256; value++)
7366 if (isPSXSPC(value))
7367 ANYOF_BITMAP_SET(ret, value);
7374 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
7376 for (value = 0; value < 256; value++)
7377 if (!isPSXSPC(value))
7378 ANYOF_BITMAP_SET(ret, value);
7385 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
7387 for (value = 0; value < 256; value++)
7389 ANYOF_BITMAP_SET(ret, value);
7396 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
7398 for (value = 0; value < 256; value++)
7399 if (!isPUNCT(value))
7400 ANYOF_BITMAP_SET(ret, value);
7407 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
7409 for (value = 0; value < 256; value++)
7411 ANYOF_BITMAP_SET(ret, value);
7418 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
7420 for (value = 0; value < 256; value++)
7421 if (!isSPACE(value))
7422 ANYOF_BITMAP_SET(ret, value);
7429 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
7431 for (value = 0; value < 256; value++)
7433 ANYOF_BITMAP_SET(ret, value);
7440 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
7442 for (value = 0; value < 256; value++)
7443 if (!isUPPER(value))
7444 ANYOF_BITMAP_SET(ret, value);
7451 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
7453 for (value = 0; value < 256; value++)
7454 if (isXDIGIT(value))
7455 ANYOF_BITMAP_SET(ret, value);
7462 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
7464 for (value = 0; value < 256; value++)
7465 if (!isXDIGIT(value))
7466 ANYOF_BITMAP_SET(ret, value);
7472 /* this is to handle \p and \P */
7475 vFAIL("Invalid [::] class");
7479 /* Strings such as "+utf8::isWord\n" */
7480 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
7483 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
7486 } /* end of namedclass \blah */
7489 if (prevvalue > (IV)value) /* b-a */ {
7490 const int w = RExC_parse - rangebegin;
7491 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
7492 range = 0; /* not a valid range */
7496 prevvalue = value; /* save the beginning of the range */
7497 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
7498 RExC_parse[1] != ']') {
7501 /* a bad range like \w-, [:word:]- ? */
7502 if (namedclass > OOB_NAMEDCLASS) {
7503 if (ckWARN(WARN_REGEXP)) {
7505 RExC_parse >= rangebegin ?
7506 RExC_parse - rangebegin : 0;
7508 "False [] range \"%*.*s\"",
7512 ANYOF_BITMAP_SET(ret, '-');
7514 range = 1; /* yeah, it's a range! */
7515 continue; /* but do it the next time */
7519 /* now is the next time */
7520 /*stored += (value - prevvalue + 1);*/
7522 if (prevvalue < 256) {
7523 const IV ceilvalue = value < 256 ? value : 255;
7526 /* In EBCDIC [\x89-\x91] should include
7527 * the \x8e but [i-j] should not. */
7528 if (literal_endpoint == 2 &&
7529 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
7530 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
7532 if (isLOWER(prevvalue)) {
7533 for (i = prevvalue; i <= ceilvalue; i++)
7535 ANYOF_BITMAP_SET(ret, i);
7537 for (i = prevvalue; i <= ceilvalue; i++)
7539 ANYOF_BITMAP_SET(ret, i);
7544 for (i = prevvalue; i <= ceilvalue; i++) {
7545 if (!ANYOF_BITMAP_TEST(ret,i)) {
7547 ANYOF_BITMAP_SET(ret, i);
7551 if (value > 255 || UTF) {
7552 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
7553 const UV natvalue = NATIVE_TO_UNI(value);
7554 stored+=2; /* can't optimize this class */
7555 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
7556 if (prevnatvalue < natvalue) { /* what about > ? */
7557 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
7558 prevnatvalue, natvalue);
7560 else if (prevnatvalue == natvalue) {
7561 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
7563 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
7565 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
7567 #ifdef EBCDIC /* RD t/uni/fold ff and 6b */
7568 if (RExC_precomp[0] == ':' &&
7569 RExC_precomp[1] == '[' &&
7570 (f == 0xDF || f == 0x92)) {
7571 f = NATIVE_TO_UNI(f);
7574 /* If folding and foldable and a single
7575 * character, insert also the folded version
7576 * to the charclass. */
7578 #ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
7579 if ((RExC_precomp[0] == ':' &&
7580 RExC_precomp[1] == '[' &&
7582 (value == 0xFB05 || value == 0xFB06))) ?
7583 foldlen == ((STRLEN)UNISKIP(f) - 1) :
7584 foldlen == (STRLEN)UNISKIP(f) )
7586 if (foldlen == (STRLEN)UNISKIP(f))
7588 Perl_sv_catpvf(aTHX_ listsv,
7591 /* Any multicharacter foldings
7592 * require the following transform:
7593 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
7594 * where E folds into "pq" and F folds
7595 * into "rst", all other characters
7596 * fold to single characters. We save
7597 * away these multicharacter foldings,
7598 * to be later saved as part of the
7599 * additional "s" data. */
7602 if (!unicode_alternate)
7603 unicode_alternate = newAV();
7604 sv = newSVpvn((char*)foldbuf, foldlen);
7606 av_push(unicode_alternate, sv);
7610 /* If folding and the value is one of the Greek
7611 * sigmas insert a few more sigmas to make the
7612 * folding rules of the sigmas to work right.
7613 * Note that not all the possible combinations
7614 * are handled here: some of them are handled
7615 * by the standard folding rules, and some of
7616 * them (literal or EXACTF cases) are handled
7617 * during runtime in regexec.c:S_find_byclass(). */
7618 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
7619 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
7620 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
7621 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
7622 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
7624 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
7625 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
7626 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
7631 literal_endpoint = 0;
7635 range = 0; /* this range (if it was one) is done now */
7639 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
7641 RExC_size += ANYOF_CLASS_ADD_SKIP;
7643 RExC_emit += ANYOF_CLASS_ADD_SKIP;
7649 /****** !SIZE_ONLY AFTER HERE *********/
7651 if( stored == 1 && value < 256
7652 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
7654 /* optimize single char class to an EXACT node
7655 but *only* when its not a UTF/high char */
7656 const char * cur_parse= RExC_parse;
7657 RExC_emit = (regnode *)orig_emit;
7658 RExC_parse = (char *)orig_parse;
7659 ret = reg_node(pRExC_state,
7660 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
7661 RExC_parse = (char *)cur_parse;
7662 *STRING(ret)= (char)value;
7664 RExC_emit += STR_SZ(1);
7667 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
7668 if ( /* If the only flag is folding (plus possibly inversion). */
7669 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
7671 for (value = 0; value < 256; ++value) {
7672 if (ANYOF_BITMAP_TEST(ret, value)) {
7673 UV fold = PL_fold[value];
7676 ANYOF_BITMAP_SET(ret, fold);
7679 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
7682 /* optimize inverted simple patterns (e.g. [^a-z]) */
7683 if (optimize_invert &&
7684 /* If the only flag is inversion. */
7685 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
7686 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
7687 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
7688 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
7691 AV * const av = newAV();
7693 /* The 0th element stores the character class description
7694 * in its textual form: used later (regexec.c:Perl_regclass_swash())
7695 * to initialize the appropriate swash (which gets stored in
7696 * the 1st element), and also useful for dumping the regnode.
7697 * The 2nd element stores the multicharacter foldings,
7698 * used later (regexec.c:S_reginclass()). */
7699 av_store(av, 0, listsv);
7700 av_store(av, 1, NULL);
7701 av_store(av, 2, (SV*)unicode_alternate);
7702 rv = newRV_noinc((SV*)av);
7703 n = add_data(pRExC_state, 1, "s");
7704 RExC_rxi->data->data[n] = (void*)rv;
7711 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
7713 char* const retval = RExC_parse++;
7716 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
7717 RExC_parse[2] == '#') {
7718 while (*RExC_parse != ')') {
7719 if (RExC_parse == RExC_end)
7720 FAIL("Sequence (?#... not terminated");
7726 if (RExC_flags & RXf_PMf_EXTENDED) {
7727 if (isSPACE(*RExC_parse)) {
7731 else if (*RExC_parse == '#') {
7732 while (RExC_parse < RExC_end)
7733 if (*RExC_parse++ == '\n') break;
7742 - reg_node - emit a node
7744 STATIC regnode * /* Location. */
7745 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
7748 register regnode *ptr;
7749 regnode * const ret = RExC_emit;
7750 GET_RE_DEBUG_FLAGS_DECL;
7753 SIZE_ALIGN(RExC_size);
7758 if (OP(RExC_emit) == 255)
7759 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %s: %d ",
7760 reg_name[op], OP(RExC_emit));
7762 NODE_ALIGN_FILL(ret);
7764 FILL_ADVANCE_NODE(ptr, op);
7765 if (RExC_offsets) { /* MJD */
7766 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
7767 "reg_node", __LINE__,
7769 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
7770 ? "Overwriting end of array!\n" : "OK",
7771 (UV)(RExC_emit - RExC_emit_start),
7772 (UV)(RExC_parse - RExC_start),
7773 (UV)RExC_offsets[0]));
7774 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
7782 - reganode - emit a node with an argument
7784 STATIC regnode * /* Location. */
7785 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
7788 register regnode *ptr;
7789 regnode * const ret = RExC_emit;
7790 GET_RE_DEBUG_FLAGS_DECL;
7793 SIZE_ALIGN(RExC_size);
7798 assert(2==regarglen[op]+1);
7800 Anything larger than this has to allocate the extra amount.
7801 If we changed this to be:
7803 RExC_size += (1 + regarglen[op]);
7805 then it wouldn't matter. Its not clear what side effect
7806 might come from that so its not done so far.
7812 if (OP(RExC_emit) == 255)
7813 Perl_croak(aTHX_ "panic: reganode overwriting end of allocated program space");
7815 NODE_ALIGN_FILL(ret);
7817 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
7818 if (RExC_offsets) { /* MJD */
7819 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
7823 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
7824 "Overwriting end of array!\n" : "OK",
7825 (UV)(RExC_emit - RExC_emit_start),
7826 (UV)(RExC_parse - RExC_start),
7827 (UV)RExC_offsets[0]));
7828 Set_Cur_Node_Offset;
7836 - reguni - emit (if appropriate) a Unicode character
7839 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
7842 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
7846 - reginsert - insert an operator in front of already-emitted operand
7848 * Means relocating the operand.
7851 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
7854 register regnode *src;
7855 register regnode *dst;
7856 register regnode *place;
7857 const int offset = regarglen[(U8)op];
7858 const int size = NODE_STEP_REGNODE + offset;
7859 GET_RE_DEBUG_FLAGS_DECL;
7860 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
7861 DEBUG_PARSE_FMT("inst"," - %s",reg_name[op]);
7870 if (RExC_open_parens) {
7872 DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);
7873 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
7874 if ( RExC_open_parens[paren] >= opnd ) {
7875 DEBUG_PARSE_FMT("open"," - %d",size);
7876 RExC_open_parens[paren] += size;
7878 DEBUG_PARSE_FMT("open"," - %s","ok");
7880 if ( RExC_close_parens[paren] >= opnd ) {
7881 DEBUG_PARSE_FMT("close"," - %d",size);
7882 RExC_close_parens[paren] += size;
7884 DEBUG_PARSE_FMT("close"," - %s","ok");
7889 while (src > opnd) {
7890 StructCopy(--src, --dst, regnode);
7891 if (RExC_offsets) { /* MJD 20010112 */
7892 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
7896 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
7897 ? "Overwriting end of array!\n" : "OK",
7898 (UV)(src - RExC_emit_start),
7899 (UV)(dst - RExC_emit_start),
7900 (UV)RExC_offsets[0]));
7901 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
7902 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
7907 place = opnd; /* Op node, where operand used to be. */
7908 if (RExC_offsets) { /* MJD */
7909 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
7913 (UV)(place - RExC_emit_start) > RExC_offsets[0]
7914 ? "Overwriting end of array!\n" : "OK",
7915 (UV)(place - RExC_emit_start),
7916 (UV)(RExC_parse - RExC_start),
7917 (UV)RExC_offsets[0]));
7918 Set_Node_Offset(place, RExC_parse);
7919 Set_Node_Length(place, 1);
7921 src = NEXTOPER(place);
7922 FILL_ADVANCE_NODE(place, op);
7923 Zero(src, offset, regnode);
7927 - regtail - set the next-pointer at the end of a node chain of p to val.
7928 - SEE ALSO: regtail_study
7930 /* TODO: All three parms should be const */
7932 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
7935 register regnode *scan;
7936 GET_RE_DEBUG_FLAGS_DECL;
7938 PERL_UNUSED_ARG(depth);
7944 /* Find last node. */
7947 regnode * const temp = regnext(scan);
7949 SV * const mysv=sv_newmortal();
7950 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
7951 regprop(RExC_rx, mysv, scan);
7952 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
7953 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
7954 (temp == NULL ? "->" : ""),
7955 (temp == NULL ? reg_name[OP(val)] : "")
7963 if (reg_off_by_arg[OP(scan)]) {
7964 ARG_SET(scan, val - scan);
7967 NEXT_OFF(scan) = val - scan;
7973 - regtail_study - set the next-pointer at the end of a node chain of p to val.
7974 - Look for optimizable sequences at the same time.
7975 - currently only looks for EXACT chains.
7977 This is expermental code. The idea is to use this routine to perform
7978 in place optimizations on branches and groups as they are constructed,
7979 with the long term intention of removing optimization from study_chunk so
7980 that it is purely analytical.
7982 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
7983 to control which is which.
7986 /* TODO: All four parms should be const */
7989 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
7992 register regnode *scan;
7994 #ifdef EXPERIMENTAL_INPLACESCAN
7998 GET_RE_DEBUG_FLAGS_DECL;
8004 /* Find last node. */
8008 regnode * const temp = regnext(scan);
8009 #ifdef EXPERIMENTAL_INPLACESCAN
8010 if (PL_regkind[OP(scan)] == EXACT)
8011 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
8019 if( exact == PSEUDO )
8021 else if ( exact != OP(scan) )
8030 SV * const mysv=sv_newmortal();
8031 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
8032 regprop(RExC_rx, mysv, scan);
8033 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
8034 SvPV_nolen_const(mysv),
8043 SV * const mysv_val=sv_newmortal();
8044 DEBUG_PARSE_MSG("");
8045 regprop(RExC_rx, mysv_val, val);
8046 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
8047 SvPV_nolen_const(mysv_val),
8048 (IV)REG_NODE_NUM(val),
8052 if (reg_off_by_arg[OP(scan)]) {
8053 ARG_SET(scan, val - scan);
8056 NEXT_OFF(scan) = val - scan;
8064 - regcurly - a little FSA that accepts {\d+,?\d*}
8067 S_regcurly(register const char *s)
8086 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
8089 Perl_regdump(pTHX_ const regexp *r)
8093 SV * const sv = sv_newmortal();
8094 SV *dsv= sv_newmortal();
8097 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
8099 /* Header fields of interest. */
8100 if (r->anchored_substr) {
8101 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
8102 RE_SV_DUMPLEN(r->anchored_substr), 30);
8103 PerlIO_printf(Perl_debug_log,
8104 "anchored %s%s at %"IVdf" ",
8105 s, RE_SV_TAIL(r->anchored_substr),
8106 (IV)r->anchored_offset);
8107 } else if (r->anchored_utf8) {
8108 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
8109 RE_SV_DUMPLEN(r->anchored_utf8), 30);
8110 PerlIO_printf(Perl_debug_log,
8111 "anchored utf8 %s%s at %"IVdf" ",
8112 s, RE_SV_TAIL(r->anchored_utf8),
8113 (IV)r->anchored_offset);
8115 if (r->float_substr) {
8116 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
8117 RE_SV_DUMPLEN(r->float_substr), 30);
8118 PerlIO_printf(Perl_debug_log,
8119 "floating %s%s at %"IVdf"..%"UVuf" ",
8120 s, RE_SV_TAIL(r->float_substr),
8121 (IV)r->float_min_offset, (UV)r->float_max_offset);
8122 } else if (r->float_utf8) {
8123 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
8124 RE_SV_DUMPLEN(r->float_utf8), 30);
8125 PerlIO_printf(Perl_debug_log,
8126 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
8127 s, RE_SV_TAIL(r->float_utf8),
8128 (IV)r->float_min_offset, (UV)r->float_max_offset);
8130 if (r->check_substr || r->check_utf8)
8131 PerlIO_printf(Perl_debug_log,
8133 (r->check_substr == r->float_substr
8134 && r->check_utf8 == r->float_utf8
8135 ? "(checking floating" : "(checking anchored"));
8136 if (r->extflags & RXf_NOSCAN)
8137 PerlIO_printf(Perl_debug_log, " noscan");
8138 if (r->extflags & RXf_CHECK_ALL)
8139 PerlIO_printf(Perl_debug_log, " isall");
8140 if (r->check_substr || r->check_utf8)
8141 PerlIO_printf(Perl_debug_log, ") ");
8143 if (ri->regstclass) {
8144 regprop(r, sv, ri->regstclass);
8145 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
8147 if (r->extflags & RXf_ANCH) {
8148 PerlIO_printf(Perl_debug_log, "anchored");
8149 if (r->extflags & RXf_ANCH_BOL)
8150 PerlIO_printf(Perl_debug_log, "(BOL)");
8151 if (r->extflags & RXf_ANCH_MBOL)
8152 PerlIO_printf(Perl_debug_log, "(MBOL)");
8153 if (r->extflags & RXf_ANCH_SBOL)
8154 PerlIO_printf(Perl_debug_log, "(SBOL)");
8155 if (r->extflags & RXf_ANCH_GPOS)
8156 PerlIO_printf(Perl_debug_log, "(GPOS)");
8157 PerlIO_putc(Perl_debug_log, ' ');
8159 if (r->extflags & RXf_GPOS_SEEN)
8160 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
8161 if (r->intflags & PREGf_SKIP)
8162 PerlIO_printf(Perl_debug_log, "plus ");
8163 if (r->intflags & PREGf_IMPLICIT)
8164 PerlIO_printf(Perl_debug_log, "implicit ");
8165 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
8166 if (r->extflags & RXf_EVAL_SEEN)
8167 PerlIO_printf(Perl_debug_log, "with eval ");
8168 PerlIO_printf(Perl_debug_log, "\n");
8170 PERL_UNUSED_CONTEXT;
8172 #endif /* DEBUGGING */
8176 - regprop - printable representation of opcode
8179 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
8184 RXi_GET_DECL(prog,progi);
8185 GET_RE_DEBUG_FLAGS_DECL;
8188 sv_setpvn(sv, "", 0);
8190 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
8191 /* It would be nice to FAIL() here, but this may be called from
8192 regexec.c, and it would be hard to supply pRExC_state. */
8193 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
8194 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
8196 k = PL_regkind[OP(o)];
8199 SV * const dsv = sv_2mortal(newSVpvs(""));
8200 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
8201 * is a crude hack but it may be the best for now since
8202 * we have no flag "this EXACTish node was UTF-8"
8204 const char * const s =
8205 pv_pretty(dsv, STRING(o), STR_LEN(o), 60,
8206 PL_colors[0], PL_colors[1],
8207 PERL_PV_ESCAPE_UNI_DETECT |
8208 PERL_PV_PRETTY_ELIPSES |
8211 Perl_sv_catpvf(aTHX_ sv, " %s", s );
8212 } else if (k == TRIE) {
8213 /* print the details of the trie in dumpuntil instead, as
8214 * progi->data isn't available here */
8215 const char op = OP(o);
8216 const I32 n = ARG(o);
8217 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
8218 (reg_ac_data *)progi->data->data[n] :
8220 const reg_trie_data * const trie = !IS_TRIE_AC(op) ?
8221 (reg_trie_data*)progi->data->data[n] :
8224 Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]);
8225 DEBUG_TRIE_COMPILE_r(
8226 Perl_sv_catpvf(aTHX_ sv,
8227 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
8228 (UV)trie->startstate,
8229 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
8230 (UV)trie->wordcount,
8233 (UV)TRIE_CHARCOUNT(trie),
8234 (UV)trie->uniquecharcount
8237 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
8239 int rangestart = -1;
8240 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
8241 Perl_sv_catpvf(aTHX_ sv, "[");
8242 for (i = 0; i <= 256; i++) {
8243 if (i < 256 && BITMAP_TEST(bitmap,i)) {
8244 if (rangestart == -1)
8246 } else if (rangestart != -1) {
8247 if (i <= rangestart + 3)
8248 for (; rangestart < i; rangestart++)
8249 put_byte(sv, rangestart);
8251 put_byte(sv, rangestart);
8253 put_byte(sv, i - 1);
8258 Perl_sv_catpvf(aTHX_ sv, "]");
8261 } else if (k == CURLY) {
8262 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
8263 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
8264 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
8266 else if (k == WHILEM && o->flags) /* Ordinal/of */
8267 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
8268 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT)
8269 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
8270 else if (k == GOSUB)
8271 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
8272 else if (k == VERB) {
8274 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
8275 (SV*)progi->data->data[ ARG( o ) ]);
8276 } else if (k == LOGICAL)
8277 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
8278 else if (k == ANYOF) {
8279 int i, rangestart = -1;
8280 const U8 flags = ANYOF_FLAGS(o);
8282 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
8283 static const char * const anyofs[] = {
8316 if (flags & ANYOF_LOCALE)
8317 sv_catpvs(sv, "{loc}");
8318 if (flags & ANYOF_FOLD)
8319 sv_catpvs(sv, "{i}");
8320 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
8321 if (flags & ANYOF_INVERT)
8323 for (i = 0; i <= 256; i++) {
8324 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
8325 if (rangestart == -1)
8327 } else if (rangestart != -1) {
8328 if (i <= rangestart + 3)
8329 for (; rangestart < i; rangestart++)
8330 put_byte(sv, rangestart);
8332 put_byte(sv, rangestart);
8334 put_byte(sv, i - 1);
8340 if (o->flags & ANYOF_CLASS)
8341 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
8342 if (ANYOF_CLASS_TEST(o,i))
8343 sv_catpv(sv, anyofs[i]);
8345 if (flags & ANYOF_UNICODE)
8346 sv_catpvs(sv, "{unicode}");
8347 else if (flags & ANYOF_UNICODE_ALL)
8348 sv_catpvs(sv, "{unicode_all}");
8352 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
8356 U8 s[UTF8_MAXBYTES_CASE+1];
8358 for (i = 0; i <= 256; i++) { /* just the first 256 */
8359 uvchr_to_utf8(s, i);
8361 if (i < 256 && swash_fetch(sw, s, TRUE)) {
8362 if (rangestart == -1)
8364 } else if (rangestart != -1) {
8365 if (i <= rangestart + 3)
8366 for (; rangestart < i; rangestart++) {
8367 const U8 * const e = uvchr_to_utf8(s,rangestart);
8369 for(p = s; p < e; p++)
8373 const U8 *e = uvchr_to_utf8(s,rangestart);
8375 for (p = s; p < e; p++)
8378 e = uvchr_to_utf8(s, i-1);
8379 for (p = s; p < e; p++)
8386 sv_catpvs(sv, "..."); /* et cetera */
8390 char *s = savesvpv(lv);
8391 char * const origs = s;
8393 while (*s && *s != '\n')
8397 const char * const t = ++s;
8415 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
8417 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
8418 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
8420 PERL_UNUSED_CONTEXT;
8421 PERL_UNUSED_ARG(sv);
8423 PERL_UNUSED_ARG(prog);
8424 #endif /* DEBUGGING */
8428 Perl_re_intuit_string(pTHX_ regexp *prog)
8429 { /* Assume that RE_INTUIT is set */
8431 GET_RE_DEBUG_FLAGS_DECL;
8432 PERL_UNUSED_CONTEXT;
8436 const char * const s = SvPV_nolen_const(prog->check_substr
8437 ? prog->check_substr : prog->check_utf8);
8439 if (!PL_colorset) reginitcolors();
8440 PerlIO_printf(Perl_debug_log,
8441 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
8443 prog->check_substr ? "" : "utf8 ",
8444 PL_colors[5],PL_colors[0],
8447 (strlen(s) > 60 ? "..." : ""));
8450 return prog->check_substr ? prog->check_substr : prog->check_utf8;
8454 pregfree - free a regexp
8456 See regdupe below if you change anything here.
8460 Perl_pregfree(pTHX_ struct regexp *r)
8464 GET_RE_DEBUG_FLAGS_DECL;
8466 if (!r || (--r->refcnt > 0))
8472 SV *dsv= sv_newmortal();
8473 RE_PV_QUOTED_DECL(s, (r->extflags & RXf_UTF8),
8474 dsv, r->precomp, r->prelen, 60);
8475 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
8476 PL_colors[4],PL_colors[5],s);
8480 /* gcov results gave these as non-null 100% of the time, so there's no
8481 optimisation in checking them before calling Safefree */
8482 Safefree(r->precomp);
8483 Safefree(ri->offsets); /* 20010421 MJD */
8484 RX_MATCH_COPY_FREE(r);
8485 #ifdef PERL_OLD_COPY_ON_WRITE
8487 SvREFCNT_dec(r->saved_copy);
8490 if (r->anchored_substr)
8491 SvREFCNT_dec(r->anchored_substr);
8492 if (r->anchored_utf8)
8493 SvREFCNT_dec(r->anchored_utf8);
8494 if (r->float_substr)
8495 SvREFCNT_dec(r->float_substr);
8497 SvREFCNT_dec(r->float_utf8);
8498 Safefree(r->substrs);
8501 SvREFCNT_dec(r->paren_names);
8503 int n = ri->data->count;
8504 PAD* new_comppad = NULL;
8509 /* If you add a ->what type here, update the comment in regcomp.h */
8510 switch (ri->data->what[n]) {
8513 SvREFCNT_dec((SV*)ri->data->data[n]);
8516 Safefree(ri->data->data[n]);
8519 new_comppad = (AV*)ri->data->data[n];
8522 if (new_comppad == NULL)
8523 Perl_croak(aTHX_ "panic: pregfree comppad");
8524 PAD_SAVE_LOCAL(old_comppad,
8525 /* Watch out for global destruction's random ordering. */
8526 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
8529 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
8532 op_free((OP_4tree*)ri->data->data[n]);
8534 PAD_RESTORE_LOCAL(old_comppad);
8535 SvREFCNT_dec((SV*)new_comppad);
8541 { /* Aho Corasick add-on structure for a trie node.
8542 Used in stclass optimization only */
8544 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
8546 refcount = --aho->refcount;
8549 Safefree(aho->states);
8550 Safefree(aho->fail);
8551 aho->trie=NULL; /* not necessary to free this as it is
8552 handled by the 't' case */
8553 Safefree(ri->data->data[n]); /* do this last!!!! */
8554 Safefree(ri->regstclass);
8560 /* trie structure. */
8562 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
8564 refcount = --trie->refcount;
8567 Safefree(trie->charmap);
8568 if (trie->widecharmap)
8569 SvREFCNT_dec((SV*)trie->widecharmap);
8570 Safefree(trie->states);
8571 Safefree(trie->trans);
8573 Safefree(trie->bitmap);
8575 Safefree(trie->wordlen);
8577 Safefree(trie->jump);
8579 Safefree(trie->nextword);
8582 SvREFCNT_dec((SV*)trie->words);
8583 if (trie->revcharmap)
8584 SvREFCNT_dec((SV*)trie->revcharmap);
8586 Safefree(ri->data->data[n]); /* do this last!!!! */
8591 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
8594 Safefree(ri->data->what);
8597 Safefree(r->startp);
8600 Safefree(ri->swap->startp);
8601 Safefree(ri->swap->endp);
8608 #define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t))
8609 #define av_dup_inc(s,t) (AV*)SvREFCNT_inc(sv_dup((SV*)s,t))
8610 #define hv_dup_inc(s,t) (HV*)SvREFCNT_inc(sv_dup((SV*)s,t))
8611 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
8614 regdupe - duplicate a regexp.
8616 This routine is called by sv.c's re_dup and is expected to clone a
8617 given regexp structure. It is a no-op when not under USE_ITHREADS.
8618 (Originally this *was* re_dup() for change history see sv.c)
8620 See pregfree() above if you change anything here.
8622 #if defined(USE_ITHREADS)
8624 Perl_regdupe(pTHX_ const regexp *r, CLONE_PARAMS *param)
8628 regexp_internal *reti;
8630 struct reg_substr_datum *s;
8634 return (REGEXP *)NULL;
8636 if ((ret = (REGEXP *)ptr_table_fetch(PL_ptr_table, r)))
8639 len = ri->offsets[0];
8640 npar = r->nparens+1;
8642 Newxz(ret, 1, regexp);
8643 Newxc(reti, sizeof(regexp_internal) + (len+1)*sizeof(regnode), char, regexp_internal);
8645 Copy(ri->program, reti->program, len+1, regnode);
8647 Newx(ret->startp, npar, I32);
8648 Copy(r->startp, ret->startp, npar, I32);
8649 Newx(ret->endp, npar, I32);
8650 Copy(r->startp, ret->startp, npar, I32);
8652 Newx(reti->swap, 1, regexp_paren_ofs);
8653 /* no need to copy these */
8654 Newx(reti->swap->startp, npar, I32);
8655 Newx(reti->swap->endp, npar, I32);
8660 Newx(ret->substrs, 1, struct reg_substr_data);
8661 for (s = ret->substrs->data, i = 0; i < 3; i++, s++) {
8662 s->min_offset = r->substrs->data[i].min_offset;
8663 s->max_offset = r->substrs->data[i].max_offset;
8664 s->end_shift = r->substrs->data[i].end_shift;
8665 s->substr = sv_dup_inc(r->substrs->data[i].substr, param);
8666 s->utf8_substr = sv_dup_inc(r->substrs->data[i].utf8_substr, param);
8669 reti->regstclass = NULL;
8672 const int count = ri->data->count;
8675 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
8676 char, struct reg_data);
8677 Newx(d->what, count, U8);
8680 for (i = 0; i < count; i++) {
8681 d->what[i] = ri->data->what[i];
8682 switch (d->what[i]) {
8683 /* legal options are one of: sSfpontT
8684 see also regcomp.h and pregfree() */
8687 d->data[i] = sv_dup_inc((SV *)ri->data->data[i], param);
8690 d->data[i] = av_dup_inc((AV *)ri->data->data[i], param);
8693 /* This is cheating. */
8694 Newx(d->data[i], 1, struct regnode_charclass_class);
8695 StructCopy(ri->data->data[i], d->data[i],
8696 struct regnode_charclass_class);
8697 reti->regstclass = (regnode*)d->data[i];
8700 /* Compiled op trees are readonly and in shared memory,
8701 and can thus be shared without duplication. */
8703 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
8707 d->data[i] = ri->data->data[i];
8710 d->data[i] = ri->data->data[i];
8712 ((reg_trie_data*)d->data[i])->refcount++;
8716 d->data[i] = ri->data->data[i];
8718 ((reg_ac_data*)d->data[i])->refcount++;
8720 /* Trie stclasses are readonly and can thus be shared
8721 * without duplication. We free the stclass in pregfree
8722 * when the corresponding reg_ac_data struct is freed.
8724 reti->regstclass= ri->regstclass;
8727 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
8736 Newx(reti->offsets, 2*len+1, U32);
8737 Copy(ri->offsets, reti->offsets, 2*len+1, U32);
8739 ret->precomp = SAVEPVN(r->precomp, r->prelen);
8740 ret->refcnt = r->refcnt;
8741 ret->minlen = r->minlen;
8742 ret->minlenret = r->minlenret;
8743 ret->prelen = r->prelen;
8744 ret->nparens = r->nparens;
8745 ret->lastparen = r->lastparen;
8746 ret->lastcloseparen = r->lastcloseparen;
8747 ret->intflags = r->intflags;
8748 ret->extflags = r->extflags;
8750 ret->sublen = r->sublen;
8752 ret->engine = r->engine;
8754 ret->paren_names = hv_dup_inc(r->paren_names, param);
8756 if (RX_MATCH_COPIED(ret))
8757 ret->subbeg = SAVEPVN(r->subbeg, r->sublen);
8760 #ifdef PERL_OLD_COPY_ON_WRITE
8761 ret->saved_copy = NULL;
8764 ptr_table_store(PL_ptr_table, r, ret);
8772 converts a regexp embedded in a MAGIC struct to its stringified form,
8773 caching the converted form in the struct and returns the cached
8776 If lp is nonnull then it is used to return the length of the
8779 If flags is nonnull and the returned string contains UTF8 then
8780 (flags & 1) will be true.
8782 If haseval is nonnull then it is used to return whether the pattern
8785 Normally called via macro:
8787 CALLREG_STRINGIFY(mg,0,0);
8791 CALLREG_AS_STR(mg,lp,flags,haseval)
8793 See sv_2pv_flags() in sv.c for an example of internal usage.
8798 Perl_reg_stringify(pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval ) {
8800 const regexp * const re = (regexp *)mg->mg_obj;
8801 RXi_GET_DECL(re,ri);
8804 const char *fptr = "msix";
8809 bool need_newline = 0;
8810 U16 reganch = (U16)((re->extflags & RXf_PMf_COMPILETIME) >> 12);
8812 while((ch = *fptr++)) {
8814 reflags[left++] = ch;
8817 reflags[right--] = ch;
8822 reflags[left] = '-';
8826 mg->mg_len = re->prelen + 4 + left;
8828 * If /x was used, we have to worry about a regex ending with a
8829 * comment later being embedded within another regex. If so, we don't
8830 * want this regex's "commentization" to leak out to the right part of
8831 * the enclosing regex, we must cap it with a newline.
8833 * So, if /x was used, we scan backwards from the end of the regex. If
8834 * we find a '#' before we find a newline, we need to add a newline
8835 * ourself. If we find a '\n' first (or if we don't find '#' or '\n'),
8836 * we don't need to add anything. -jfriedl
8838 if (PMf_EXTENDED & re->extflags) {
8839 const char *endptr = re->precomp + re->prelen;
8840 while (endptr >= re->precomp) {
8841 const char c = *(endptr--);
8843 break; /* don't need another */
8845 /* we end while in a comment, so we need a newline */
8846 mg->mg_len++; /* save space for it */
8847 need_newline = 1; /* note to add it */
8853 Newx(mg->mg_ptr, mg->mg_len + 1 + left, char);
8854 mg->mg_ptr[0] = '(';
8855 mg->mg_ptr[1] = '?';
8856 Copy(reflags, mg->mg_ptr+2, left, char);
8857 *(mg->mg_ptr+left+2) = ':';
8858 Copy(re->precomp, mg->mg_ptr+3+left, re->prelen, char);
8860 mg->mg_ptr[mg->mg_len - 2] = '\n';
8861 mg->mg_ptr[mg->mg_len - 1] = ')';
8862 mg->mg_ptr[mg->mg_len] = 0;
8865 *haseval = ri->program[0].next_off;
8867 *flags = ((re->extflags & RXf_UTF8) ? 1 : 0);
8875 #ifndef PERL_IN_XSUB_RE
8877 - regnext - dig the "next" pointer out of a node
8880 Perl_regnext(pTHX_ register regnode *p)
8883 register I32 offset;
8888 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
8897 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
8900 STRLEN l1 = strlen(pat1);
8901 STRLEN l2 = strlen(pat2);
8904 const char *message;
8910 Copy(pat1, buf, l1 , char);
8911 Copy(pat2, buf + l1, l2 , char);
8912 buf[l1 + l2] = '\n';
8913 buf[l1 + l2 + 1] = '\0';
8915 /* ANSI variant takes additional second argument */
8916 va_start(args, pat2);
8920 msv = vmess(buf, &args);
8922 message = SvPV_const(msv,l1);
8925 Copy(message, buf, l1 , char);
8926 buf[l1-1] = '\0'; /* Overwrite \n */
8927 Perl_croak(aTHX_ "%s", buf);
8930 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
8932 #ifndef PERL_IN_XSUB_RE
8934 Perl_save_re_context(pTHX)
8938 struct re_save_state *state;
8940 SAVEVPTR(PL_curcop);
8941 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
8943 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
8944 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
8945 SSPUSHINT(SAVEt_RE_STATE);
8947 Copy(&PL_reg_state, state, 1, struct re_save_state);
8949 PL_reg_start_tmp = 0;
8950 PL_reg_start_tmpl = 0;
8951 PL_reg_oldsaved = NULL;
8952 PL_reg_oldsavedlen = 0;
8954 PL_reg_leftiter = 0;
8955 PL_reg_poscache = NULL;
8956 PL_reg_poscache_size = 0;
8957 #ifdef PERL_OLD_COPY_ON_WRITE
8961 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
8963 const REGEXP * const rx = PM_GETRE(PL_curpm);
8966 for (i = 1; i <= rx->nparens; i++) {
8967 char digits[TYPE_CHARS(long)];
8968 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
8969 GV *const *const gvp
8970 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
8973 GV * const gv = *gvp;
8974 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
8984 clear_re(pTHX_ void *r)
8987 ReREFCNT_dec((regexp *)r);
8993 S_put_byte(pTHX_ SV *sv, int c)
8995 if (isCNTRL(c) || c == 255 || !isPRINT(c))
8996 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
8997 else if (c == '-' || c == ']' || c == '\\' || c == '^')
8998 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
9000 Perl_sv_catpvf(aTHX_ sv, "%c", c);
9004 #define CLEAR_OPTSTART \
9005 if (optstart) STMT_START { \
9006 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
9010 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
9012 STATIC const regnode *
9013 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
9014 const regnode *last, const regnode *plast,
9015 SV* sv, I32 indent, U32 depth)
9018 register U8 op = PSEUDO; /* Arbitrary non-END op. */
9019 register const regnode *next;
9020 const regnode *optstart= NULL;
9022 GET_RE_DEBUG_FLAGS_DECL;
9024 #ifdef DEBUG_DUMPUNTIL
9025 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
9026 last ? last-start : 0,plast ? plast-start : 0);
9029 if (plast && plast < last)
9032 while (PL_regkind[op] != END && (!last || node < last)) {
9033 /* While that wasn't END last time... */
9037 if (op == CLOSE || op == WHILEM)
9039 next = regnext((regnode *)node);
9042 if (OP(node) == OPTIMIZED) {
9043 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
9050 regprop(r, sv, node);
9051 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
9052 (int)(2*indent + 1), "", SvPVX_const(sv));
9054 if (OP(node) != OPTIMIZED) {
9055 if (next == NULL) /* Next ptr. */
9056 PerlIO_printf(Perl_debug_log, "(0)");
9057 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
9058 PerlIO_printf(Perl_debug_log, "(FAIL)");
9060 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
9062 /*if (PL_regkind[(U8)op] != TRIE)*/
9063 (void)PerlIO_putc(Perl_debug_log, '\n');
9067 if (PL_regkind[(U8)op] == BRANCHJ) {
9070 register const regnode *nnode = (OP(next) == LONGJMP
9071 ? regnext((regnode *)next)
9073 if (last && nnode > last)
9075 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
9078 else if (PL_regkind[(U8)op] == BRANCH) {
9080 DUMPUNTIL(NEXTOPER(node), next);
9082 else if ( PL_regkind[(U8)op] == TRIE ) {
9083 const regnode *this_trie = node;
9084 const char op = OP(node);
9085 const I32 n = ARG(node);
9086 const reg_ac_data * const ac = op>=AHOCORASICK ?
9087 (reg_ac_data *)ri->data->data[n] :
9089 const reg_trie_data * const trie = op<AHOCORASICK ?
9090 (reg_trie_data*)ri->data->data[n] :
9092 const regnode *nextbranch= NULL;
9094 sv_setpvn(sv, "", 0);
9095 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
9096 SV ** const elem_ptr = av_fetch(trie->words,word_idx,0);
9098 PerlIO_printf(Perl_debug_log, "%*s%s ",
9099 (int)(2*(indent+3)), "",
9100 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
9101 PL_colors[0], PL_colors[1],
9102 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
9103 PERL_PV_PRETTY_ELIPSES |
9109 U16 dist= trie->jump[word_idx+1];
9110 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
9111 (UV)((dist ? this_trie + dist : next) - start));
9114 nextbranch= this_trie + trie->jump[0];
9115 DUMPUNTIL(this_trie + dist, nextbranch);
9117 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
9118 nextbranch= regnext((regnode *)nextbranch);
9120 PerlIO_printf(Perl_debug_log, "\n");
9123 if (last && next > last)
9128 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
9129 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
9130 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
9132 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
9134 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
9136 else if ( op == PLUS || op == STAR) {
9137 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
9139 else if (op == ANYOF) {
9140 /* arglen 1 + class block */
9141 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
9142 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
9143 node = NEXTOPER(node);
9145 else if (PL_regkind[(U8)op] == EXACT) {
9146 /* Literal string, where present. */
9147 node += NODE_SZ_STR(node) - 1;
9148 node = NEXTOPER(node);
9151 node = NEXTOPER(node);
9152 node += regarglen[(U8)op];
9154 if (op == CURLYX || op == OPEN)
9158 #ifdef DEBUG_DUMPUNTIL
9159 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
9164 #endif /* DEBUGGING */
9168 * c-indentation-style: bsd
9170 * indent-tabs-mode: t
9173 * ex: set ts=8 sts=4 sw=4 noet: