5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
8 /* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
37 * pregcomp and pregexec -- regsub and regerror are not used in perl
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
57 **** Alterations to Henry's code are...
59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
71 #define PERL_IN_REGCOMP_C
74 #ifndef PERL_IN_XSUB_RE
79 #ifdef PERL_IN_XSUB_RE
90 # if defined(BUGGY_MSC6)
91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
92 # pragma optimize("a",off)
93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
94 # pragma optimize("w",on )
95 # endif /* BUGGY_MSC6 */
102 typedef struct RExC_state_t {
103 U32 flags; /* are we folding, multilining? */
104 char *precomp; /* uncompiled string. */
105 regexp *rx; /* perl core regexp structure */
106 regexp_internal *rxi; /* internal data for regexp object pprivate field */
107 char *start; /* Start of input for compile */
108 char *end; /* End of input for compile */
109 char *parse; /* Input-scan pointer. */
110 I32 whilem_seen; /* number of WHILEM in this expr */
111 regnode *emit_start; /* Start of emitted-code area */
112 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
113 I32 naughty; /* How bad is this pattern? */
114 I32 sawback; /* Did we see \1, ...? */
116 I32 size; /* Code size. */
117 I32 npar; /* Capture buffer count, (OPEN). */
118 I32 cpar; /* Capture buffer count, (CLOSE). */
119 I32 nestroot; /* root parens we are in - used by accept */
123 regnode **open_parens; /* pointers to open parens */
124 regnode **close_parens; /* pointers to close parens */
125 regnode *opend; /* END node in program */
127 HV *charnames; /* cache of named sequences */
128 HV *paren_names; /* Paren names */
129 regnode **recurse; /* Recurse regops */
130 I32 recurse_count; /* Number of recurse regops */
132 char *starttry; /* -Dr: where regtry was called. */
133 #define RExC_starttry (pRExC_state->starttry)
136 const char *lastparse;
138 #define RExC_lastparse (pRExC_state->lastparse)
139 #define RExC_lastnum (pRExC_state->lastnum)
143 #define RExC_flags (pRExC_state->flags)
144 #define RExC_precomp (pRExC_state->precomp)
145 #define RExC_rx (pRExC_state->rx)
146 #define RExC_rxi (pRExC_state->rxi)
147 #define RExC_start (pRExC_state->start)
148 #define RExC_end (pRExC_state->end)
149 #define RExC_parse (pRExC_state->parse)
150 #define RExC_whilem_seen (pRExC_state->whilem_seen)
151 #define RExC_offsets (pRExC_state->rxi->offsets) /* I am not like the others */
152 #define RExC_emit (pRExC_state->emit)
153 #define RExC_emit_start (pRExC_state->emit_start)
154 #define RExC_naughty (pRExC_state->naughty)
155 #define RExC_sawback (pRExC_state->sawback)
156 #define RExC_seen (pRExC_state->seen)
157 #define RExC_size (pRExC_state->size)
158 #define RExC_npar (pRExC_state->npar)
159 #define RExC_cpar (pRExC_state->cpar)
160 #define RExC_nestroot (pRExC_state->nestroot)
161 #define RExC_extralen (pRExC_state->extralen)
162 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
163 #define RExC_seen_evals (pRExC_state->seen_evals)
164 #define RExC_utf8 (pRExC_state->utf8)
165 #define RExC_charnames (pRExC_state->charnames)
166 #define RExC_open_parens (pRExC_state->open_parens)
167 #define RExC_close_parens (pRExC_state->close_parens)
168 #define RExC_opend (pRExC_state->opend)
169 #define RExC_paren_names (pRExC_state->paren_names)
170 #define RExC_recurse (pRExC_state->recurse)
171 #define RExC_recurse_count (pRExC_state->recurse_count)
173 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
174 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
175 ((*s) == '{' && regcurly(s)))
178 #undef SPSTART /* dratted cpp namespace... */
181 * Flags to be passed up and down.
183 #define WORST 0 /* Worst case. */
184 #define HASWIDTH 0x1 /* Known to match non-null strings. */
185 #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
186 #define SPSTART 0x4 /* Starts with * or +. */
187 #define TRYAGAIN 0x8 /* Weeded out a declaration. */
189 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
191 /* whether trie related optimizations are enabled */
192 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
193 #define TRIE_STUDY_OPT
194 #define FULL_TRIE_STUDY
200 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
201 #define PBITVAL(paren) (1 << ((paren) & 7))
202 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
203 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
204 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
207 /* About scan_data_t.
209 During optimisation we recurse through the regexp program performing
210 various inplace (keyhole style) optimisations. In addition study_chunk
211 and scan_commit populate this data structure with information about
212 what strings MUST appear in the pattern. We look for the longest
213 string that must appear for at a fixed location, and we look for the
214 longest string that may appear at a floating location. So for instance
219 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
220 strings (because they follow a .* construct). study_chunk will identify
221 both FOO and BAR as being the longest fixed and floating strings respectively.
223 The strings can be composites, for instance
227 will result in a composite fixed substring 'foo'.
229 For each string some basic information is maintained:
231 - offset or min_offset
232 This is the position the string must appear at, or not before.
233 It also implicitly (when combined with minlenp) tells us how many
234 character must match before the string we are searching.
235 Likewise when combined with minlenp and the length of the string
236 tells us how many characters must appear after the string we have
240 Only used for floating strings. This is the rightmost point that
241 the string can appear at. Ifset to I32 max it indicates that the
242 string can occur infinitely far to the right.
245 A pointer to the minimum length of the pattern that the string
246 was found inside. This is important as in the case of positive
247 lookahead or positive lookbehind we can have multiple patterns
252 The minimum length of the pattern overall is 3, the minimum length
253 of the lookahead part is 3, but the minimum length of the part that
254 will actually match is 1. So 'FOO's minimum length is 3, but the
255 minimum length for the F is 1. This is important as the minimum length
256 is used to determine offsets in front of and behind the string being
257 looked for. Since strings can be composites this is the length of the
258 pattern at the time it was commited with a scan_commit. Note that
259 the length is calculated by study_chunk, so that the minimum lengths
260 are not known until the full pattern has been compiled, thus the
261 pointer to the value.
265 In the case of lookbehind the string being searched for can be
266 offset past the start point of the final matching string.
267 If this value was just blithely removed from the min_offset it would
268 invalidate some of the calculations for how many chars must match
269 before or after (as they are derived from min_offset and minlen and
270 the length of the string being searched for).
271 When the final pattern is compiled and the data is moved from the
272 scan_data_t structure into the regexp structure the information
273 about lookbehind is factored in, with the information that would
274 have been lost precalculated in the end_shift field for the
277 The fields pos_min and pos_delta are used to store the minimum offset
278 and the delta to the maximum offset at the current point in the pattern.
282 typedef struct scan_data_t {
283 /*I32 len_min; unused */
284 /*I32 len_delta; unused */
288 I32 last_end; /* min value, <0 unless valid. */
291 SV **longest; /* Either &l_fixed, or &l_float. */
292 SV *longest_fixed; /* longest fixed string found in pattern */
293 I32 offset_fixed; /* offset where it starts */
294 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
295 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
296 SV *longest_float; /* longest floating string found in pattern */
297 I32 offset_float_min; /* earliest point in string it can appear */
298 I32 offset_float_max; /* latest point in string it can appear */
299 I32 *minlen_float; /* pointer to the minlen relevent to the string */
300 I32 lookbehind_float; /* is the position of the string modified by LB */
304 struct regnode_charclass_class *start_class;
308 * Forward declarations for pregcomp()'s friends.
311 static const scan_data_t zero_scan_data =
312 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
314 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
315 #define SF_BEFORE_SEOL 0x0001
316 #define SF_BEFORE_MEOL 0x0002
317 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
318 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
321 # define SF_FIX_SHIFT_EOL (0+2)
322 # define SF_FL_SHIFT_EOL (0+4)
324 # define SF_FIX_SHIFT_EOL (+2)
325 # define SF_FL_SHIFT_EOL (+4)
328 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
329 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
331 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
332 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
333 #define SF_IS_INF 0x0040
334 #define SF_HAS_PAR 0x0080
335 #define SF_IN_PAR 0x0100
336 #define SF_HAS_EVAL 0x0200
337 #define SCF_DO_SUBSTR 0x0400
338 #define SCF_DO_STCLASS_AND 0x0800
339 #define SCF_DO_STCLASS_OR 0x1000
340 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
341 #define SCF_WHILEM_VISITED_POS 0x2000
343 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
344 #define SCF_SEEN_ACCEPT 0x8000
346 #define UTF (RExC_utf8 != 0)
347 #define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0)
348 #define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0)
350 #define OOB_UNICODE 12345678
351 #define OOB_NAMEDCLASS -1
353 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
354 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
357 /* length of regex to show in messages that don't mark a position within */
358 #define RegexLengthToShowInErrorMessages 127
361 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
362 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
363 * op/pragma/warn/regcomp.
365 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
366 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
368 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
371 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
372 * arg. Show regex, up to a maximum length. If it's too long, chop and add
375 #define _FAIL(code) STMT_START { \
376 const char *ellipses = ""; \
377 IV len = RExC_end - RExC_precomp; \
380 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
381 if (len > RegexLengthToShowInErrorMessages) { \
382 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
383 len = RegexLengthToShowInErrorMessages - 10; \
389 #define FAIL(msg) _FAIL( \
390 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
391 msg, (int)len, RExC_precomp, ellipses))
393 #define FAIL2(msg,arg) _FAIL( \
394 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
395 arg, (int)len, RExC_precomp, ellipses))
398 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
400 #define Simple_vFAIL(m) STMT_START { \
401 const IV offset = RExC_parse - RExC_precomp; \
402 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
403 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
407 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
409 #define vFAIL(m) STMT_START { \
411 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
416 * Like Simple_vFAIL(), but accepts two arguments.
418 #define Simple_vFAIL2(m,a1) STMT_START { \
419 const IV offset = RExC_parse - RExC_precomp; \
420 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
421 (int)offset, RExC_precomp, RExC_precomp + offset); \
425 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
427 #define vFAIL2(m,a1) STMT_START { \
429 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
430 Simple_vFAIL2(m, a1); \
435 * Like Simple_vFAIL(), but accepts three arguments.
437 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
438 const IV offset = RExC_parse - RExC_precomp; \
439 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
440 (int)offset, RExC_precomp, RExC_precomp + offset); \
444 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
446 #define vFAIL3(m,a1,a2) STMT_START { \
448 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
449 Simple_vFAIL3(m, a1, a2); \
453 * Like Simple_vFAIL(), but accepts four arguments.
455 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
456 const IV offset = RExC_parse - RExC_precomp; \
457 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
458 (int)offset, RExC_precomp, RExC_precomp + offset); \
461 #define vWARN(loc,m) STMT_START { \
462 const IV offset = loc - RExC_precomp; \
463 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
464 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
467 #define vWARNdep(loc,m) STMT_START { \
468 const IV offset = loc - RExC_precomp; \
469 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
470 "%s" REPORT_LOCATION, \
471 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
475 #define vWARN2(loc, m, a1) STMT_START { \
476 const IV offset = loc - RExC_precomp; \
477 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
478 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
481 #define vWARN3(loc, m, a1, a2) STMT_START { \
482 const IV offset = loc - RExC_precomp; \
483 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
484 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
487 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
488 const IV offset = loc - RExC_precomp; \
489 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
490 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
493 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
494 const IV offset = loc - RExC_precomp; \
495 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
496 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
500 /* Allow for side effects in s */
501 #define REGC(c,s) STMT_START { \
502 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
505 /* Macros for recording node offsets. 20001227 mjd@plover.com
506 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
507 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
508 * Element 0 holds the number n.
509 * Position is 1 indexed.
512 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
514 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
515 __LINE__, (int)(node), (int)(byte))); \
517 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
519 RExC_offsets[2*(node)-1] = (byte); \
524 #define Set_Node_Offset(node,byte) \
525 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
526 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
528 #define Set_Node_Length_To_R(node,len) STMT_START { \
530 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
531 __LINE__, (int)(node), (int)(len))); \
533 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
535 RExC_offsets[2*(node)] = (len); \
540 #define Set_Node_Length(node,len) \
541 Set_Node_Length_To_R((node)-RExC_emit_start, len)
542 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
543 #define Set_Node_Cur_Length(node) \
544 Set_Node_Length(node, RExC_parse - parse_start)
546 /* Get offsets and lengths */
547 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
548 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
550 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
551 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
552 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
556 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
557 #define EXPERIMENTAL_INPLACESCAN
560 #define DEBUG_STUDYDATA(data,depth) \
561 DEBUG_OPTIMISE_MORE_r(if(data){ \
562 PerlIO_printf(Perl_debug_log, \
563 "%*s"/* Len:%"IVdf"/%"IVdf" */"Pos:%"IVdf"/%"IVdf \
564 " Flags: %"IVdf" Whilem_c: %"IVdf" Lcp: %"IVdf" ", \
565 (int)(depth)*2, "", \
566 (IV)((data)->pos_min), \
567 (IV)((data)->pos_delta), \
568 (IV)((data)->flags), \
569 (IV)((data)->whilem_c), \
570 (IV)((data)->last_closep ? *((data)->last_closep) : -1) \
572 if ((data)->last_found) \
573 PerlIO_printf(Perl_debug_log, \
574 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
575 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
576 SvPVX_const((data)->last_found), \
577 (IV)((data)->last_end), \
578 (IV)((data)->last_start_min), \
579 (IV)((data)->last_start_max), \
580 ((data)->longest && \
581 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
582 SvPVX_const((data)->longest_fixed), \
583 (IV)((data)->offset_fixed), \
584 ((data)->longest && \
585 (data)->longest==&((data)->longest_float)) ? "*" : "", \
586 SvPVX_const((data)->longest_float), \
587 (IV)((data)->offset_float_min), \
588 (IV)((data)->offset_float_max) \
590 PerlIO_printf(Perl_debug_log,"\n"); \
593 static void clear_re(pTHX_ void *r);
595 /* Mark that we cannot extend a found fixed substring at this point.
596 Update the longest found anchored substring and the longest found
597 floating substrings if needed. */
600 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp)
602 const STRLEN l = CHR_SVLEN(data->last_found);
603 const STRLEN old_l = CHR_SVLEN(*data->longest);
604 GET_RE_DEBUG_FLAGS_DECL;
606 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
607 SvSetMagicSV(*data->longest, data->last_found);
608 if (*data->longest == data->longest_fixed) {
609 data->offset_fixed = l ? data->last_start_min : data->pos_min;
610 if (data->flags & SF_BEFORE_EOL)
612 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
614 data->flags &= ~SF_FIX_BEFORE_EOL;
615 data->minlen_fixed=minlenp;
616 data->lookbehind_fixed=0;
619 data->offset_float_min = l ? data->last_start_min : data->pos_min;
620 data->offset_float_max = (l
621 ? data->last_start_max
622 : data->pos_min + data->pos_delta);
623 if ((U32)data->offset_float_max > (U32)I32_MAX)
624 data->offset_float_max = I32_MAX;
625 if (data->flags & SF_BEFORE_EOL)
627 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
629 data->flags &= ~SF_FL_BEFORE_EOL;
630 data->minlen_float=minlenp;
631 data->lookbehind_float=0;
634 SvCUR_set(data->last_found, 0);
636 SV * const sv = data->last_found;
637 if (SvUTF8(sv) && SvMAGICAL(sv)) {
638 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
644 data->flags &= ~SF_BEFORE_EOL;
645 DEBUG_STUDYDATA(data,0);
648 /* Can match anything (initialization) */
650 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
652 ANYOF_CLASS_ZERO(cl);
653 ANYOF_BITMAP_SETALL(cl);
654 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
656 cl->flags |= ANYOF_LOCALE;
659 /* Can match anything (initialization) */
661 S_cl_is_anything(const struct regnode_charclass_class *cl)
665 for (value = 0; value <= ANYOF_MAX; value += 2)
666 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
668 if (!(cl->flags & ANYOF_UNICODE_ALL))
670 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
675 /* Can match anything (initialization) */
677 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
679 Zero(cl, 1, struct regnode_charclass_class);
681 cl_anything(pRExC_state, cl);
685 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
687 Zero(cl, 1, struct regnode_charclass_class);
689 cl_anything(pRExC_state, cl);
691 cl->flags |= ANYOF_LOCALE;
694 /* 'And' a given class with another one. Can create false positives */
695 /* We assume that cl is not inverted */
697 S_cl_and(struct regnode_charclass_class *cl,
698 const struct regnode_charclass_class *and_with)
701 assert(and_with->type == ANYOF);
702 if (!(and_with->flags & ANYOF_CLASS)
703 && !(cl->flags & ANYOF_CLASS)
704 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
705 && !(and_with->flags & ANYOF_FOLD)
706 && !(cl->flags & ANYOF_FOLD)) {
709 if (and_with->flags & ANYOF_INVERT)
710 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
711 cl->bitmap[i] &= ~and_with->bitmap[i];
713 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
714 cl->bitmap[i] &= and_with->bitmap[i];
715 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
716 if (!(and_with->flags & ANYOF_EOS))
717 cl->flags &= ~ANYOF_EOS;
719 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
720 !(and_with->flags & ANYOF_INVERT)) {
721 cl->flags &= ~ANYOF_UNICODE_ALL;
722 cl->flags |= ANYOF_UNICODE;
723 ARG_SET(cl, ARG(and_with));
725 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
726 !(and_with->flags & ANYOF_INVERT))
727 cl->flags &= ~ANYOF_UNICODE_ALL;
728 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
729 !(and_with->flags & ANYOF_INVERT))
730 cl->flags &= ~ANYOF_UNICODE;
733 /* 'OR' a given class with another one. Can create false positives */
734 /* We assume that cl is not inverted */
736 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
738 if (or_with->flags & ANYOF_INVERT) {
740 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
741 * <= (B1 | !B2) | (CL1 | !CL2)
742 * which is wasteful if CL2 is small, but we ignore CL2:
743 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
744 * XXXX Can we handle case-fold? Unclear:
745 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
746 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
748 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
749 && !(or_with->flags & ANYOF_FOLD)
750 && !(cl->flags & ANYOF_FOLD) ) {
753 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
754 cl->bitmap[i] |= ~or_with->bitmap[i];
755 } /* XXXX: logic is complicated otherwise */
757 cl_anything(pRExC_state, cl);
760 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
761 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
762 && (!(or_with->flags & ANYOF_FOLD)
763 || (cl->flags & ANYOF_FOLD)) ) {
766 /* OR char bitmap and class bitmap separately */
767 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
768 cl->bitmap[i] |= or_with->bitmap[i];
769 if (or_with->flags & ANYOF_CLASS) {
770 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
771 cl->classflags[i] |= or_with->classflags[i];
772 cl->flags |= ANYOF_CLASS;
775 else { /* XXXX: logic is complicated, leave it along for a moment. */
776 cl_anything(pRExC_state, cl);
779 if (or_with->flags & ANYOF_EOS)
780 cl->flags |= ANYOF_EOS;
782 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
783 ARG(cl) != ARG(or_with)) {
784 cl->flags |= ANYOF_UNICODE_ALL;
785 cl->flags &= ~ANYOF_UNICODE;
787 if (or_with->flags & ANYOF_UNICODE_ALL) {
788 cl->flags |= ANYOF_UNICODE_ALL;
789 cl->flags &= ~ANYOF_UNICODE;
793 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
794 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
795 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
796 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
802 dump_trie_interim_list(trie,next_alloc)
803 dump_trie_interim_table(trie,next_alloc)
805 These routines dump out a trie in a somewhat readable format.
806 The _interim_ variants are used for debugging the interim
807 tables that are used to generate the final compressed
808 representation which is what dump_trie expects.
810 Part of the reason for their existance is to provide a form
811 of documentation as to how the different representations function.
817 Dumps the final compressed table form of the trie to Perl_debug_log.
818 Used for debugging make_trie().
822 S_dump_trie(pTHX_ const struct _reg_trie_data *trie,U32 depth)
825 SV *sv=sv_newmortal();
826 int colwidth= trie->widecharmap ? 6 : 4;
827 GET_RE_DEBUG_FLAGS_DECL;
830 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
831 (int)depth * 2 + 2,"",
832 "Match","Base","Ofs" );
834 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
835 SV ** const tmp = av_fetch( trie->revcharmap, state, 0);
837 PerlIO_printf( Perl_debug_log, "%*s",
839 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
840 PL_colors[0], PL_colors[1],
841 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
842 PERL_PV_ESCAPE_FIRSTCHAR
847 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
848 (int)depth * 2 + 2,"");
850 for( state = 0 ; state < trie->uniquecharcount ; state++ )
851 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
852 PerlIO_printf( Perl_debug_log, "\n");
854 for( state = 1 ; state < trie->statecount ; state++ ) {
855 const U32 base = trie->states[ state ].trans.base;
857 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
859 if ( trie->states[ state ].wordnum ) {
860 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
862 PerlIO_printf( Perl_debug_log, "%6s", "" );
865 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
870 while( ( base + ofs < trie->uniquecharcount ) ||
871 ( base + ofs - trie->uniquecharcount < trie->lasttrans
872 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
875 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
877 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
878 if ( ( base + ofs >= trie->uniquecharcount ) &&
879 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
880 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
882 PerlIO_printf( Perl_debug_log, "%*"UVXf,
884 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
886 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
890 PerlIO_printf( Perl_debug_log, "]");
893 PerlIO_printf( Perl_debug_log, "\n" );
897 dump_trie_interim_list(trie,next_alloc)
898 Dumps a fully constructed but uncompressed trie in list form.
899 List tries normally only are used for construction when the number of
900 possible chars (trie->uniquecharcount) is very high.
901 Used for debugging make_trie().
904 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc,U32 depth)
907 SV *sv=sv_newmortal();
908 int colwidth= trie->widecharmap ? 6 : 4;
909 GET_RE_DEBUG_FLAGS_DECL;
910 /* print out the table precompression. */
911 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
912 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
913 "------:-----+-----------------\n" );
915 for( state=1 ; state < next_alloc ; state ++ ) {
918 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
919 (int)depth * 2 + 2,"", (UV)state );
920 if ( ! trie->states[ state ].wordnum ) {
921 PerlIO_printf( Perl_debug_log, "%5s| ","");
923 PerlIO_printf( Perl_debug_log, "W%4x| ",
924 trie->states[ state ].wordnum
927 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
928 SV ** const tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
930 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
932 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
933 PL_colors[0], PL_colors[1],
934 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
935 PERL_PV_ESCAPE_FIRSTCHAR
937 TRIE_LIST_ITEM(state,charid).forid,
938 (UV)TRIE_LIST_ITEM(state,charid).newstate
941 PerlIO_printf(Perl_debug_log, "\n%*s| ",
942 (int)((depth * 2) + 14), "");
945 PerlIO_printf( Perl_debug_log, "\n");
950 dump_trie_interim_table(trie,next_alloc)
951 Dumps a fully constructed but uncompressed trie in table form.
952 This is the normal DFA style state transition table, with a few
953 twists to facilitate compression later.
954 Used for debugging make_trie().
957 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc, U32 depth)
961 SV *sv=sv_newmortal();
962 int colwidth= trie->widecharmap ? 6 : 4;
963 GET_RE_DEBUG_FLAGS_DECL;
966 print out the table precompression so that we can do a visual check
967 that they are identical.
970 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
972 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
973 SV ** const tmp = av_fetch( trie->revcharmap, charid, 0);
975 PerlIO_printf( Perl_debug_log, "%*s",
977 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
978 PL_colors[0], PL_colors[1],
979 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
980 PERL_PV_ESCAPE_FIRSTCHAR
986 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
988 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
989 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
992 PerlIO_printf( Perl_debug_log, "\n" );
994 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
996 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
997 (int)depth * 2 + 2,"",
998 (UV)TRIE_NODENUM( state ) );
1000 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1001 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1003 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1005 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1007 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1008 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1010 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1011 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1018 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1019 startbranch: the first branch in the whole branch sequence
1020 first : start branch of sequence of branch-exact nodes.
1021 May be the same as startbranch
1022 last : Thing following the last branch.
1023 May be the same as tail.
1024 tail : item following the branch sequence
1025 count : words in the sequence
1026 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1027 depth : indent depth
1029 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1031 A trie is an N'ary tree where the branches are determined by digital
1032 decomposition of the key. IE, at the root node you look up the 1st character and
1033 follow that branch repeat until you find the end of the branches. Nodes can be
1034 marked as "accepting" meaning they represent a complete word. Eg:
1038 would convert into the following structure. Numbers represent states, letters
1039 following numbers represent valid transitions on the letter from that state, if
1040 the number is in square brackets it represents an accepting state, otherwise it
1041 will be in parenthesis.
1043 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1047 (1) +-i->(6)-+-s->[7]
1049 +-s->(3)-+-h->(4)-+-e->[5]
1051 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1053 This shows that when matching against the string 'hers' we will begin at state 1
1054 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1055 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1056 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1057 single traverse. We store a mapping from accepting to state to which word was
1058 matched, and then when we have multiple possibilities we try to complete the
1059 rest of the regex in the order in which they occured in the alternation.
1061 The only prior NFA like behaviour that would be changed by the TRIE support is
1062 the silent ignoring of duplicate alternations which are of the form:
1064 / (DUPE|DUPE) X? (?{ ... }) Y /x
1066 Thus EVAL blocks follwing a trie may be called a different number of times with
1067 and without the optimisation. With the optimisations dupes will be silently
1068 ignored. This inconsistant behaviour of EVAL type nodes is well established as
1069 the following demonstrates:
1071 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1073 which prints out 'word' three times, but
1075 'words'=~/(word|word|word)(?{ print $1 })S/
1077 which doesnt print it out at all. This is due to other optimisations kicking in.
1079 Example of what happens on a structural level:
1081 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1083 1: CURLYM[1] {1,32767}(18)
1094 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1095 and should turn into:
1097 1: CURLYM[1] {1,32767}(18)
1099 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1107 Cases where tail != last would be like /(?foo|bar)baz/:
1117 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1118 and would end up looking like:
1121 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1128 d = uvuni_to_utf8_flags(d, uv, 0);
1130 is the recommended Unicode-aware way of saying
1135 #define TRIE_STORE_REVCHAR \
1137 SV *tmp = newSVpvs(""); \
1138 if (UTF) SvUTF8_on(tmp); \
1139 Perl_sv_catpvf( aTHX_ tmp, "%c", (int)uvc ); \
1140 av_push( TRIE_REVCHARMAP(trie), tmp ); \
1143 #define TRIE_READ_CHAR STMT_START { \
1147 if ( foldlen > 0 ) { \
1148 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1153 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1154 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1155 foldlen -= UNISKIP( uvc ); \
1156 scan = foldbuf + UNISKIP( uvc ); \
1159 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1169 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1170 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1171 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1172 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1174 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1175 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1176 TRIE_LIST_CUR( state )++; \
1179 #define TRIE_LIST_NEW(state) STMT_START { \
1180 Newxz( trie->states[ state ].trans.list, \
1181 4, reg_trie_trans_le ); \
1182 TRIE_LIST_CUR( state ) = 1; \
1183 TRIE_LIST_LEN( state ) = 4; \
1186 #define TRIE_HANDLE_WORD(state) STMT_START { \
1187 U16 dupe= trie->states[ state ].wordnum; \
1188 regnode * const noper_next = regnext( noper ); \
1190 if (trie->wordlen) \
1191 trie->wordlen[ curword ] = wordlen; \
1193 /* store the word for dumping */ \
1195 if (OP(noper) != NOTHING) \
1196 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
1198 tmp = newSVpvn( "", 0 ); \
1199 if ( UTF ) SvUTF8_on( tmp ); \
1200 av_push( trie->words, tmp ); \
1205 if ( noper_next < tail ) { \
1207 Newxz( trie->jump, word_count + 1, U16); \
1208 trie->jump[curword] = (U16)(noper_next - convert); \
1210 jumper = noper_next; \
1212 nextbranch= regnext(cur); \
1216 /* So it's a dupe. This means we need to maintain a */\
1217 /* linked-list from the first to the next. */\
1218 /* we only allocate the nextword buffer when there */\
1219 /* a dupe, so first time we have to do the allocation */\
1220 if (!trie->nextword) \
1221 Newxz( trie->nextword, word_count + 1, U16); \
1222 while ( trie->nextword[dupe] ) \
1223 dupe= trie->nextword[dupe]; \
1224 trie->nextword[dupe]= curword; \
1226 /* we haven't inserted this word yet. */ \
1227 trie->states[ state ].wordnum = curword; \
1232 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1233 ( ( base + charid >= ucharcount \
1234 && base + charid < ubound \
1235 && state == trie->trans[ base - ucharcount + charid ].check \
1236 && trie->trans[ base - ucharcount + charid ].next ) \
1237 ? trie->trans[ base - ucharcount + charid ].next \
1238 : ( state==1 ? special : 0 ) \
1242 #define MADE_JUMP_TRIE 2
1243 #define MADE_EXACT_TRIE 4
1246 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1249 /* first pass, loop through and scan words */
1250 reg_trie_data *trie;
1252 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1257 regnode *jumper = NULL;
1258 regnode *nextbranch = NULL;
1259 regnode *convert = NULL;
1260 /* we just use folder as a flag in utf8 */
1261 const U8 * const folder = ( flags == EXACTF
1263 : ( flags == EXACTFL
1269 const U32 data_slot = add_data( pRExC_state, 1, "t" );
1270 SV *re_trie_maxbuff;
1272 /* these are only used during construction but are useful during
1273 * debugging so we store them in the struct when debugging.
1275 STRLEN trie_charcount=0;
1276 AV *trie_revcharmap;
1278 GET_RE_DEBUG_FLAGS_DECL;
1280 PERL_UNUSED_ARG(depth);
1283 Newxz( trie, 1, reg_trie_data );
1285 trie->startstate = 1;
1286 trie->wordcount = word_count;
1287 RExC_rxi->data->data[ data_slot ] = (void*)trie;
1288 Newxz( trie->charmap, 256, U16 );
1289 if (!(UTF && folder))
1290 Newxz( trie->bitmap, ANYOF_BITMAP_SIZE, char );
1292 trie->words = newAV();
1294 TRIE_REVCHARMAP(trie) = newAV();
1296 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1297 if (!SvIOK(re_trie_maxbuff)) {
1298 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1301 PerlIO_printf( Perl_debug_log,
1302 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1303 (int)depth * 2 + 2, "",
1304 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1305 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1309 /* Find the node we are going to overwrite */
1310 if ( first == startbranch && OP( last ) != BRANCH ) {
1311 /* whole branch chain */
1314 /* branch sub-chain */
1315 convert = NEXTOPER( first );
1318 /* -- First loop and Setup --
1320 We first traverse the branches and scan each word to determine if it
1321 contains widechars, and how many unique chars there are, this is
1322 important as we have to build a table with at least as many columns as we
1325 We use an array of integers to represent the character codes 0..255
1326 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1327 native representation of the character value as the key and IV's for the
1330 *TODO* If we keep track of how many times each character is used we can
1331 remap the columns so that the table compression later on is more
1332 efficient in terms of memory by ensuring most common value is in the
1333 middle and the least common are on the outside. IMO this would be better
1334 than a most to least common mapping as theres a decent chance the most
1335 common letter will share a node with the least common, meaning the node
1336 will not be compressable. With a middle is most common approach the worst
1337 case is when we have the least common nodes twice.
1341 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1342 regnode * const noper = NEXTOPER( cur );
1343 const U8 *uc = (U8*)STRING( noper );
1344 const U8 * const e = uc + STR_LEN( noper );
1346 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1347 const U8 *scan = (U8*)NULL;
1348 U32 wordlen = 0; /* required init */
1351 if (OP(noper) == NOTHING) {
1356 TRIE_BITMAP_SET(trie,*uc);
1357 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1359 for ( ; uc < e ; uc += len ) {
1360 TRIE_CHARCOUNT(trie)++;
1364 if ( !trie->charmap[ uvc ] ) {
1365 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1367 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1372 if ( !trie->widecharmap )
1373 trie->widecharmap = newHV();
1375 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1378 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1380 if ( !SvTRUE( *svpp ) ) {
1381 sv_setiv( *svpp, ++trie->uniquecharcount );
1386 if( cur == first ) {
1389 } else if (chars < trie->minlen) {
1391 } else if (chars > trie->maxlen) {
1395 } /* end first pass */
1396 DEBUG_TRIE_COMPILE_r(
1397 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1398 (int)depth * 2 + 2,"",
1399 ( trie->widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1400 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1401 (int)trie->minlen, (int)trie->maxlen )
1403 Newxz( trie->wordlen, word_count, U32 );
1406 We now know what we are dealing with in terms of unique chars and
1407 string sizes so we can calculate how much memory a naive
1408 representation using a flat table will take. If it's over a reasonable
1409 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1410 conservative but potentially much slower representation using an array
1413 At the end we convert both representations into the same compressed
1414 form that will be used in regexec.c for matching with. The latter
1415 is a form that cannot be used to construct with but has memory
1416 properties similar to the list form and access properties similar
1417 to the table form making it both suitable for fast searches and
1418 small enough that its feasable to store for the duration of a program.
1420 See the comment in the code where the compressed table is produced
1421 inplace from the flat tabe representation for an explanation of how
1422 the compression works.
1427 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1429 Second Pass -- Array Of Lists Representation
1431 Each state will be represented by a list of charid:state records
1432 (reg_trie_trans_le) the first such element holds the CUR and LEN
1433 points of the allocated array. (See defines above).
1435 We build the initial structure using the lists, and then convert
1436 it into the compressed table form which allows faster lookups
1437 (but cant be modified once converted).
1440 STRLEN transcount = 1;
1442 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1443 "%*sCompiling trie using list compiler\n",
1444 (int)depth * 2 + 2, ""));
1446 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
1450 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1452 regnode * const noper = NEXTOPER( cur );
1453 U8 *uc = (U8*)STRING( noper );
1454 const U8 * const e = uc + STR_LEN( noper );
1455 U32 state = 1; /* required init */
1456 U16 charid = 0; /* sanity init */
1457 U8 *scan = (U8*)NULL; /* sanity init */
1458 STRLEN foldlen = 0; /* required init */
1459 U32 wordlen = 0; /* required init */
1460 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1462 if (OP(noper) != NOTHING) {
1463 for ( ; uc < e ; uc += len ) {
1468 charid = trie->charmap[ uvc ];
1470 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1474 charid=(U16)SvIV( *svpp );
1477 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1484 if ( !trie->states[ state ].trans.list ) {
1485 TRIE_LIST_NEW( state );
1487 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1488 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1489 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1494 newstate = next_alloc++;
1495 TRIE_LIST_PUSH( state, charid, newstate );
1500 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1504 TRIE_HANDLE_WORD(state);
1506 } /* end second pass */
1508 /* next alloc is the NEXT state to be allocated */
1509 trie->statecount = next_alloc;
1510 Renew( trie->states, next_alloc, reg_trie_state );
1512 /* and now dump it out before we compress it */
1513 DEBUG_TRIE_COMPILE_MORE_r(
1514 dump_trie_interim_list(trie,next_alloc,depth+1)
1517 Newxz( trie->trans, transcount ,reg_trie_trans );
1524 for( state=1 ; state < next_alloc ; state ++ ) {
1528 DEBUG_TRIE_COMPILE_MORE_r(
1529 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1533 if (trie->states[state].trans.list) {
1534 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1538 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1539 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1540 if ( forid < minid ) {
1542 } else if ( forid > maxid ) {
1546 if ( transcount < tp + maxid - minid + 1) {
1548 Renew( trie->trans, transcount, reg_trie_trans );
1549 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1551 base = trie->uniquecharcount + tp - minid;
1552 if ( maxid == minid ) {
1554 for ( ; zp < tp ; zp++ ) {
1555 if ( ! trie->trans[ zp ].next ) {
1556 base = trie->uniquecharcount + zp - minid;
1557 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1558 trie->trans[ zp ].check = state;
1564 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1565 trie->trans[ tp ].check = state;
1570 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1571 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1572 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1573 trie->trans[ tid ].check = state;
1575 tp += ( maxid - minid + 1 );
1577 Safefree(trie->states[ state ].trans.list);
1580 DEBUG_TRIE_COMPILE_MORE_r(
1581 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1584 trie->states[ state ].trans.base=base;
1586 trie->lasttrans = tp + 1;
1590 Second Pass -- Flat Table Representation.
1592 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1593 We know that we will need Charcount+1 trans at most to store the data
1594 (one row per char at worst case) So we preallocate both structures
1595 assuming worst case.
1597 We then construct the trie using only the .next slots of the entry
1600 We use the .check field of the first entry of the node temporarily to
1601 make compression both faster and easier by keeping track of how many non
1602 zero fields are in the node.
1604 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1607 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1608 number representing the first entry of the node, and state as a
1609 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1610 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1611 are 2 entrys per node. eg:
1619 The table is internally in the right hand, idx form. However as we also
1620 have to deal with the states array which is indexed by nodenum we have to
1621 use TRIE_NODENUM() to convert.
1624 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1625 "%*sCompiling trie using table compiler\n",
1626 (int)depth * 2 + 2, ""));
1628 Newxz( trie->trans, ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1,
1630 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
1631 next_alloc = trie->uniquecharcount + 1;
1634 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1636 regnode * const noper = NEXTOPER( cur );
1637 const U8 *uc = (U8*)STRING( noper );
1638 const U8 * const e = uc + STR_LEN( noper );
1640 U32 state = 1; /* required init */
1642 U16 charid = 0; /* sanity init */
1643 U32 accept_state = 0; /* sanity init */
1644 U8 *scan = (U8*)NULL; /* sanity init */
1646 STRLEN foldlen = 0; /* required init */
1647 U32 wordlen = 0; /* required init */
1648 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1650 if ( OP(noper) != NOTHING ) {
1651 for ( ; uc < e ; uc += len ) {
1656 charid = trie->charmap[ uvc ];
1658 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1659 charid = svpp ? (U16)SvIV(*svpp) : 0;
1663 if ( !trie->trans[ state + charid ].next ) {
1664 trie->trans[ state + charid ].next = next_alloc;
1665 trie->trans[ state ].check++;
1666 next_alloc += trie->uniquecharcount;
1668 state = trie->trans[ state + charid ].next;
1670 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1672 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1675 accept_state = TRIE_NODENUM( state );
1676 TRIE_HANDLE_WORD(accept_state);
1678 } /* end second pass */
1680 /* and now dump it out before we compress it */
1681 DEBUG_TRIE_COMPILE_MORE_r(
1682 dump_trie_interim_table(trie,next_alloc,depth+1)
1687 * Inplace compress the table.*
1689 For sparse data sets the table constructed by the trie algorithm will
1690 be mostly 0/FAIL transitions or to put it another way mostly empty.
1691 (Note that leaf nodes will not contain any transitions.)
1693 This algorithm compresses the tables by eliminating most such
1694 transitions, at the cost of a modest bit of extra work during lookup:
1696 - Each states[] entry contains a .base field which indicates the
1697 index in the state[] array wheres its transition data is stored.
1699 - If .base is 0 there are no valid transitions from that node.
1701 - If .base is nonzero then charid is added to it to find an entry in
1704 -If trans[states[state].base+charid].check!=state then the
1705 transition is taken to be a 0/Fail transition. Thus if there are fail
1706 transitions at the front of the node then the .base offset will point
1707 somewhere inside the previous nodes data (or maybe even into a node
1708 even earlier), but the .check field determines if the transition is
1712 The following process inplace converts the table to the compressed
1713 table: We first do not compress the root node 1,and mark its all its
1714 .check pointers as 1 and set its .base pointer as 1 as well. This
1715 allows to do a DFA construction from the compressed table later, and
1716 ensures that any .base pointers we calculate later are greater than
1719 - We set 'pos' to indicate the first entry of the second node.
1721 - We then iterate over the columns of the node, finding the first and
1722 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1723 and set the .check pointers accordingly, and advance pos
1724 appropriately and repreat for the next node. Note that when we copy
1725 the next pointers we have to convert them from the original
1726 NODEIDX form to NODENUM form as the former is not valid post
1729 - If a node has no transitions used we mark its base as 0 and do not
1730 advance the pos pointer.
1732 - If a node only has one transition we use a second pointer into the
1733 structure to fill in allocated fail transitions from other states.
1734 This pointer is independent of the main pointer and scans forward
1735 looking for null transitions that are allocated to a state. When it
1736 finds one it writes the single transition into the "hole". If the
1737 pointer doesnt find one the single transition is appended as normal.
1739 - Once compressed we can Renew/realloc the structures to release the
1742 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1743 specifically Fig 3.47 and the associated pseudocode.
1747 const U32 laststate = TRIE_NODENUM( next_alloc );
1750 trie->statecount = laststate;
1752 for ( state = 1 ; state < laststate ; state++ ) {
1754 const U32 stateidx = TRIE_NODEIDX( state );
1755 const U32 o_used = trie->trans[ stateidx ].check;
1756 U32 used = trie->trans[ stateidx ].check;
1757 trie->trans[ stateidx ].check = 0;
1759 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1760 if ( flag || trie->trans[ stateidx + charid ].next ) {
1761 if ( trie->trans[ stateidx + charid ].next ) {
1763 for ( ; zp < pos ; zp++ ) {
1764 if ( ! trie->trans[ zp ].next ) {
1768 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1769 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1770 trie->trans[ zp ].check = state;
1771 if ( ++zp > pos ) pos = zp;
1778 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1780 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1781 trie->trans[ pos ].check = state;
1786 trie->lasttrans = pos + 1;
1787 Renew( trie->states, laststate, reg_trie_state);
1788 DEBUG_TRIE_COMPILE_MORE_r(
1789 PerlIO_printf( Perl_debug_log,
1790 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1791 (int)depth * 2 + 2,"",
1792 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1795 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1798 } /* end table compress */
1800 DEBUG_TRIE_COMPILE_MORE_r(
1801 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1802 (int)depth * 2 + 2, "",
1803 (UV)trie->statecount,
1804 (UV)trie->lasttrans)
1806 /* resize the trans array to remove unused space */
1807 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
1809 /* and now dump out the compressed format */
1810 DEBUG_TRIE_COMPILE_r(
1811 dump_trie(trie,depth+1)
1814 { /* Modify the program and insert the new TRIE node*/
1815 U8 nodetype =(U8)(flags & 0xFF);
1819 regnode *optimize = NULL;
1821 U32 mjd_nodelen = 0;
1824 This means we convert either the first branch or the first Exact,
1825 depending on whether the thing following (in 'last') is a branch
1826 or not and whther first is the startbranch (ie is it a sub part of
1827 the alternation or is it the whole thing.)
1828 Assuming its a sub part we conver the EXACT otherwise we convert
1829 the whole branch sequence, including the first.
1831 /* Find the node we are going to overwrite */
1832 if ( first != startbranch || OP( last ) == BRANCH ) {
1833 /* branch sub-chain */
1834 NEXT_OFF( first ) = (U16)(last - first);
1836 mjd_offset= Node_Offset((convert));
1837 mjd_nodelen= Node_Length((convert));
1839 /* whole branch chain */
1842 const regnode *nop = NEXTOPER( convert );
1843 mjd_offset= Node_Offset((nop));
1844 mjd_nodelen= Node_Length((nop));
1849 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1850 (int)depth * 2 + 2, "",
1851 (UV)mjd_offset, (UV)mjd_nodelen)
1854 /* But first we check to see if there is a common prefix we can
1855 split out as an EXACT and put in front of the TRIE node. */
1856 trie->startstate= 1;
1857 if ( trie->bitmap && !trie->widecharmap && !trie->jump ) {
1859 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
1863 const U32 base = trie->states[ state ].trans.base;
1865 if ( trie->states[state].wordnum )
1868 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1869 if ( ( base + ofs >= trie->uniquecharcount ) &&
1870 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1871 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1873 if ( ++count > 1 ) {
1874 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), ofs, 0);
1875 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
1876 if ( state == 1 ) break;
1878 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1880 PerlIO_printf(Perl_debug_log,
1881 "%*sNew Start State=%"UVuf" Class: [",
1882 (int)depth * 2 + 2, "",
1885 SV ** const tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1886 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
1888 TRIE_BITMAP_SET(trie,*ch);
1890 TRIE_BITMAP_SET(trie, folder[ *ch ]);
1892 PerlIO_printf(Perl_debug_log, (char*)ch)
1896 TRIE_BITMAP_SET(trie,*ch);
1898 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1899 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1905 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1906 char *ch = SvPV_nolen( *tmp );
1908 SV *sv=sv_newmortal();
1909 PerlIO_printf( Perl_debug_log,
1910 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1911 (int)depth * 2 + 2, "",
1913 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
1914 PL_colors[0], PL_colors[1],
1915 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1916 PERL_PV_ESCAPE_FIRSTCHAR
1921 OP( convert ) = nodetype;
1922 str=STRING(convert);
1933 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
1939 regnode *n = convert+NODE_SZ_STR(convert);
1940 NEXT_OFF(convert) = NODE_SZ_STR(convert);
1941 trie->startstate = state;
1942 trie->minlen -= (state - 1);
1943 trie->maxlen -= (state - 1);
1945 regnode *fix = convert;
1946 U32 word = trie->wordcount;
1948 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1949 while( ++fix < n ) {
1950 Set_Node_Offset_Length(fix, 0, 0);
1953 SV ** const tmp = av_fetch( trie->words, word, 0 );
1955 if ( STR_LEN(convert) <= SvCUR(*tmp) )
1956 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
1958 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
1965 NEXT_OFF(convert) = (U16)(tail - convert);
1966 DEBUG_r(optimize= n);
1972 if ( trie->maxlen ) {
1973 NEXT_OFF( convert ) = (U16)(tail - convert);
1974 ARG_SET( convert, data_slot );
1975 /* Store the offset to the first unabsorbed branch in
1976 jump[0], which is otherwise unused by the jump logic.
1977 We use this when dumping a trie and during optimisation. */
1979 trie->jump[0] = (U16)(nextbranch - convert);
1982 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1983 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
1985 OP( convert ) = TRIEC;
1986 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
1987 Safefree(trie->bitmap);
1990 OP( convert ) = TRIE;
1992 /* store the type in the flags */
1993 convert->flags = nodetype;
1997 + regarglen[ OP( convert ) ];
1999 /* XXX We really should free up the resource in trie now,
2000 as we won't use them - (which resources?) dmq */
2002 /* needed for dumping*/
2003 DEBUG_r(if (optimize) {
2004 regnode *opt = convert;
2005 while ( ++opt < optimize) {
2006 Set_Node_Offset_Length(opt,0,0);
2009 Try to clean up some of the debris left after the
2012 while( optimize < jumper ) {
2013 mjd_nodelen += Node_Length((optimize));
2014 OP( optimize ) = OPTIMIZED;
2015 Set_Node_Offset_Length(optimize,0,0);
2018 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2020 } /* end node insert */
2022 SvREFCNT_dec(TRIE_REVCHARMAP(trie));
2026 : trie->startstate>1
2032 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2034 /* The Trie is constructed and compressed now so we can build a fail array now if its needed
2036 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2037 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2040 We find the fail state for each state in the trie, this state is the longest proper
2041 suffix of the current states 'word' that is also a proper prefix of another word in our
2042 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2043 the DFA not to have to restart after its tried and failed a word at a given point, it
2044 simply continues as though it had been matching the other word in the first place.
2046 'abcdgu'=~/abcdefg|cdgu/
2047 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2048 fail, which would bring use to the state representing 'd' in the second word where we would
2049 try 'g' and succeed, prodceding to match 'cdgu'.
2051 /* add a fail transition */
2052 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[ARG(source)];
2054 const U32 ucharcount = trie->uniquecharcount;
2055 const U32 numstates = trie->statecount;
2056 const U32 ubound = trie->lasttrans + ucharcount;
2060 U32 base = trie->states[ 1 ].trans.base;
2063 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2064 GET_RE_DEBUG_FLAGS_DECL;
2066 PERL_UNUSED_ARG(depth);
2070 ARG_SET( stclass, data_slot );
2071 Newxz( aho, 1, reg_ac_data );
2072 RExC_rxi->data->data[ data_slot ] = (void*)aho;
2074 aho->states=(reg_trie_state *)savepvn((const char*)trie->states,
2075 numstates * sizeof(reg_trie_state));
2076 Newxz( q, numstates, U32);
2077 Newxz( aho->fail, numstates, U32 );
2080 /* initialize fail[0..1] to be 1 so that we always have
2081 a valid final fail state */
2082 fail[ 0 ] = fail[ 1 ] = 1;
2084 for ( charid = 0; charid < ucharcount ; charid++ ) {
2085 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2087 q[ q_write ] = newstate;
2088 /* set to point at the root */
2089 fail[ q[ q_write++ ] ]=1;
2092 while ( q_read < q_write) {
2093 const U32 cur = q[ q_read++ % numstates ];
2094 base = trie->states[ cur ].trans.base;
2096 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2097 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2099 U32 fail_state = cur;
2102 fail_state = fail[ fail_state ];
2103 fail_base = aho->states[ fail_state ].trans.base;
2104 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2106 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2107 fail[ ch_state ] = fail_state;
2108 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2110 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2112 q[ q_write++ % numstates] = ch_state;
2116 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2117 when we fail in state 1, this allows us to use the
2118 charclass scan to find a valid start char. This is based on the principle
2119 that theres a good chance the string being searched contains lots of stuff
2120 that cant be a start char.
2122 fail[ 0 ] = fail[ 1 ] = 0;
2123 DEBUG_TRIE_COMPILE_r({
2124 PerlIO_printf(Perl_debug_log,
2125 "%*sStclass Failtable (%"UVuf" states): 0",
2126 (int)(depth * 2), "", (UV)numstates
2128 for( q_read=1; q_read<numstates; q_read++ ) {
2129 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2131 PerlIO_printf(Perl_debug_log, "\n");
2134 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2139 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2140 * These need to be revisited when a newer toolchain becomes available.
2142 #if defined(__sparc64__) && defined(__GNUC__)
2143 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2144 # undef SPARC64_GCC_WORKAROUND
2145 # define SPARC64_GCC_WORKAROUND 1
2149 #define DEBUG_PEEP(str,scan,depth) \
2150 DEBUG_OPTIMISE_r({if (scan){ \
2151 SV * const mysv=sv_newmortal(); \
2152 regnode *Next = regnext(scan); \
2153 regprop(RExC_rx, mysv, scan); \
2154 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2155 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2156 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2163 #define JOIN_EXACT(scan,min,flags) \
2164 if (PL_regkind[OP(scan)] == EXACT) \
2165 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2168 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2169 /* Merge several consecutive EXACTish nodes into one. */
2170 regnode *n = regnext(scan);
2172 regnode *next = scan + NODE_SZ_STR(scan);
2176 regnode *stop = scan;
2177 GET_RE_DEBUG_FLAGS_DECL;
2179 PERL_UNUSED_ARG(depth);
2181 #ifndef EXPERIMENTAL_INPLACESCAN
2182 PERL_UNUSED_ARG(flags);
2183 PERL_UNUSED_ARG(val);
2185 DEBUG_PEEP("join",scan,depth);
2187 /* Skip NOTHING, merge EXACT*. */
2189 ( PL_regkind[OP(n)] == NOTHING ||
2190 (stringok && (OP(n) == OP(scan))))
2192 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2194 if (OP(n) == TAIL || n > next)
2196 if (PL_regkind[OP(n)] == NOTHING) {
2197 DEBUG_PEEP("skip:",n,depth);
2198 NEXT_OFF(scan) += NEXT_OFF(n);
2199 next = n + NODE_STEP_REGNODE;
2206 else if (stringok) {
2207 const unsigned int oldl = STR_LEN(scan);
2208 regnode * const nnext = regnext(n);
2210 DEBUG_PEEP("merg",n,depth);
2213 if (oldl + STR_LEN(n) > U8_MAX)
2215 NEXT_OFF(scan) += NEXT_OFF(n);
2216 STR_LEN(scan) += STR_LEN(n);
2217 next = n + NODE_SZ_STR(n);
2218 /* Now we can overwrite *n : */
2219 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2227 #ifdef EXPERIMENTAL_INPLACESCAN
2228 if (flags && !NEXT_OFF(n)) {
2229 DEBUG_PEEP("atch", val, depth);
2230 if (reg_off_by_arg[OP(n)]) {
2231 ARG_SET(n, val - n);
2234 NEXT_OFF(n) = val - n;
2241 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2243 Two problematic code points in Unicode casefolding of EXACT nodes:
2245 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2246 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2252 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2253 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2255 This means that in case-insensitive matching (or "loose matching",
2256 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2257 length of the above casefolded versions) can match a target string
2258 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2259 This would rather mess up the minimum length computation.
2261 What we'll do is to look for the tail four bytes, and then peek
2262 at the preceding two bytes to see whether we need to decrease
2263 the minimum length by four (six minus two).
2265 Thanks to the design of UTF-8, there cannot be false matches:
2266 A sequence of valid UTF-8 bytes cannot be a subsequence of
2267 another valid sequence of UTF-8 bytes.
2270 char * const s0 = STRING(scan), *s, *t;
2271 char * const s1 = s0 + STR_LEN(scan) - 1;
2272 char * const s2 = s1 - 4;
2273 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2274 const char t0[] = "\xaf\x49\xaf\x42";
2276 const char t0[] = "\xcc\x88\xcc\x81";
2278 const char * const t1 = t0 + 3;
2281 s < s2 && (t = ninstr(s, s1, t0, t1));
2284 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2285 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2287 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2288 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2296 n = scan + NODE_SZ_STR(scan);
2298 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2305 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2309 /* REx optimizer. Converts nodes into quickier variants "in place".
2310 Finds fixed substrings. */
2312 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2313 to the position after last scanned or to NULL. */
2315 #define INIT_AND_WITHP \
2316 assert(!and_withp); \
2317 Newx(and_withp,1,struct regnode_charclass_class); \
2318 SAVEFREEPV(and_withp)
2320 /* this is a chain of data about sub patterns we are processing that
2321 need to be handled seperately/specially in study_chunk. Its so
2322 we can simulate recursion without losing state. */
2324 typedef struct scan_frame {
2325 regnode *last; /* last node to process in this frame */
2326 regnode *next; /* next node to process when last is reached */
2327 struct scan_frame *prev; /*previous frame*/
2328 I32 stop; /* what stopparen do we use */
2332 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2333 I32 *minlenp, I32 *deltap,
2338 struct regnode_charclass_class *and_withp,
2339 U32 flags, U32 depth)
2340 /* scanp: Start here (read-write). */
2341 /* deltap: Write maxlen-minlen here. */
2342 /* last: Stop before this one. */
2343 /* data: string data about the pattern */
2344 /* stopparen: treat close N as END */
2345 /* recursed: which subroutines have we recursed into */
2346 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2349 I32 min = 0, pars = 0, code;
2350 regnode *scan = *scanp, *next;
2352 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2353 int is_inf_internal = 0; /* The studied chunk is infinite */
2354 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2355 scan_data_t data_fake;
2356 SV *re_trie_maxbuff = NULL;
2357 regnode *first_non_open = scan;
2358 I32 stopmin = I32_MAX;
2359 scan_frame *frame = NULL;
2361 GET_RE_DEBUG_FLAGS_DECL;
2364 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2368 while (first_non_open && OP(first_non_open) == OPEN)
2369 first_non_open=regnext(first_non_open);
2374 while ( scan && OP(scan) != END && scan < last ){
2375 /* Peephole optimizer: */
2376 DEBUG_STUDYDATA(data,depth);
2377 DEBUG_PEEP("Peep",scan,depth);
2378 JOIN_EXACT(scan,&min,0);
2380 /* Follow the next-chain of the current node and optimize
2381 away all the NOTHINGs from it. */
2382 if (OP(scan) != CURLYX) {
2383 const int max = (reg_off_by_arg[OP(scan)]
2385 /* I32 may be smaller than U16 on CRAYs! */
2386 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2387 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2391 /* Skip NOTHING and LONGJMP. */
2392 while ((n = regnext(n))
2393 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2394 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2395 && off + noff < max)
2397 if (reg_off_by_arg[OP(scan)])
2400 NEXT_OFF(scan) = off;
2405 /* The principal pseudo-switch. Cannot be a switch, since we
2406 look into several different things. */
2407 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2408 || OP(scan) == IFTHEN) {
2409 next = regnext(scan);
2411 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2413 if (OP(next) == code || code == IFTHEN) {
2414 /* NOTE - There is similar code to this block below for handling
2415 TRIE nodes on a re-study. If you change stuff here check there
2417 I32 max1 = 0, min1 = I32_MAX, num = 0;
2418 struct regnode_charclass_class accum;
2419 regnode * const startbranch=scan;
2421 if (flags & SCF_DO_SUBSTR)
2422 scan_commit(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2423 if (flags & SCF_DO_STCLASS)
2424 cl_init_zero(pRExC_state, &accum);
2426 while (OP(scan) == code) {
2427 I32 deltanext, minnext, f = 0, fake;
2428 struct regnode_charclass_class this_class;
2431 data_fake.flags = 0;
2433 data_fake.whilem_c = data->whilem_c;
2434 data_fake.last_closep = data->last_closep;
2437 data_fake.last_closep = &fake;
2439 data_fake.pos_delta = delta;
2440 next = regnext(scan);
2441 scan = NEXTOPER(scan);
2443 scan = NEXTOPER(scan);
2444 if (flags & SCF_DO_STCLASS) {
2445 cl_init(pRExC_state, &this_class);
2446 data_fake.start_class = &this_class;
2447 f = SCF_DO_STCLASS_AND;
2449 if (flags & SCF_WHILEM_VISITED_POS)
2450 f |= SCF_WHILEM_VISITED_POS;
2452 /* we suppose the run is continuous, last=next...*/
2453 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2455 stopparen, recursed, NULL, f,depth+1);
2458 if (max1 < minnext + deltanext)
2459 max1 = minnext + deltanext;
2460 if (deltanext == I32_MAX)
2461 is_inf = is_inf_internal = 1;
2463 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2465 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2466 if ( stopmin > minnext)
2467 stopmin = min + min1;
2468 flags &= ~SCF_DO_SUBSTR;
2470 data->flags |= SCF_SEEN_ACCEPT;
2473 if (data_fake.flags & SF_HAS_EVAL)
2474 data->flags |= SF_HAS_EVAL;
2475 data->whilem_c = data_fake.whilem_c;
2477 if (flags & SCF_DO_STCLASS)
2478 cl_or(pRExC_state, &accum, &this_class);
2480 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2482 if (flags & SCF_DO_SUBSTR) {
2483 data->pos_min += min1;
2484 data->pos_delta += max1 - min1;
2485 if (max1 != min1 || is_inf)
2486 data->longest = &(data->longest_float);
2489 delta += max1 - min1;
2490 if (flags & SCF_DO_STCLASS_OR) {
2491 cl_or(pRExC_state, data->start_class, &accum);
2493 cl_and(data->start_class, and_withp);
2494 flags &= ~SCF_DO_STCLASS;
2497 else if (flags & SCF_DO_STCLASS_AND) {
2499 cl_and(data->start_class, &accum);
2500 flags &= ~SCF_DO_STCLASS;
2503 /* Switch to OR mode: cache the old value of
2504 * data->start_class */
2506 StructCopy(data->start_class, and_withp,
2507 struct regnode_charclass_class);
2508 flags &= ~SCF_DO_STCLASS_AND;
2509 StructCopy(&accum, data->start_class,
2510 struct regnode_charclass_class);
2511 flags |= SCF_DO_STCLASS_OR;
2512 data->start_class->flags |= ANYOF_EOS;
2516 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2519 Assuming this was/is a branch we are dealing with: 'scan' now
2520 points at the item that follows the branch sequence, whatever
2521 it is. We now start at the beginning of the sequence and look
2528 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2530 If we can find such a subseqence we need to turn the first
2531 element into a trie and then add the subsequent branch exact
2532 strings to the trie.
2536 1. patterns where the whole set of branch can be converted.
2538 2. patterns where only a subset can be converted.
2540 In case 1 we can replace the whole set with a single regop
2541 for the trie. In case 2 we need to keep the start and end
2544 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2545 becomes BRANCH TRIE; BRANCH X;
2547 There is an additional case, that being where there is a
2548 common prefix, which gets split out into an EXACT like node
2549 preceding the TRIE node.
2551 If x(1..n)==tail then we can do a simple trie, if not we make
2552 a "jump" trie, such that when we match the appropriate word
2553 we "jump" to the appopriate tail node. Essentailly we turn
2554 a nested if into a case structure of sorts.
2559 if (!re_trie_maxbuff) {
2560 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2561 if (!SvIOK(re_trie_maxbuff))
2562 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2564 if ( SvIV(re_trie_maxbuff)>=0 ) {
2566 regnode *first = (regnode *)NULL;
2567 regnode *last = (regnode *)NULL;
2568 regnode *tail = scan;
2573 SV * const mysv = sv_newmortal(); /* for dumping */
2575 /* var tail is used because there may be a TAIL
2576 regop in the way. Ie, the exacts will point to the
2577 thing following the TAIL, but the last branch will
2578 point at the TAIL. So we advance tail. If we
2579 have nested (?:) we may have to move through several
2583 while ( OP( tail ) == TAIL ) {
2584 /* this is the TAIL generated by (?:) */
2585 tail = regnext( tail );
2590 regprop(RExC_rx, mysv, tail );
2591 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2592 (int)depth * 2 + 2, "",
2593 "Looking for TRIE'able sequences. Tail node is: ",
2594 SvPV_nolen_const( mysv )
2600 step through the branches, cur represents each
2601 branch, noper is the first thing to be matched
2602 as part of that branch and noper_next is the
2603 regnext() of that node. if noper is an EXACT
2604 and noper_next is the same as scan (our current
2605 position in the regex) then the EXACT branch is
2606 a possible optimization target. Once we have
2607 two or more consequetive such branches we can
2608 create a trie of the EXACT's contents and stich
2609 it in place. If the sequence represents all of
2610 the branches we eliminate the whole thing and
2611 replace it with a single TRIE. If it is a
2612 subsequence then we need to stitch it in. This
2613 means the first branch has to remain, and needs
2614 to be repointed at the item on the branch chain
2615 following the last branch optimized. This could
2616 be either a BRANCH, in which case the
2617 subsequence is internal, or it could be the
2618 item following the branch sequence in which
2619 case the subsequence is at the end.
2623 /* dont use tail as the end marker for this traverse */
2624 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2625 regnode * const noper = NEXTOPER( cur );
2626 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
2627 regnode * const noper_next = regnext( noper );
2631 regprop(RExC_rx, mysv, cur);
2632 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2633 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2635 regprop(RExC_rx, mysv, noper);
2636 PerlIO_printf( Perl_debug_log, " -> %s",
2637 SvPV_nolen_const(mysv));
2640 regprop(RExC_rx, mysv, noper_next );
2641 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2642 SvPV_nolen_const(mysv));
2644 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2645 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2647 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2648 : PL_regkind[ OP( noper ) ] == EXACT )
2649 || OP(noper) == NOTHING )
2651 && noper_next == tail
2656 if ( !first || optype == NOTHING ) {
2657 if (!first) first = cur;
2658 optype = OP( noper );
2664 make_trie( pRExC_state,
2665 startbranch, first, cur, tail, count,
2668 if ( PL_regkind[ OP( noper ) ] == EXACT
2670 && noper_next == tail
2675 optype = OP( noper );
2685 regprop(RExC_rx, mysv, cur);
2686 PerlIO_printf( Perl_debug_log,
2687 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2688 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2692 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
2693 #ifdef TRIE_STUDY_OPT
2694 if ( ((made == MADE_EXACT_TRIE &&
2695 startbranch == first)
2696 || ( first_non_open == first )) &&
2698 flags |= SCF_TRIE_RESTUDY;
2699 if ( startbranch == first
2702 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2712 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2713 scan = NEXTOPER(NEXTOPER(scan));
2714 } else /* single branch is optimized. */
2715 scan = NEXTOPER(scan);
2717 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2718 scan_frame *newframe = NULL;
2723 if (OP(scan) != SUSPEND) {
2724 /* set the pointer */
2725 if (OP(scan) == GOSUB) {
2727 RExC_recurse[ARG2L(scan)] = scan;
2728 start = RExC_open_parens[paren-1];
2729 end = RExC_close_parens[paren-1];
2732 start = RExC_rxi->program + 1;
2736 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2737 SAVEFREEPV(recursed);
2739 if (!PAREN_TEST(recursed,paren+1)) {
2740 PAREN_SET(recursed,paren+1);
2741 Newx(newframe,1,scan_frame);
2743 if (flags & SCF_DO_SUBSTR) {
2744 scan_commit(pRExC_state,data,minlenp);
2745 data->longest = &(data->longest_float);
2747 is_inf = is_inf_internal = 1;
2748 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2749 cl_anything(pRExC_state, data->start_class);
2750 flags &= ~SCF_DO_STCLASS;
2753 Newx(newframe,1,scan_frame);
2756 end = regnext(scan);
2761 SAVEFREEPV(newframe);
2762 newframe->next = regnext(scan);
2763 newframe->last = last;
2764 newframe->stop = stopparen;
2765 newframe->prev = frame;
2775 else if (OP(scan) == EXACT) {
2776 I32 l = STR_LEN(scan);
2779 const U8 * const s = (U8*)STRING(scan);
2780 l = utf8_length(s, s + l);
2781 uc = utf8_to_uvchr(s, NULL);
2783 uc = *((U8*)STRING(scan));
2786 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2787 /* The code below prefers earlier match for fixed
2788 offset, later match for variable offset. */
2789 if (data->last_end == -1) { /* Update the start info. */
2790 data->last_start_min = data->pos_min;
2791 data->last_start_max = is_inf
2792 ? I32_MAX : data->pos_min + data->pos_delta;
2794 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2796 SvUTF8_on(data->last_found);
2798 SV * const sv = data->last_found;
2799 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2800 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2801 if (mg && mg->mg_len >= 0)
2802 mg->mg_len += utf8_length((U8*)STRING(scan),
2803 (U8*)STRING(scan)+STR_LEN(scan));
2805 data->last_end = data->pos_min + l;
2806 data->pos_min += l; /* As in the first entry. */
2807 data->flags &= ~SF_BEFORE_EOL;
2809 if (flags & SCF_DO_STCLASS_AND) {
2810 /* Check whether it is compatible with what we know already! */
2814 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2815 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2816 && (!(data->start_class->flags & ANYOF_FOLD)
2817 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2820 ANYOF_CLASS_ZERO(data->start_class);
2821 ANYOF_BITMAP_ZERO(data->start_class);
2823 ANYOF_BITMAP_SET(data->start_class, uc);
2824 data->start_class->flags &= ~ANYOF_EOS;
2826 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2828 else if (flags & SCF_DO_STCLASS_OR) {
2829 /* false positive possible if the class is case-folded */
2831 ANYOF_BITMAP_SET(data->start_class, uc);
2833 data->start_class->flags |= ANYOF_UNICODE_ALL;
2834 data->start_class->flags &= ~ANYOF_EOS;
2835 cl_and(data->start_class, and_withp);
2837 flags &= ~SCF_DO_STCLASS;
2839 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
2840 I32 l = STR_LEN(scan);
2841 UV uc = *((U8*)STRING(scan));
2843 /* Search for fixed substrings supports EXACT only. */
2844 if (flags & SCF_DO_SUBSTR) {
2846 scan_commit(pRExC_state, data, minlenp);
2849 const U8 * const s = (U8 *)STRING(scan);
2850 l = utf8_length(s, s + l);
2851 uc = utf8_to_uvchr(s, NULL);
2854 if (flags & SCF_DO_SUBSTR)
2856 if (flags & SCF_DO_STCLASS_AND) {
2857 /* Check whether it is compatible with what we know already! */
2861 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2862 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2863 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2865 ANYOF_CLASS_ZERO(data->start_class);
2866 ANYOF_BITMAP_ZERO(data->start_class);
2868 ANYOF_BITMAP_SET(data->start_class, uc);
2869 data->start_class->flags &= ~ANYOF_EOS;
2870 data->start_class->flags |= ANYOF_FOLD;
2871 if (OP(scan) == EXACTFL)
2872 data->start_class->flags |= ANYOF_LOCALE;
2875 else if (flags & SCF_DO_STCLASS_OR) {
2876 if (data->start_class->flags & ANYOF_FOLD) {
2877 /* false positive possible if the class is case-folded.
2878 Assume that the locale settings are the same... */
2880 ANYOF_BITMAP_SET(data->start_class, uc);
2881 data->start_class->flags &= ~ANYOF_EOS;
2883 cl_and(data->start_class, and_withp);
2885 flags &= ~SCF_DO_STCLASS;
2887 else if (strchr((const char*)PL_varies,OP(scan))) {
2888 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2889 I32 f = flags, pos_before = 0;
2890 regnode * const oscan = scan;
2891 struct regnode_charclass_class this_class;
2892 struct regnode_charclass_class *oclass = NULL;
2893 I32 next_is_eval = 0;
2895 switch (PL_regkind[OP(scan)]) {
2896 case WHILEM: /* End of (?:...)* . */
2897 scan = NEXTOPER(scan);
2900 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2901 next = NEXTOPER(scan);
2902 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2904 maxcount = REG_INFTY;
2905 next = regnext(scan);
2906 scan = NEXTOPER(scan);
2910 if (flags & SCF_DO_SUBSTR)
2915 if (flags & SCF_DO_STCLASS) {
2917 maxcount = REG_INFTY;
2918 next = regnext(scan);
2919 scan = NEXTOPER(scan);
2922 is_inf = is_inf_internal = 1;
2923 scan = regnext(scan);
2924 if (flags & SCF_DO_SUBSTR) {
2925 scan_commit(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
2926 data->longest = &(data->longest_float);
2928 goto optimize_curly_tail;
2930 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
2931 && (scan->flags == stopparen))
2936 mincount = ARG1(scan);
2937 maxcount = ARG2(scan);
2939 next = regnext(scan);
2940 if (OP(scan) == CURLYX) {
2941 I32 lp = (data ? *(data->last_closep) : 0);
2942 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
2944 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2945 next_is_eval = (OP(scan) == EVAL);
2947 if (flags & SCF_DO_SUBSTR) {
2948 if (mincount == 0) scan_commit(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
2949 pos_before = data->pos_min;
2953 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2955 data->flags |= SF_IS_INF;
2957 if (flags & SCF_DO_STCLASS) {
2958 cl_init(pRExC_state, &this_class);
2959 oclass = data->start_class;
2960 data->start_class = &this_class;
2961 f |= SCF_DO_STCLASS_AND;
2962 f &= ~SCF_DO_STCLASS_OR;
2964 /* These are the cases when once a subexpression
2965 fails at a particular position, it cannot succeed
2966 even after backtracking at the enclosing scope.
2968 XXXX what if minimal match and we are at the
2969 initial run of {n,m}? */
2970 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2971 f &= ~SCF_WHILEM_VISITED_POS;
2973 /* This will finish on WHILEM, setting scan, or on NULL: */
2974 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2975 last, data, stopparen, recursed, NULL,
2977 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
2979 if (flags & SCF_DO_STCLASS)
2980 data->start_class = oclass;
2981 if (mincount == 0 || minnext == 0) {
2982 if (flags & SCF_DO_STCLASS_OR) {
2983 cl_or(pRExC_state, data->start_class, &this_class);
2985 else if (flags & SCF_DO_STCLASS_AND) {
2986 /* Switch to OR mode: cache the old value of
2987 * data->start_class */
2989 StructCopy(data->start_class, and_withp,
2990 struct regnode_charclass_class);
2991 flags &= ~SCF_DO_STCLASS_AND;
2992 StructCopy(&this_class, data->start_class,
2993 struct regnode_charclass_class);
2994 flags |= SCF_DO_STCLASS_OR;
2995 data->start_class->flags |= ANYOF_EOS;
2997 } else { /* Non-zero len */
2998 if (flags & SCF_DO_STCLASS_OR) {
2999 cl_or(pRExC_state, data->start_class, &this_class);
3000 cl_and(data->start_class, and_withp);
3002 else if (flags & SCF_DO_STCLASS_AND)
3003 cl_and(data->start_class, &this_class);
3004 flags &= ~SCF_DO_STCLASS;
3006 if (!scan) /* It was not CURLYX, but CURLY. */
3008 if ( /* ? quantifier ok, except for (?{ ... }) */
3009 (next_is_eval || !(mincount == 0 && maxcount == 1))
3010 && (minnext == 0) && (deltanext == 0)
3011 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3012 && maxcount <= REG_INFTY/3 /* Complement check for big count */
3013 && ckWARN(WARN_REGEXP))
3016 "Quantifier unexpected on zero-length expression");
3019 min += minnext * mincount;
3020 is_inf_internal |= ((maxcount == REG_INFTY
3021 && (minnext + deltanext) > 0)
3022 || deltanext == I32_MAX);
3023 is_inf |= is_inf_internal;
3024 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3026 /* Try powerful optimization CURLYX => CURLYN. */
3027 if ( OP(oscan) == CURLYX && data
3028 && data->flags & SF_IN_PAR
3029 && !(data->flags & SF_HAS_EVAL)
3030 && !deltanext && minnext == 1 ) {
3031 /* Try to optimize to CURLYN. */
3032 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3033 regnode * const nxt1 = nxt;
3040 if (!strchr((const char*)PL_simple,OP(nxt))
3041 && !(PL_regkind[OP(nxt)] == EXACT
3042 && STR_LEN(nxt) == 1))
3048 if (OP(nxt) != CLOSE)
3050 if (RExC_open_parens) {
3051 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3052 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3054 /* Now we know that nxt2 is the only contents: */
3055 oscan->flags = (U8)ARG(nxt);
3057 OP(nxt1) = NOTHING; /* was OPEN. */
3060 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3061 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
3062 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
3063 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3064 OP(nxt + 1) = OPTIMIZED; /* was count. */
3065 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
3070 /* Try optimization CURLYX => CURLYM. */
3071 if ( OP(oscan) == CURLYX && data
3072 && !(data->flags & SF_HAS_PAR)
3073 && !(data->flags & SF_HAS_EVAL)
3074 && !deltanext /* atom is fixed width */
3075 && minnext != 0 /* CURLYM can't handle zero width */
3077 /* XXXX How to optimize if data == 0? */
3078 /* Optimize to a simpler form. */
3079 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3083 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3084 && (OP(nxt2) != WHILEM))
3086 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3087 /* Need to optimize away parenths. */
3088 if (data->flags & SF_IN_PAR) {
3089 /* Set the parenth number. */
3090 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3092 if (OP(nxt) != CLOSE)
3093 FAIL("Panic opt close");
3094 oscan->flags = (U8)ARG(nxt);
3095 if (RExC_open_parens) {
3096 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3097 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3099 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3100 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3103 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3104 OP(nxt + 1) = OPTIMIZED; /* was count. */
3105 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3106 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
3109 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3110 regnode *nnxt = regnext(nxt1);
3113 if (reg_off_by_arg[OP(nxt1)])
3114 ARG_SET(nxt1, nxt2 - nxt1);
3115 else if (nxt2 - nxt1 < U16_MAX)
3116 NEXT_OFF(nxt1) = nxt2 - nxt1;
3118 OP(nxt) = NOTHING; /* Cannot beautify */
3123 /* Optimize again: */
3124 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3125 NULL, stopparen, recursed, NULL, 0,depth+1);
3130 else if ((OP(oscan) == CURLYX)
3131 && (flags & SCF_WHILEM_VISITED_POS)
3132 /* See the comment on a similar expression above.
3133 However, this time it not a subexpression
3134 we care about, but the expression itself. */
3135 && (maxcount == REG_INFTY)
3136 && data && ++data->whilem_c < 16) {
3137 /* This stays as CURLYX, we can put the count/of pair. */
3138 /* Find WHILEM (as in regexec.c) */
3139 regnode *nxt = oscan + NEXT_OFF(oscan);
3141 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3143 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3144 | (RExC_whilem_seen << 4)); /* On WHILEM */
3146 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3148 if (flags & SCF_DO_SUBSTR) {
3149 SV *last_str = NULL;
3150 int counted = mincount != 0;
3152 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3153 #if defined(SPARC64_GCC_WORKAROUND)
3156 const char *s = NULL;
3159 if (pos_before >= data->last_start_min)
3162 b = data->last_start_min;
3165 s = SvPV_const(data->last_found, l);
3166 old = b - data->last_start_min;
3169 I32 b = pos_before >= data->last_start_min
3170 ? pos_before : data->last_start_min;
3172 const char * const s = SvPV_const(data->last_found, l);
3173 I32 old = b - data->last_start_min;
3177 old = utf8_hop((U8*)s, old) - (U8*)s;
3180 /* Get the added string: */
3181 last_str = newSVpvn(s + old, l);
3183 SvUTF8_on(last_str);
3184 if (deltanext == 0 && pos_before == b) {
3185 /* What was added is a constant string */
3187 SvGROW(last_str, (mincount * l) + 1);
3188 repeatcpy(SvPVX(last_str) + l,
3189 SvPVX_const(last_str), l, mincount - 1);
3190 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3191 /* Add additional parts. */
3192 SvCUR_set(data->last_found,
3193 SvCUR(data->last_found) - l);
3194 sv_catsv(data->last_found, last_str);
3196 SV * sv = data->last_found;
3198 SvUTF8(sv) && SvMAGICAL(sv) ?
3199 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3200 if (mg && mg->mg_len >= 0)
3201 mg->mg_len += CHR_SVLEN(last_str);
3203 data->last_end += l * (mincount - 1);
3206 /* start offset must point into the last copy */
3207 data->last_start_min += minnext * (mincount - 1);
3208 data->last_start_max += is_inf ? I32_MAX
3209 : (maxcount - 1) * (minnext + data->pos_delta);
3212 /* It is counted once already... */
3213 data->pos_min += minnext * (mincount - counted);
3214 data->pos_delta += - counted * deltanext +
3215 (minnext + deltanext) * maxcount - minnext * mincount;
3216 if (mincount != maxcount) {
3217 /* Cannot extend fixed substrings found inside
3219 scan_commit(pRExC_state,data,minlenp);
3220 if (mincount && last_str) {
3221 SV * const sv = data->last_found;
3222 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3223 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3227 sv_setsv(sv, last_str);
3228 data->last_end = data->pos_min;
3229 data->last_start_min =
3230 data->pos_min - CHR_SVLEN(last_str);
3231 data->last_start_max = is_inf
3233 : data->pos_min + data->pos_delta
3234 - CHR_SVLEN(last_str);
3236 data->longest = &(data->longest_float);
3238 SvREFCNT_dec(last_str);
3240 if (data && (fl & SF_HAS_EVAL))
3241 data->flags |= SF_HAS_EVAL;
3242 optimize_curly_tail:
3243 if (OP(oscan) != CURLYX) {
3244 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3246 NEXT_OFF(oscan) += NEXT_OFF(next);
3249 default: /* REF and CLUMP only? */
3250 if (flags & SCF_DO_SUBSTR) {
3251 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
3252 data->longest = &(data->longest_float);
3254 is_inf = is_inf_internal = 1;
3255 if (flags & SCF_DO_STCLASS_OR)
3256 cl_anything(pRExC_state, data->start_class);
3257 flags &= ~SCF_DO_STCLASS;
3261 else if (strchr((const char*)PL_simple,OP(scan))) {
3264 if (flags & SCF_DO_SUBSTR) {
3265 scan_commit(pRExC_state,data,minlenp);
3269 if (flags & SCF_DO_STCLASS) {
3270 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3272 /* Some of the logic below assumes that switching
3273 locale on will only add false positives. */
3274 switch (PL_regkind[OP(scan)]) {
3278 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3279 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3280 cl_anything(pRExC_state, data->start_class);
3283 if (OP(scan) == SANY)
3285 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3286 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3287 || (data->start_class->flags & ANYOF_CLASS));
3288 cl_anything(pRExC_state, data->start_class);
3290 if (flags & SCF_DO_STCLASS_AND || !value)
3291 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3294 if (flags & SCF_DO_STCLASS_AND)
3295 cl_and(data->start_class,
3296 (struct regnode_charclass_class*)scan);
3298 cl_or(pRExC_state, data->start_class,
3299 (struct regnode_charclass_class*)scan);
3302 if (flags & SCF_DO_STCLASS_AND) {
3303 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3304 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3305 for (value = 0; value < 256; value++)
3306 if (!isALNUM(value))
3307 ANYOF_BITMAP_CLEAR(data->start_class, value);
3311 if (data->start_class->flags & ANYOF_LOCALE)
3312 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3314 for (value = 0; value < 256; value++)
3316 ANYOF_BITMAP_SET(data->start_class, value);
3321 if (flags & SCF_DO_STCLASS_AND) {
3322 if (data->start_class->flags & ANYOF_LOCALE)
3323 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3326 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3327 data->start_class->flags |= ANYOF_LOCALE;
3331 if (flags & SCF_DO_STCLASS_AND) {
3332 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3333 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3334 for (value = 0; value < 256; value++)
3336 ANYOF_BITMAP_CLEAR(data->start_class, value);
3340 if (data->start_class->flags & ANYOF_LOCALE)
3341 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3343 for (value = 0; value < 256; value++)
3344 if (!isALNUM(value))
3345 ANYOF_BITMAP_SET(data->start_class, value);
3350 if (flags & SCF_DO_STCLASS_AND) {
3351 if (data->start_class->flags & ANYOF_LOCALE)
3352 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3355 data->start_class->flags |= ANYOF_LOCALE;
3356 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3360 if (flags & SCF_DO_STCLASS_AND) {
3361 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3362 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3363 for (value = 0; value < 256; value++)
3364 if (!isSPACE(value))
3365 ANYOF_BITMAP_CLEAR(data->start_class, value);
3369 if (data->start_class->flags & ANYOF_LOCALE)
3370 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3372 for (value = 0; value < 256; value++)
3374 ANYOF_BITMAP_SET(data->start_class, value);
3379 if (flags & SCF_DO_STCLASS_AND) {
3380 if (data->start_class->flags & ANYOF_LOCALE)
3381 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3384 data->start_class->flags |= ANYOF_LOCALE;
3385 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3389 if (flags & SCF_DO_STCLASS_AND) {
3390 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3391 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3392 for (value = 0; value < 256; value++)
3394 ANYOF_BITMAP_CLEAR(data->start_class, value);
3398 if (data->start_class->flags & ANYOF_LOCALE)
3399 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3401 for (value = 0; value < 256; value++)
3402 if (!isSPACE(value))
3403 ANYOF_BITMAP_SET(data->start_class, value);
3408 if (flags & SCF_DO_STCLASS_AND) {
3409 if (data->start_class->flags & ANYOF_LOCALE) {
3410 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3411 for (value = 0; value < 256; value++)
3412 if (!isSPACE(value))
3413 ANYOF_BITMAP_CLEAR(data->start_class, value);
3417 data->start_class->flags |= ANYOF_LOCALE;
3418 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3422 if (flags & SCF_DO_STCLASS_AND) {
3423 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3424 for (value = 0; value < 256; value++)
3425 if (!isDIGIT(value))
3426 ANYOF_BITMAP_CLEAR(data->start_class, value);
3429 if (data->start_class->flags & ANYOF_LOCALE)
3430 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3432 for (value = 0; value < 256; value++)
3434 ANYOF_BITMAP_SET(data->start_class, value);
3439 if (flags & SCF_DO_STCLASS_AND) {
3440 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3441 for (value = 0; value < 256; value++)
3443 ANYOF_BITMAP_CLEAR(data->start_class, value);
3446 if (data->start_class->flags & ANYOF_LOCALE)
3447 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3449 for (value = 0; value < 256; value++)
3450 if (!isDIGIT(value))
3451 ANYOF_BITMAP_SET(data->start_class, value);
3456 if (flags & SCF_DO_STCLASS_OR)
3457 cl_and(data->start_class, and_withp);
3458 flags &= ~SCF_DO_STCLASS;
3461 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3462 data->flags |= (OP(scan) == MEOL
3466 else if ( PL_regkind[OP(scan)] == BRANCHJ
3467 /* Lookbehind, or need to calculate parens/evals/stclass: */
3468 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3469 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3470 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3471 || OP(scan) == UNLESSM )
3473 /* Negative Lookahead/lookbehind
3474 In this case we can't do fixed string optimisation.
3477 I32 deltanext, minnext, fake = 0;
3479 struct regnode_charclass_class intrnl;
3482 data_fake.flags = 0;
3484 data_fake.whilem_c = data->whilem_c;
3485 data_fake.last_closep = data->last_closep;
3488 data_fake.last_closep = &fake;
3489 data_fake.pos_delta = delta;
3490 if ( flags & SCF_DO_STCLASS && !scan->flags
3491 && OP(scan) == IFMATCH ) { /* Lookahead */
3492 cl_init(pRExC_state, &intrnl);
3493 data_fake.start_class = &intrnl;
3494 f |= SCF_DO_STCLASS_AND;
3496 if (flags & SCF_WHILEM_VISITED_POS)
3497 f |= SCF_WHILEM_VISITED_POS;
3498 next = regnext(scan);
3499 nscan = NEXTOPER(NEXTOPER(scan));
3500 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3501 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3504 FAIL("Variable length lookbehind not implemented");
3506 else if (minnext > (I32)U8_MAX) {
3507 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3509 scan->flags = (U8)minnext;
3512 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3514 if (data_fake.flags & SF_HAS_EVAL)
3515 data->flags |= SF_HAS_EVAL;
3516 data->whilem_c = data_fake.whilem_c;
3518 if (f & SCF_DO_STCLASS_AND) {
3519 const int was = (data->start_class->flags & ANYOF_EOS);
3521 cl_and(data->start_class, &intrnl);
3523 data->start_class->flags |= ANYOF_EOS;
3526 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3528 /* Positive Lookahead/lookbehind
3529 In this case we can do fixed string optimisation,
3530 but we must be careful about it. Note in the case of
3531 lookbehind the positions will be offset by the minimum
3532 length of the pattern, something we won't know about
3533 until after the recurse.
3535 I32 deltanext, fake = 0;
3537 struct regnode_charclass_class intrnl;
3539 /* We use SAVEFREEPV so that when the full compile
3540 is finished perl will clean up the allocated
3541 minlens when its all done. This was we don't
3542 have to worry about freeing them when we know
3543 they wont be used, which would be a pain.
3546 Newx( minnextp, 1, I32 );
3547 SAVEFREEPV(minnextp);
3550 StructCopy(data, &data_fake, scan_data_t);
3551 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3554 scan_commit(pRExC_state, &data_fake,minlenp);
3555 data_fake.last_found=newSVsv(data->last_found);
3559 data_fake.last_closep = &fake;
3560 data_fake.flags = 0;
3561 data_fake.pos_delta = delta;
3563 data_fake.flags |= SF_IS_INF;
3564 if ( flags & SCF_DO_STCLASS && !scan->flags
3565 && OP(scan) == IFMATCH ) { /* Lookahead */
3566 cl_init(pRExC_state, &intrnl);
3567 data_fake.start_class = &intrnl;
3568 f |= SCF_DO_STCLASS_AND;
3570 if (flags & SCF_WHILEM_VISITED_POS)
3571 f |= SCF_WHILEM_VISITED_POS;
3572 next = regnext(scan);
3573 nscan = NEXTOPER(NEXTOPER(scan));
3575 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3576 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3579 FAIL("Variable length lookbehind not implemented");
3581 else if (*minnextp > (I32)U8_MAX) {
3582 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3584 scan->flags = (U8)*minnextp;
3589 if (f & SCF_DO_STCLASS_AND) {
3590 const int was = (data->start_class->flags & ANYOF_EOS);
3592 cl_and(data->start_class, &intrnl);
3594 data->start_class->flags |= ANYOF_EOS;
3597 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3599 if (data_fake.flags & SF_HAS_EVAL)
3600 data->flags |= SF_HAS_EVAL;
3601 data->whilem_c = data_fake.whilem_c;
3602 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3603 if (RExC_rx->minlen<*minnextp)
3604 RExC_rx->minlen=*minnextp;
3605 scan_commit(pRExC_state, &data_fake, minnextp);
3606 SvREFCNT_dec(data_fake.last_found);
3608 if ( data_fake.minlen_fixed != minlenp )
3610 data->offset_fixed= data_fake.offset_fixed;
3611 data->minlen_fixed= data_fake.minlen_fixed;
3612 data->lookbehind_fixed+= scan->flags;
3614 if ( data_fake.minlen_float != minlenp )
3616 data->minlen_float= data_fake.minlen_float;
3617 data->offset_float_min=data_fake.offset_float_min;
3618 data->offset_float_max=data_fake.offset_float_max;
3619 data->lookbehind_float+= scan->flags;
3628 else if (OP(scan) == OPEN) {
3629 if (stopparen != (I32)ARG(scan))
3632 else if (OP(scan) == CLOSE) {
3633 if (stopparen == (I32)ARG(scan)) {
3636 if ((I32)ARG(scan) == is_par) {
3637 next = regnext(scan);
3639 if ( next && (OP(next) != WHILEM) && next < last)
3640 is_par = 0; /* Disable optimization */
3643 *(data->last_closep) = ARG(scan);
3645 else if (OP(scan) == EVAL) {
3647 data->flags |= SF_HAS_EVAL;
3649 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3650 if (flags & SCF_DO_SUBSTR) {
3651 scan_commit(pRExC_state,data,minlenp);
3652 flags &= ~SCF_DO_SUBSTR;
3654 if (data && OP(scan)==ACCEPT) {
3655 data->flags |= SCF_SEEN_ACCEPT;
3660 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3662 if (flags & SCF_DO_SUBSTR) {
3663 scan_commit(pRExC_state,data,minlenp);
3664 data->longest = &(data->longest_float);
3666 is_inf = is_inf_internal = 1;
3667 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3668 cl_anything(pRExC_state, data->start_class);
3669 flags &= ~SCF_DO_STCLASS;
3671 else if (OP(scan) == GPOS) {
3672 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
3673 !(delta || is_inf || (data && data->pos_delta)))
3675 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
3676 RExC_rx->extflags |= RXf_ANCH_GPOS;
3677 if (RExC_rx->gofs < (U32)min)
3678 RExC_rx->gofs = min;
3680 RExC_rx->extflags |= RXf_GPOS_FLOAT;
3684 #ifdef TRIE_STUDY_OPT
3685 #ifdef FULL_TRIE_STUDY
3686 else if (PL_regkind[OP(scan)] == TRIE) {
3687 /* NOTE - There is similar code to this block above for handling
3688 BRANCH nodes on the initial study. If you change stuff here
3690 regnode *trie_node= scan;
3691 regnode *tail= regnext(scan);
3692 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
3693 I32 max1 = 0, min1 = I32_MAX;
3694 struct regnode_charclass_class accum;
3696 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
3697 scan_commit(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
3698 if (flags & SCF_DO_STCLASS)
3699 cl_init_zero(pRExC_state, &accum);
3705 const regnode *nextbranch= NULL;
3708 for ( word=1 ; word <= trie->wordcount ; word++)
3710 I32 deltanext=0, minnext=0, f = 0, fake;
3711 struct regnode_charclass_class this_class;
3713 data_fake.flags = 0;
3715 data_fake.whilem_c = data->whilem_c;
3716 data_fake.last_closep = data->last_closep;
3719 data_fake.last_closep = &fake;
3720 data_fake.pos_delta = delta;
3721 if (flags & SCF_DO_STCLASS) {
3722 cl_init(pRExC_state, &this_class);
3723 data_fake.start_class = &this_class;
3724 f = SCF_DO_STCLASS_AND;
3726 if (flags & SCF_WHILEM_VISITED_POS)
3727 f |= SCF_WHILEM_VISITED_POS;
3729 if (trie->jump[word]) {
3731 nextbranch = trie_node + trie->jump[0];
3732 scan= trie_node + trie->jump[word];
3733 /* We go from the jump point to the branch that follows
3734 it. Note this means we need the vestigal unused branches
3735 even though they arent otherwise used.
3737 minnext = study_chunk(pRExC_state, &scan, minlenp,
3738 &deltanext, (regnode *)nextbranch, &data_fake,
3739 stopparen, recursed, NULL, f,depth+1);
3741 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3742 nextbranch= regnext((regnode*)nextbranch);
3744 if (min1 > (I32)(minnext + trie->minlen))
3745 min1 = minnext + trie->minlen;
3746 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3747 max1 = minnext + deltanext + trie->maxlen;
3748 if (deltanext == I32_MAX)
3749 is_inf = is_inf_internal = 1;
3751 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3753 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3754 if ( stopmin > min + min1)
3755 stopmin = min + min1;
3756 flags &= ~SCF_DO_SUBSTR;
3758 data->flags |= SCF_SEEN_ACCEPT;
3761 if (data_fake.flags & SF_HAS_EVAL)
3762 data->flags |= SF_HAS_EVAL;
3763 data->whilem_c = data_fake.whilem_c;
3765 if (flags & SCF_DO_STCLASS)
3766 cl_or(pRExC_state, &accum, &this_class);
3769 if (flags & SCF_DO_SUBSTR) {
3770 data->pos_min += min1;
3771 data->pos_delta += max1 - min1;
3772 if (max1 != min1 || is_inf)
3773 data->longest = &(data->longest_float);
3776 delta += max1 - min1;
3777 if (flags & SCF_DO_STCLASS_OR) {
3778 cl_or(pRExC_state, data->start_class, &accum);
3780 cl_and(data->start_class, and_withp);
3781 flags &= ~SCF_DO_STCLASS;
3784 else if (flags & SCF_DO_STCLASS_AND) {
3786 cl_and(data->start_class, &accum);
3787 flags &= ~SCF_DO_STCLASS;
3790 /* Switch to OR mode: cache the old value of
3791 * data->start_class */
3793 StructCopy(data->start_class, and_withp,
3794 struct regnode_charclass_class);
3795 flags &= ~SCF_DO_STCLASS_AND;
3796 StructCopy(&accum, data->start_class,
3797 struct regnode_charclass_class);
3798 flags |= SCF_DO_STCLASS_OR;
3799 data->start_class->flags |= ANYOF_EOS;
3806 else if (PL_regkind[OP(scan)] == TRIE) {
3807 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
3810 min += trie->minlen;
3811 delta += (trie->maxlen - trie->minlen);
3812 flags &= ~SCF_DO_STCLASS; /* xxx */
3813 if (flags & SCF_DO_SUBSTR) {
3814 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
3815 data->pos_min += trie->minlen;
3816 data->pos_delta += (trie->maxlen - trie->minlen);
3817 if (trie->maxlen != trie->minlen)
3818 data->longest = &(data->longest_float);
3820 if (trie->jump) /* no more substrings -- for now /grr*/
3821 flags &= ~SCF_DO_SUBSTR;
3823 #endif /* old or new */
3824 #endif /* TRIE_STUDY_OPT */
3825 /* Else: zero-length, ignore. */
3826 scan = regnext(scan);
3831 stopparen = frame->stop;
3832 frame = frame->prev;
3833 goto fake_study_recurse;
3840 *deltap = is_inf_internal ? I32_MAX : delta;
3841 if (flags & SCF_DO_SUBSTR && is_inf)
3842 data->pos_delta = I32_MAX - data->pos_min;
3843 if (is_par > (I32)U8_MAX)
3845 if (is_par && pars==1 && data) {
3846 data->flags |= SF_IN_PAR;
3847 data->flags &= ~SF_HAS_PAR;
3849 else if (pars && data) {
3850 data->flags |= SF_HAS_PAR;
3851 data->flags &= ~SF_IN_PAR;
3853 if (flags & SCF_DO_STCLASS_OR)
3854 cl_and(data->start_class, and_withp);
3855 if (flags & SCF_TRIE_RESTUDY)
3856 data->flags |= SCF_TRIE_RESTUDY;
3858 DEBUG_STUDYDATA(data,depth);
3860 return min < stopmin ? min : stopmin;
3864 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
3866 if (RExC_rxi->data) {
3867 const U32 count = RExC_rxi->data->count;
3868 Renewc(RExC_rxi->data,
3869 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
3870 char, struct reg_data);
3871 Renew(RExC_rxi->data->what, count + n, U8);
3872 RExC_rxi->data->count += n;
3875 Newxc(RExC_rxi->data, sizeof(*RExC_rxi->data) + sizeof(void*) * (n - 1),
3876 char, struct reg_data);
3877 Newx(RExC_rxi->data->what, n, U8);
3878 RExC_rxi->data->count = n;
3880 Copy(s, RExC_rxi->data->what + RExC_rxi->data->count - n, n, U8);
3881 return RExC_rxi->data->count - n;
3884 #ifndef PERL_IN_XSUB_RE
3886 Perl_reginitcolors(pTHX)
3889 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
3891 char *t = savepv(s);
3895 t = strchr(t, '\t');
3901 PL_colors[i] = t = (char *)"";
3906 PL_colors[i++] = (char *)"";
3913 #ifdef TRIE_STUDY_OPT
3914 #define CHECK_RESTUDY_GOTO \
3916 (data.flags & SCF_TRIE_RESTUDY) \
3920 #define CHECK_RESTUDY_GOTO
3924 - pregcomp - compile a regular expression into internal code
3926 * We can't allocate space until we know how big the compiled form will be,
3927 * but we can't compile it (and thus know how big it is) until we've got a
3928 * place to put the code. So we cheat: we compile it twice, once with code
3929 * generation turned off and size counting turned on, and once "for real".
3930 * This also means that we don't allocate space until we are sure that the
3931 * thing really will compile successfully, and we never have to move the
3932 * code and thus invalidate pointers into it. (Note that it has to be in
3933 * one piece because free() must be able to free it all.) [NB: not true in perl]
3935 * Beware that the optimization-preparation code in here knows about some
3936 * of the structure of the compiled regexp. [I'll say.]
3941 #ifndef PERL_IN_XSUB_RE
3942 #define RE_ENGINE_PTR &PL_core_reg_engine
3944 extern const struct regexp_engine my_reg_engine;
3945 #define RE_ENGINE_PTR &my_reg_engine
3947 /* these make a few things look better, to avoid indentation */
3948 #define BEGIN_BLOCK {
3952 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
3955 GET_RE_DEBUG_FLAGS_DECL;
3956 DEBUG_r(if (!PL_colorset) reginitcolors());
3957 #ifndef PERL_IN_XSUB_RE
3959 /* Dispatch a request to compile a regexp to correct
3961 HV * const table = GvHV(PL_hintgv);
3963 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
3964 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
3965 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
3967 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
3970 return CALLREGCOMP_ENG(eng, exp, xend, pm);
3977 register regexp_internal *ri;
3985 RExC_state_t RExC_state;
3986 RExC_state_t * const pRExC_state = &RExC_state;
3987 #ifdef TRIE_STUDY_OPT
3989 RExC_state_t copyRExC_state;
3992 FAIL("NULL regexp argument");
3994 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
3998 SV *dsv= sv_newmortal();
3999 RE_PV_QUOTED_DECL(s, RExC_utf8,
4000 dsv, RExC_precomp, (xend - exp), 60);
4001 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4002 PL_colors[4],PL_colors[5],s);
4004 RExC_flags = pm->op_pmflags;
4008 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4009 RExC_seen_evals = 0;
4012 /* First pass: determine size, legality. */
4021 RExC_emit = &PL_regdummy;
4022 RExC_whilem_seen = 0;
4023 RExC_charnames = NULL;
4024 RExC_open_parens = NULL;
4025 RExC_close_parens = NULL;
4027 RExC_paren_names = NULL;
4028 RExC_recurse = NULL;
4029 RExC_recurse_count = 0;
4031 #if 0 /* REGC() is (currently) a NOP at the first pass.
4032 * Clever compilers notice this and complain. --jhi */
4033 REGC((U8)REG_MAGIC, (char*)RExC_emit);
4035 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4036 if (reg(pRExC_state, 0, &flags,1) == NULL) {
4037 RExC_precomp = NULL;
4041 PerlIO_printf(Perl_debug_log,
4042 "Required size %"IVdf" nodes\n"
4043 "Starting second pass (creation)\n",
4046 RExC_lastparse=NULL;
4048 /* Small enough for pointer-storage convention?
4049 If extralen==0, this means that we will not need long jumps. */
4050 if (RExC_size >= 0x10000L && RExC_extralen)
4051 RExC_size += RExC_extralen;
4054 if (RExC_whilem_seen > 15)
4055 RExC_whilem_seen = 15;
4058 /* Make room for a sentinel value at the end of the program */
4062 /* Allocate space and zero-initialize. Note, the two step process
4063 of zeroing when in debug mode, thus anything assigned has to
4064 happen after that */
4065 Newxz(r, 1, regexp);
4066 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4067 char, regexp_internal);
4068 if ( r == NULL || ri == NULL )
4069 FAIL("Regexp out of space");
4071 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
4072 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
4074 /* bulk initialize base fields with 0. */
4075 Zero(ri, sizeof(regexp_internal), char);
4078 /* non-zero initialization begins here */
4080 r->engine= RE_ENGINE_PTR;
4082 r->prelen = xend - exp;
4083 r->precomp = savepvn(RExC_precomp, r->prelen);
4084 r->extflags = pm->op_pmflags & RXf_PMf_COMPILETIME;
4086 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4088 if (RExC_seen & REG_SEEN_RECURSE) {
4089 Newxz(RExC_open_parens, RExC_npar,regnode *);
4090 SAVEFREEPV(RExC_open_parens);
4091 Newxz(RExC_close_parens,RExC_npar,regnode *);
4092 SAVEFREEPV(RExC_close_parens);
4095 /* Useful during FAIL. */
4096 Newxz(ri->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4098 ri->offsets[0] = RExC_size;
4100 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
4101 "%s %"UVuf" bytes for offset annotations.\n",
4102 ri->offsets ? "Got" : "Couldn't get",
4103 (UV)((2*RExC_size+1) * sizeof(U32))));
4108 /* Second pass: emit code. */
4109 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
4115 RExC_emit_start = ri->program;
4116 RExC_emit = ri->program;
4118 /* put a sentinal on the end of the program so we can check for
4120 ri->program[RExC_size].type = 255;
4122 /* Store the count of eval-groups for security checks: */
4123 RExC_emit->next_off = (RExC_seen_evals > (I32)U16_MAX) ? U16_MAX : (U16)RExC_seen_evals;
4124 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
4125 if (reg(pRExC_state, 0, &flags,1) == NULL)
4128 /* XXXX To minimize changes to RE engine we always allocate
4129 3-units-long substrs field. */
4130 Newx(r->substrs, 1, struct reg_substr_data);
4131 if (RExC_recurse_count) {
4132 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4133 SAVEFREEPV(RExC_recurse);
4137 r->minlen = minlen = sawplus = sawopen = 0;
4138 Zero(r->substrs, 1, struct reg_substr_data);
4140 #ifdef TRIE_STUDY_OPT
4143 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
4145 RExC_state = copyRExC_state;
4146 if (seen & REG_TOP_LEVEL_BRANCHES)
4147 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4149 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
4150 if (data.last_found) {
4151 SvREFCNT_dec(data.longest_fixed);
4152 SvREFCNT_dec(data.longest_float);
4153 SvREFCNT_dec(data.last_found);
4155 StructCopy(&zero_scan_data, &data, scan_data_t);
4157 StructCopy(&zero_scan_data, &data, scan_data_t);
4158 copyRExC_state = RExC_state;
4161 StructCopy(&zero_scan_data, &data, scan_data_t);
4164 /* Dig out information for optimizations. */
4165 r->extflags = pm->op_pmflags & RXf_PMf_COMPILETIME; /* Again? */
4166 pm->op_pmflags = RExC_flags;
4168 r->extflags |= RXf_UTF8; /* Unicode in it? */
4169 ri->regstclass = NULL;
4170 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
4171 r->intflags |= PREGf_NAUGHTY;
4172 scan = ri->program + 1; /* First BRANCH. */
4174 /* testing for BRANCH here tells us whether there is "must appear"
4175 data in the pattern. If there is then we can use it for optimisations */
4176 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
4178 STRLEN longest_float_length, longest_fixed_length;
4179 struct regnode_charclass_class ch_class; /* pointed to by data */
4181 I32 last_close = 0; /* pointed to by data */
4184 /* Skip introductions and multiplicators >= 1. */
4185 while ((OP(first) == OPEN && (sawopen = 1)) ||
4186 /* An OR of *one* alternative - should not happen now. */
4187 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
4188 /* for now we can't handle lookbehind IFMATCH*/
4189 (OP(first) == IFMATCH && !first->flags) ||
4190 (OP(first) == PLUS) ||
4191 (OP(first) == MINMOD) ||
4192 /* An {n,m} with n>0 */
4193 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) )
4196 if (OP(first) == PLUS)
4199 first += regarglen[OP(first)];
4200 if (OP(first) == IFMATCH) {
4201 first = NEXTOPER(first);
4202 first += EXTRA_STEP_2ARGS;
4203 } else /* XXX possible optimisation for /(?=)/ */
4204 first = NEXTOPER(first);
4207 /* Starting-point info. */
4209 DEBUG_PEEP("first:",first,0);
4210 /* Ignore EXACT as we deal with it later. */
4211 if (PL_regkind[OP(first)] == EXACT) {
4212 if (OP(first) == EXACT)
4213 NOOP; /* Empty, get anchored substr later. */
4214 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
4215 ri->regstclass = first;
4218 else if (PL_regkind[OP(first)] == TRIE &&
4219 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
4222 /* this can happen only on restudy */
4223 if ( OP(first) == TRIE ) {
4224 struct regnode_1 *trieop;
4225 Newxz(trieop,1,struct regnode_1);
4226 StructCopy(first,trieop,struct regnode_1);
4227 trie_op=(regnode *)trieop;
4229 struct regnode_charclass *trieop;
4230 Newxz(trieop,1,struct regnode_charclass);
4231 StructCopy(first,trieop,struct regnode_charclass);
4232 trie_op=(regnode *)trieop;
4235 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4236 ri->regstclass = trie_op;
4239 else if (strchr((const char*)PL_simple,OP(first)))
4240 ri->regstclass = first;
4241 else if (PL_regkind[OP(first)] == BOUND ||
4242 PL_regkind[OP(first)] == NBOUND)
4243 ri->regstclass = first;
4244 else if (PL_regkind[OP(first)] == BOL) {
4245 r->extflags |= (OP(first) == MBOL
4247 : (OP(first) == SBOL
4250 first = NEXTOPER(first);
4253 else if (OP(first) == GPOS) {
4254 r->extflags |= RXf_ANCH_GPOS;
4255 first = NEXTOPER(first);
4258 else if ((!sawopen || !RExC_sawback) &&
4259 (OP(first) == STAR &&
4260 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
4261 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
4263 /* turn .* into ^.* with an implied $*=1 */
4265 (OP(NEXTOPER(first)) == REG_ANY)
4268 r->extflags |= type;
4269 r->intflags |= PREGf_IMPLICIT;
4270 first = NEXTOPER(first);
4273 if (sawplus && (!sawopen || !RExC_sawback)
4274 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
4275 /* x+ must match at the 1st pos of run of x's */
4276 r->intflags |= PREGf_SKIP;
4278 /* Scan is after the zeroth branch, first is atomic matcher. */
4279 #ifdef TRIE_STUDY_OPT
4282 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4283 (IV)(first - scan + 1))
4287 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4288 (IV)(first - scan + 1))
4294 * If there's something expensive in the r.e., find the
4295 * longest literal string that must appear and make it the
4296 * regmust. Resolve ties in favor of later strings, since
4297 * the regstart check works with the beginning of the r.e.
4298 * and avoiding duplication strengthens checking. Not a
4299 * strong reason, but sufficient in the absence of others.
4300 * [Now we resolve ties in favor of the earlier string if
4301 * it happens that c_offset_min has been invalidated, since the
4302 * earlier string may buy us something the later one won't.]
4305 data.longest_fixed = newSVpvs("");
4306 data.longest_float = newSVpvs("");
4307 data.last_found = newSVpvs("");
4308 data.longest = &(data.longest_fixed);
4310 if (!ri->regstclass) {
4311 cl_init(pRExC_state, &ch_class);
4312 data.start_class = &ch_class;
4313 stclass_flag = SCF_DO_STCLASS_AND;
4314 } else /* XXXX Check for BOUND? */
4316 data.last_closep = &last_close;
4318 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
4319 &data, -1, NULL, NULL,
4320 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
4326 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
4327 && data.last_start_min == 0 && data.last_end > 0
4328 && !RExC_seen_zerolen
4329 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4330 r->extflags |= RXf_CHECK_ALL;
4331 scan_commit(pRExC_state, &data,&minlen);
4332 SvREFCNT_dec(data.last_found);
4334 /* Note that code very similar to this but for anchored string
4335 follows immediately below, changes may need to be made to both.
4338 longest_float_length = CHR_SVLEN(data.longest_float);
4339 if (longest_float_length
4340 || (data.flags & SF_FL_BEFORE_EOL
4341 && (!(data.flags & SF_FL_BEFORE_MEOL)
4342 || (RExC_flags & RXf_PMf_MULTILINE))))
4346 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
4347 && data.offset_fixed == data.offset_float_min
4348 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4349 goto remove_float; /* As in (a)+. */
4351 /* copy the information about the longest float from the reg_scan_data
4352 over to the program. */
4353 if (SvUTF8(data.longest_float)) {
4354 r->float_utf8 = data.longest_float;
4355 r->float_substr = NULL;
4357 r->float_substr = data.longest_float;
4358 r->float_utf8 = NULL;
4360 /* float_end_shift is how many chars that must be matched that
4361 follow this item. We calculate it ahead of time as once the
4362 lookbehind offset is added in we lose the ability to correctly
4364 ml = data.minlen_float ? *(data.minlen_float)
4365 : (I32)longest_float_length;
4366 r->float_end_shift = ml - data.offset_float_min
4367 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4368 + data.lookbehind_float;
4369 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
4370 r->float_max_offset = data.offset_float_max;
4371 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
4372 r->float_max_offset -= data.lookbehind_float;
4374 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4375 && (!(data.flags & SF_FL_BEFORE_MEOL)
4376 || (RExC_flags & RXf_PMf_MULTILINE)));
4377 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
4381 r->float_substr = r->float_utf8 = NULL;
4382 SvREFCNT_dec(data.longest_float);
4383 longest_float_length = 0;
4386 /* Note that code very similar to this but for floating string
4387 is immediately above, changes may need to be made to both.
4390 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
4391 if (longest_fixed_length
4392 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4393 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4394 || (RExC_flags & RXf_PMf_MULTILINE))))
4398 /* copy the information about the longest fixed
4399 from the reg_scan_data over to the program. */
4400 if (SvUTF8(data.longest_fixed)) {
4401 r->anchored_utf8 = data.longest_fixed;
4402 r->anchored_substr = NULL;
4404 r->anchored_substr = data.longest_fixed;
4405 r->anchored_utf8 = NULL;
4407 /* fixed_end_shift is how many chars that must be matched that
4408 follow this item. We calculate it ahead of time as once the
4409 lookbehind offset is added in we lose the ability to correctly
4411 ml = data.minlen_fixed ? *(data.minlen_fixed)
4412 : (I32)longest_fixed_length;
4413 r->anchored_end_shift = ml - data.offset_fixed
4414 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4415 + data.lookbehind_fixed;
4416 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4418 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4419 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4420 || (RExC_flags & RXf_PMf_MULTILINE)));
4421 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
4424 r->anchored_substr = r->anchored_utf8 = NULL;
4425 SvREFCNT_dec(data.longest_fixed);
4426 longest_fixed_length = 0;
4429 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4430 ri->regstclass = NULL;
4431 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4433 && !(data.start_class->flags & ANYOF_EOS)
4434 && !cl_is_anything(data.start_class))
4436 const U32 n = add_data(pRExC_state, 1, "f");
4438 Newx(RExC_rxi->data->data[n], 1,
4439 struct regnode_charclass_class);
4440 StructCopy(data.start_class,
4441 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4442 struct regnode_charclass_class);
4443 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
4444 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
4445 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
4446 regprop(r, sv, (regnode*)data.start_class);
4447 PerlIO_printf(Perl_debug_log,
4448 "synthetic stclass \"%s\".\n",
4449 SvPVX_const(sv));});
4452 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
4453 if (longest_fixed_length > longest_float_length) {
4454 r->check_end_shift = r->anchored_end_shift;
4455 r->check_substr = r->anchored_substr;
4456 r->check_utf8 = r->anchored_utf8;
4457 r->check_offset_min = r->check_offset_max = r->anchored_offset;
4458 if (r->extflags & RXf_ANCH_SINGLE)
4459 r->extflags |= RXf_NOSCAN;
4462 r->check_end_shift = r->float_end_shift;
4463 r->check_substr = r->float_substr;
4464 r->check_utf8 = r->float_utf8;
4465 r->check_offset_min = r->float_min_offset;
4466 r->check_offset_max = r->float_max_offset;
4468 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4469 This should be changed ASAP! */
4470 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
4471 r->extflags |= RXf_USE_INTUIT;
4472 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
4473 r->extflags |= RXf_INTUIT_TAIL;
4475 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4476 if ( (STRLEN)minlen < longest_float_length )
4477 minlen= longest_float_length;
4478 if ( (STRLEN)minlen < longest_fixed_length )
4479 minlen= longest_fixed_length;
4483 /* Several toplevels. Best we can is to set minlen. */
4485 struct regnode_charclass_class ch_class;
4488 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
4490 scan = ri->program + 1;
4491 cl_init(pRExC_state, &ch_class);
4492 data.start_class = &ch_class;
4493 data.last_closep = &last_close;
4496 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
4497 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
4501 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
4502 = r->float_substr = r->float_utf8 = NULL;
4503 if (!(data.start_class->flags & ANYOF_EOS)
4504 && !cl_is_anything(data.start_class))
4506 const U32 n = add_data(pRExC_state, 1, "f");
4508 Newx(RExC_rxi->data->data[n], 1,
4509 struct regnode_charclass_class);
4510 StructCopy(data.start_class,
4511 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4512 struct regnode_charclass_class);
4513 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
4514 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
4515 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
4516 regprop(r, sv, (regnode*)data.start_class);
4517 PerlIO_printf(Perl_debug_log,
4518 "synthetic stclass \"%s\".\n",
4519 SvPVX_const(sv));});
4523 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4524 the "real" pattern. */
4526 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
4527 (IV)minlen, (IV)r->minlen);
4529 r->minlenret = minlen;
4530 if (r->minlen < minlen)
4533 if (RExC_seen & REG_SEEN_GPOS)
4534 r->extflags |= RXf_GPOS_SEEN;
4535 if (RExC_seen & REG_SEEN_LOOKBEHIND)
4536 r->extflags |= RXf_LOOKBEHIND_SEEN;
4537 if (RExC_seen & REG_SEEN_EVAL)
4538 r->extflags |= RXf_EVAL_SEEN;
4539 if (RExC_seen & REG_SEEN_CANY)
4540 r->extflags |= RXf_CANY_SEEN;
4541 if (RExC_seen & REG_SEEN_VERBARG)
4542 r->intflags |= PREGf_VERBARG_SEEN;
4543 if (RExC_seen & REG_SEEN_CUTGROUP)
4544 r->intflags |= PREGf_CUTGROUP_SEEN;
4545 if (RExC_paren_names)
4546 r->paren_names = (HV*)SvREFCNT_inc(RExC_paren_names);
4548 r->paren_names = NULL;
4550 if (RExC_recurse_count) {
4551 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
4552 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
4553 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
4556 Newxz(r->startp, RExC_npar, I32);
4557 Newxz(r->endp, RExC_npar, I32);
4558 /* assume we don't need to swap parens around before we match */
4561 PerlIO_printf(Perl_debug_log,"Final program:\n");
4564 DEBUG_OFFSETS_r(if (ri->offsets) {
4565 const U32 len = ri->offsets[0];
4567 GET_RE_DEBUG_FLAGS_DECL;
4568 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->offsets[0]);
4569 for (i = 1; i <= len; i++) {
4570 if (ri->offsets[i*2-1] || ri->offsets[i*2])
4571 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
4572 (UV)i, (UV)ri->offsets[i*2-1], (UV)ri->offsets[i*2]);
4574 PerlIO_printf(Perl_debug_log, "\n");
4580 #undef CORE_ONLY_BLOCK
4582 #undef RE_ENGINE_PTR
4584 #ifndef PERL_IN_XSUB_RE
4586 Perl_reg_named_buff_sv(pTHX_ SV* namesv)
4588 I32 parno = 0; /* no match */
4590 const REGEXP * const rx = PM_GETRE(PL_curpm);
4591 if (rx && rx->paren_names) {
4592 HE *he_str = hv_fetch_ent( rx->paren_names, namesv, 0, 0 );
4595 SV* sv_dat=HeVAL(he_str);
4596 I32 *nums=(I32*)SvPVX(sv_dat);
4597 for ( i=0; i<SvIVX(sv_dat); i++ ) {
4598 if ((I32)(rx->lastparen) >= nums[i] &&
4599 rx->endp[nums[i]] != -1)
4612 SV *sv= sv_newmortal();
4613 Perl_sv_setpvf(aTHX_ sv, "%"IVdf,(IV)parno);
4614 gv_paren= Perl_gv_fetchsv(aTHX_ sv, GV_ADD, SVt_PVGV);
4615 return GvSVn(gv_paren);
4620 /* Scans the name of a named buffer from the pattern.
4621 * If flags is REG_RSN_RETURN_NULL returns null.
4622 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
4623 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
4624 * to the parsed name as looked up in the RExC_paren_names hash.
4625 * If there is an error throws a vFAIL().. type exception.
4628 #define REG_RSN_RETURN_NULL 0
4629 #define REG_RSN_RETURN_NAME 1
4630 #define REG_RSN_RETURN_DATA 2
4633 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) {
4634 char *name_start = RExC_parse;
4637 while( isIDFIRST_uni(utf8n_to_uvchr((U8*)RExC_parse,
4638 RExC_end - RExC_parse, &numlen, UTF8_ALLOW_DEFAULT)))
4640 RExC_parse += numlen;
4643 while( isIDFIRST(*RExC_parse) )
4647 SV* sv_name = sv_2mortal(Perl_newSVpvn(aTHX_ name_start,
4648 (int)(RExC_parse - name_start)));
4651 if ( flags == REG_RSN_RETURN_NAME)
4653 else if (flags==REG_RSN_RETURN_DATA) {
4656 if ( ! sv_name ) /* should not happen*/
4657 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
4658 if (RExC_paren_names)
4659 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
4661 sv_dat = HeVAL(he_str);
4663 vFAIL("Reference to nonexistent named group");
4667 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
4674 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
4675 int rem=(int)(RExC_end - RExC_parse); \
4684 if (RExC_lastparse!=RExC_parse) \
4685 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
4688 iscut ? "..." : "<" \
4691 PerlIO_printf(Perl_debug_log,"%16s",""); \
4696 num=REG_NODE_NUM(RExC_emit); \
4697 if (RExC_lastnum!=num) \
4698 PerlIO_printf(Perl_debug_log,"|%4d",num); \
4700 PerlIO_printf(Perl_debug_log,"|%4s",""); \
4701 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
4702 (int)((depth*2)), "", \
4706 RExC_lastparse=RExC_parse; \
4711 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
4712 DEBUG_PARSE_MSG((funcname)); \
4713 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
4715 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
4716 DEBUG_PARSE_MSG((funcname)); \
4717 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
4720 - reg - regular expression, i.e. main body or parenthesized thing
4722 * Caller must absorb opening parenthesis.
4724 * Combining parenthesis handling with the base level of regular expression
4725 * is a trifle forced, but the need to tie the tails of the branches to what
4726 * follows makes it hard to avoid.
4728 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
4730 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
4732 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
4735 /* this idea is borrowed from STR_WITH_LEN in handy.h */
4736 #define CHECK_WORD(s,v,l) \
4737 (((sizeof(s)-1)==(l)) && (strnEQ(start_verb, (s ""), (sizeof(s)-1))))
4740 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
4741 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
4744 register regnode *ret; /* Will be the head of the group. */
4745 register regnode *br;
4746 register regnode *lastbr;
4747 register regnode *ender = NULL;
4748 register I32 parno = 0;
4750 const I32 oregflags = RExC_flags;
4751 bool have_branch = 0;
4754 /* for (?g), (?gc), and (?o) warnings; warning
4755 about (?c) will warn about (?g) -- japhy */
4757 #define WASTED_O 0x01
4758 #define WASTED_G 0x02
4759 #define WASTED_C 0x04
4760 #define WASTED_GC (0x02|0x04)
4761 I32 wastedflags = 0x00;
4763 char * parse_start = RExC_parse; /* MJD */
4764 char * const oregcomp_parse = RExC_parse;
4766 GET_RE_DEBUG_FLAGS_DECL;
4767 DEBUG_PARSE("reg ");
4770 *flagp = 0; /* Tentatively. */
4773 /* Make an OPEN node, if parenthesized. */
4775 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
4776 char *start_verb = RExC_parse;
4777 STRLEN verb_len = 0;
4778 char *start_arg = NULL;
4779 unsigned char op = 0;
4781 int internal_argval = 0; /* internal_argval is only useful if !argok */
4782 while ( *RExC_parse && *RExC_parse != ')' ) {
4783 if ( *RExC_parse == ':' ) {
4784 start_arg = RExC_parse + 1;
4790 verb_len = RExC_parse - start_verb;
4793 while ( *RExC_parse && *RExC_parse != ')' )
4795 if ( *RExC_parse != ')' )
4796 vFAIL("Unterminated verb pattern argument");
4797 if ( RExC_parse == start_arg )
4800 if ( *RExC_parse != ')' )
4801 vFAIL("Unterminated verb pattern");
4804 switch ( *start_verb ) {
4805 case 'A': /* (*ACCEPT) */
4806 if ( CHECK_WORD("ACCEPT",start_verb,verb_len) ) {
4808 internal_argval = RExC_nestroot;
4811 case 'C': /* (*COMMIT) */
4812 if ( CHECK_WORD("COMMIT",start_verb,verb_len) )
4815 case 'F': /* (*FAIL) */
4816 if ( verb_len==1 || CHECK_WORD("FAIL",start_verb,verb_len) ) {
4821 case ':': /* (*:NAME) */
4822 case 'M': /* (*MARK:NAME) */
4823 if ( verb_len==0 || CHECK_WORD("MARK",start_verb,verb_len) ) {
4828 case 'P': /* (*PRUNE) */
4829 if ( CHECK_WORD("PRUNE",start_verb,verb_len) )
4832 case 'S': /* (*SKIP) */
4833 if ( CHECK_WORD("SKIP",start_verb,verb_len) )
4836 case 'T': /* (*THEN) */
4837 /* [19:06] <TimToady> :: is then */
4838 if ( CHECK_WORD("THEN",start_verb,verb_len) ) {
4840 RExC_seen |= REG_SEEN_CUTGROUP;
4846 vFAIL3("Unknown verb pattern '%.*s'",
4847 verb_len, start_verb);
4850 if ( start_arg && internal_argval ) {
4851 vFAIL3("Verb pattern '%.*s' may not have an argument",
4852 verb_len, start_verb);
4853 } else if ( argok < 0 && !start_arg ) {
4854 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
4855 verb_len, start_verb);
4857 ret = reganode(pRExC_state, op, internal_argval);
4858 if ( ! internal_argval && ! SIZE_ONLY ) {
4860 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
4861 ARG(ret) = add_data( pRExC_state, 1, "S" );
4862 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
4869 if (!internal_argval)
4870 RExC_seen |= REG_SEEN_VERBARG;
4871 } else if ( start_arg ) {
4872 vFAIL3("Verb pattern '%.*s' may not have an argument",
4873 verb_len, start_verb);
4875 ret = reg_node(pRExC_state, op);
4877 nextchar(pRExC_state);
4880 if (*RExC_parse == '?') { /* (?...) */
4881 U32 posflags = 0, negflags = 0;
4882 U32 *flagsp = &posflags;
4883 bool is_logical = 0;
4884 const char * const seqstart = RExC_parse;
4887 paren = *RExC_parse++;
4888 ret = NULL; /* For look-ahead/behind. */
4891 case '<': /* (?<...) */
4892 if (*RExC_parse == '!')
4894 else if (*RExC_parse != '=')
4899 case '\'': /* (?'...') */
4900 name_start= RExC_parse;
4901 svname = reg_scan_name(pRExC_state,
4902 SIZE_ONLY ? /* reverse test from the others */
4903 REG_RSN_RETURN_NAME :
4904 REG_RSN_RETURN_NULL);
4905 if (RExC_parse == name_start)
4907 if (*RExC_parse != paren)
4908 vFAIL2("Sequence (?%c... not terminated",
4909 paren=='>' ? '<' : paren);
4913 if (!svname) /* shouldnt happen */
4915 "panic: reg_scan_name returned NULL");
4916 if (!RExC_paren_names) {
4917 RExC_paren_names= newHV();
4918 sv_2mortal((SV*)RExC_paren_names);
4920 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
4922 sv_dat = HeVAL(he_str);
4924 /* croak baby croak */
4926 "panic: paren_name hash element allocation failed");
4927 } else if ( SvPOK(sv_dat) ) {
4928 IV count=SvIV(sv_dat);
4929 I32 *pv=(I32*)SvGROW(sv_dat,SvCUR(sv_dat)+sizeof(I32)+1);
4930 SvCUR_set(sv_dat,SvCUR(sv_dat)+sizeof(I32));
4931 pv[count]=RExC_npar;
4934 (void)SvUPGRADE(sv_dat,SVt_PVNV);
4935 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
4940 /*sv_dump(sv_dat);*/
4942 nextchar(pRExC_state);
4944 goto capturing_parens;
4946 RExC_seen |= REG_SEEN_LOOKBEHIND;
4948 case '=': /* (?=...) */
4949 case '!': /* (?!...) */
4950 RExC_seen_zerolen++;
4951 if (*RExC_parse == ')') {
4952 ret=reg_node(pRExC_state, OPFAIL);
4953 nextchar(pRExC_state);
4956 case ':': /* (?:...) */
4957 case '>': /* (?>...) */
4959 case '$': /* (?$...) */
4960 case '@': /* (?@...) */
4961 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
4963 case '#': /* (?#...) */
4964 while (*RExC_parse && *RExC_parse != ')')
4966 if (*RExC_parse != ')')
4967 FAIL("Sequence (?#... not terminated");
4968 nextchar(pRExC_state);
4971 case '0' : /* (?0) */
4972 case 'R' : /* (?R) */
4973 if (*RExC_parse != ')')
4974 FAIL("Sequence (?R) not terminated");
4975 ret = reg_node(pRExC_state, GOSTART);
4976 nextchar(pRExC_state);
4979 { /* named and numeric backreferences */
4982 case '&': /* (?&NAME) */
4983 parse_start = RExC_parse - 1;
4985 SV *sv_dat = reg_scan_name(pRExC_state,
4986 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
4987 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
4989 goto gen_recurse_regop;
4992 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
4994 vFAIL("Illegal pattern");
4996 goto parse_recursion;
4998 case '-': /* (?-1) */
4999 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5000 RExC_parse--; /* rewind to let it be handled later */
5004 case '1': case '2': case '3': case '4': /* (?1) */
5005 case '5': case '6': case '7': case '8': case '9':
5008 num = atoi(RExC_parse);
5009 parse_start = RExC_parse - 1; /* MJD */
5010 if (*RExC_parse == '-')
5012 while (isDIGIT(*RExC_parse))
5014 if (*RExC_parse!=')')
5015 vFAIL("Expecting close bracket");
5018 if ( paren == '-' ) {
5020 Diagram of capture buffer numbering.
5021 Top line is the normal capture buffer numbers
5022 Botton line is the negative indexing as from
5026 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
5030 num = RExC_npar + num;
5033 vFAIL("Reference to nonexistent group");
5035 } else if ( paren == '+' ) {
5036 num = RExC_npar + num - 1;
5039 ret = reganode(pRExC_state, GOSUB, num);
5041 if (num > (I32)RExC_rx->nparens) {
5043 vFAIL("Reference to nonexistent group");
5045 ARG2L_SET( ret, RExC_recurse_count++);
5047 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5048 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
5052 RExC_seen |= REG_SEEN_RECURSE;
5053 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
5054 Set_Node_Offset(ret, parse_start); /* MJD */
5056 nextchar(pRExC_state);
5058 } /* named and numeric backreferences */
5061 case 'p': /* (?p...) */
5062 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
5063 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
5065 case '?': /* (??...) */
5067 if (*RExC_parse != '{')
5069 paren = *RExC_parse++;
5071 case '{': /* (?{...}) */
5076 char *s = RExC_parse;
5078 RExC_seen_zerolen++;
5079 RExC_seen |= REG_SEEN_EVAL;
5080 while (count && (c = *RExC_parse)) {
5091 if (*RExC_parse != ')') {
5093 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
5097 OP_4tree *sop, *rop;
5098 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
5101 Perl_save_re_context(aTHX);
5102 rop = sv_compile_2op(sv, &sop, "re", &pad);
5103 sop->op_private |= OPpREFCOUNTED;
5104 /* re_dup will OpREFCNT_inc */
5105 OpREFCNT_set(sop, 1);
5108 n = add_data(pRExC_state, 3, "nop");
5109 RExC_rxi->data->data[n] = (void*)rop;
5110 RExC_rxi->data->data[n+1] = (void*)sop;
5111 RExC_rxi->data->data[n+2] = (void*)pad;
5114 else { /* First pass */
5115 if (PL_reginterp_cnt < ++RExC_seen_evals
5117 /* No compiled RE interpolated, has runtime
5118 components ===> unsafe. */
5119 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5120 if (PL_tainting && PL_tainted)
5121 FAIL("Eval-group in insecure regular expression");
5122 #if PERL_VERSION > 8
5123 if (IN_PERL_COMPILETIME)
5128 nextchar(pRExC_state);
5130 ret = reg_node(pRExC_state, LOGICAL);
5133 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
5134 /* deal with the length of this later - MJD */
5137 ret = reganode(pRExC_state, EVAL, n);
5138 Set_Node_Length(ret, RExC_parse - parse_start + 1);
5139 Set_Node_Offset(ret, parse_start);
5142 case '(': /* (?(?{...})...) and (?(?=...)...) */
5145 if (RExC_parse[0] == '?') { /* (?(?...)) */
5146 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
5147 || RExC_parse[1] == '<'
5148 || RExC_parse[1] == '{') { /* Lookahead or eval. */
5151 ret = reg_node(pRExC_state, LOGICAL);
5154 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
5158 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
5159 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
5161 char ch = RExC_parse[0] == '<' ? '>' : '\'';
5162 char *name_start= RExC_parse++;
5164 SV *sv_dat=reg_scan_name(pRExC_state,
5165 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5166 if (RExC_parse == name_start || *RExC_parse != ch)
5167 vFAIL2("Sequence (?(%c... not terminated",
5168 (ch == '>' ? '<' : ch));
5171 num = add_data( pRExC_state, 1, "S" );
5172 RExC_rxi->data->data[num]=(void*)sv_dat;
5173 SvREFCNT_inc(sv_dat);
5175 ret = reganode(pRExC_state,NGROUPP,num);
5176 goto insert_if_check_paren;
5178 else if (RExC_parse[0] == 'D' &&
5179 RExC_parse[1] == 'E' &&
5180 RExC_parse[2] == 'F' &&
5181 RExC_parse[3] == 'I' &&
5182 RExC_parse[4] == 'N' &&
5183 RExC_parse[5] == 'E')
5185 ret = reganode(pRExC_state,DEFINEP,0);
5188 goto insert_if_check_paren;
5190 else if (RExC_parse[0] == 'R') {
5193 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5194 parno = atoi(RExC_parse++);
5195 while (isDIGIT(*RExC_parse))
5197 } else if (RExC_parse[0] == '&') {
5200 sv_dat = reg_scan_name(pRExC_state,
5201 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5202 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5204 ret = reganode(pRExC_state,INSUBP,parno);
5205 goto insert_if_check_paren;
5207 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5210 parno = atoi(RExC_parse++);
5212 while (isDIGIT(*RExC_parse))
5214 ret = reganode(pRExC_state, GROUPP, parno);
5216 insert_if_check_paren:
5217 if ((c = *nextchar(pRExC_state)) != ')')
5218 vFAIL("Switch condition not recognized");
5220 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
5221 br = regbranch(pRExC_state, &flags, 1,depth+1);
5223 br = reganode(pRExC_state, LONGJMP, 0);
5225 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
5226 c = *nextchar(pRExC_state);
5231 vFAIL("(?(DEFINE)....) does not allow branches");
5232 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
5233 regbranch(pRExC_state, &flags, 1,depth+1);
5234 REGTAIL(pRExC_state, ret, lastbr);
5237 c = *nextchar(pRExC_state);
5242 vFAIL("Switch (?(condition)... contains too many branches");
5243 ender = reg_node(pRExC_state, TAIL);
5244 REGTAIL(pRExC_state, br, ender);
5246 REGTAIL(pRExC_state, lastbr, ender);
5247 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
5250 REGTAIL(pRExC_state, ret, ender);
5254 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
5258 RExC_parse--; /* for vFAIL to print correctly */
5259 vFAIL("Sequence (? incomplete");
5263 parse_flags: /* (?i) */
5264 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
5265 /* (?g), (?gc) and (?o) are useless here
5266 and must be globally applied -- japhy */
5268 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
5269 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5270 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
5271 if (! (wastedflags & wflagbit) ) {
5272 wastedflags |= wflagbit;
5275 "Useless (%s%c) - %suse /%c modifier",
5276 flagsp == &negflags ? "?-" : "?",
5278 flagsp == &negflags ? "don't " : "",
5284 else if (*RExC_parse == 'c') {
5285 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5286 if (! (wastedflags & WASTED_C) ) {
5287 wastedflags |= WASTED_GC;
5290 "Useless (%sc) - %suse /gc modifier",
5291 flagsp == &negflags ? "?-" : "?",
5292 flagsp == &negflags ? "don't " : ""
5297 else { pmflag(flagsp, *RExC_parse); }
5301 if (*RExC_parse == '-') {
5303 wastedflags = 0; /* reset so (?g-c) warns twice */
5307 RExC_flags |= posflags;
5308 RExC_flags &= ~negflags;
5309 if (*RExC_parse == ':') {
5315 if (*RExC_parse != ')') {
5317 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5319 nextchar(pRExC_state);
5329 ret = reganode(pRExC_state, OPEN, parno);
5332 RExC_nestroot = parno;
5333 if (RExC_seen & REG_SEEN_RECURSE) {
5334 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5335 "Setting open paren #%"IVdf" to %d\n",
5336 (IV)parno, REG_NODE_NUM(ret)));
5337 RExC_open_parens[parno-1]= ret;
5340 Set_Node_Length(ret, 1); /* MJD */
5341 Set_Node_Offset(ret, RExC_parse); /* MJD */
5348 /* Pick up the branches, linking them together. */
5349 parse_start = RExC_parse; /* MJD */
5350 br = regbranch(pRExC_state, &flags, 1,depth+1);
5351 /* branch_len = (paren != 0); */
5355 if (*RExC_parse == '|') {
5356 if (!SIZE_ONLY && RExC_extralen) {
5357 reginsert(pRExC_state, BRANCHJ, br, depth+1);
5360 reginsert(pRExC_state, BRANCH, br, depth+1);
5361 Set_Node_Length(br, paren != 0);
5362 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
5366 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
5368 else if (paren == ':') {
5369 *flagp |= flags&SIMPLE;
5371 if (is_open) { /* Starts with OPEN. */
5372 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
5374 else if (paren != '?') /* Not Conditional */
5376 *flagp |= flags & (SPSTART | HASWIDTH);
5378 while (*RExC_parse == '|') {
5379 if (!SIZE_ONLY && RExC_extralen) {
5380 ender = reganode(pRExC_state, LONGJMP,0);
5381 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
5384 RExC_extralen += 2; /* Account for LONGJMP. */
5385 nextchar(pRExC_state);
5386 br = regbranch(pRExC_state, &flags, 0, depth+1);
5390 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
5394 *flagp |= flags&SPSTART;
5397 if (have_branch || paren != ':') {
5398 /* Make a closing node, and hook it on the end. */
5401 ender = reg_node(pRExC_state, TAIL);
5405 ender = reganode(pRExC_state, CLOSE, parno);
5406 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
5407 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5408 "Setting close paren #%"IVdf" to %d\n",
5409 (IV)parno, REG_NODE_NUM(ender)));
5410 RExC_close_parens[parno-1]= ender;
5411 if (RExC_nestroot == parno)
5414 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
5415 Set_Node_Length(ender,1); /* MJD */
5421 *flagp &= ~HASWIDTH;
5424 ender = reg_node(pRExC_state, SUCCEED);
5427 ender = reg_node(pRExC_state, END);
5429 assert(!RExC_opend); /* there can only be one! */
5434 REGTAIL(pRExC_state, lastbr, ender);
5436 if (have_branch && !SIZE_ONLY) {
5438 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
5440 /* Hook the tails of the branches to the closing node. */
5441 for (br = ret; br; br = regnext(br)) {
5442 const U8 op = PL_regkind[OP(br)];
5444 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
5446 else if (op == BRANCHJ) {
5447 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
5455 static const char parens[] = "=!<,>";
5457 if (paren && (p = strchr(parens, paren))) {
5458 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
5459 int flag = (p - parens) > 1;
5462 node = SUSPEND, flag = 0;
5463 reginsert(pRExC_state, node,ret, depth+1);
5464 Set_Node_Cur_Length(ret);
5465 Set_Node_Offset(ret, parse_start + 1);
5467 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
5471 /* Check for proper termination. */
5473 RExC_flags = oregflags;
5474 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
5475 RExC_parse = oregcomp_parse;
5476 vFAIL("Unmatched (");
5479 else if (!paren && RExC_parse < RExC_end) {
5480 if (*RExC_parse == ')') {
5482 vFAIL("Unmatched )");
5485 FAIL("Junk on end of regexp"); /* "Can't happen". */
5493 - regbranch - one alternative of an | operator
5495 * Implements the concatenation operator.
5498 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
5501 register regnode *ret;
5502 register regnode *chain = NULL;
5503 register regnode *latest;
5504 I32 flags = 0, c = 0;
5505 GET_RE_DEBUG_FLAGS_DECL;
5506 DEBUG_PARSE("brnc");
5510 if (!SIZE_ONLY && RExC_extralen)
5511 ret = reganode(pRExC_state, BRANCHJ,0);
5513 ret = reg_node(pRExC_state, BRANCH);
5514 Set_Node_Length(ret, 1);
5518 if (!first && SIZE_ONLY)
5519 RExC_extralen += 1; /* BRANCHJ */
5521 *flagp = WORST; /* Tentatively. */
5524 nextchar(pRExC_state);
5525 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
5527 latest = regpiece(pRExC_state, &flags,depth+1);
5528 if (latest == NULL) {
5529 if (flags & TRYAGAIN)
5533 else if (ret == NULL)
5535 *flagp |= flags&HASWIDTH;
5536 if (chain == NULL) /* First piece. */
5537 *flagp |= flags&SPSTART;
5540 REGTAIL(pRExC_state, chain, latest);
5545 if (chain == NULL) { /* Loop ran zero times. */
5546 chain = reg_node(pRExC_state, NOTHING);
5551 *flagp |= flags&SIMPLE;
5558 - regpiece - something followed by possible [*+?]
5560 * Note that the branching code sequences used for ? and the general cases
5561 * of * and + are somewhat optimized: they use the same NOTHING node as
5562 * both the endmarker for their branch list and the body of the last branch.
5563 * It might seem that this node could be dispensed with entirely, but the
5564 * endmarker role is not redundant.
5567 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
5570 register regnode *ret;
5572 register char *next;
5574 const char * const origparse = RExC_parse;
5576 I32 max = REG_INFTY;
5578 const char *maxpos = NULL;
5579 GET_RE_DEBUG_FLAGS_DECL;
5580 DEBUG_PARSE("piec");
5582 ret = regatom(pRExC_state, &flags,depth+1);
5584 if (flags & TRYAGAIN)
5591 if (op == '{' && regcurly(RExC_parse)) {
5593 parse_start = RExC_parse; /* MJD */
5594 next = RExC_parse + 1;
5595 while (isDIGIT(*next) || *next == ',') {
5604 if (*next == '}') { /* got one */
5608 min = atoi(RExC_parse);
5612 maxpos = RExC_parse;
5614 if (!max && *maxpos != '0')
5615 max = REG_INFTY; /* meaning "infinity" */
5616 else if (max >= REG_INFTY)
5617 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
5619 nextchar(pRExC_state);
5622 if ((flags&SIMPLE)) {
5623 RExC_naughty += 2 + RExC_naughty / 2;
5624 reginsert(pRExC_state, CURLY, ret, depth+1);
5625 Set_Node_Offset(ret, parse_start+1); /* MJD */
5626 Set_Node_Cur_Length(ret);
5629 regnode * const w = reg_node(pRExC_state, WHILEM);
5632 REGTAIL(pRExC_state, ret, w);
5633 if (!SIZE_ONLY && RExC_extralen) {
5634 reginsert(pRExC_state, LONGJMP,ret, depth+1);
5635 reginsert(pRExC_state, NOTHING,ret, depth+1);
5636 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
5638 reginsert(pRExC_state, CURLYX,ret, depth+1);
5640 Set_Node_Offset(ret, parse_start+1);
5641 Set_Node_Length(ret,
5642 op == '{' ? (RExC_parse - parse_start) : 1);
5644 if (!SIZE_ONLY && RExC_extralen)
5645 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
5646 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
5648 RExC_whilem_seen++, RExC_extralen += 3;
5649 RExC_naughty += 4 + RExC_naughty; /* compound interest */
5657 if (max && max < min)
5658 vFAIL("Can't do {n,m} with n > m");
5660 ARG1_SET(ret, (U16)min);
5661 ARG2_SET(ret, (U16)max);
5673 #if 0 /* Now runtime fix should be reliable. */
5675 /* if this is reinstated, don't forget to put this back into perldiag:
5677 =item Regexp *+ operand could be empty at {#} in regex m/%s/
5679 (F) The part of the regexp subject to either the * or + quantifier
5680 could match an empty string. The {#} shows in the regular
5681 expression about where the problem was discovered.
5685 if (!(flags&HASWIDTH) && op != '?')
5686 vFAIL("Regexp *+ operand could be empty");
5689 parse_start = RExC_parse;
5690 nextchar(pRExC_state);
5692 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
5694 if (op == '*' && (flags&SIMPLE)) {
5695 reginsert(pRExC_state, STAR, ret, depth+1);
5699 else if (op == '*') {
5703 else if (op == '+' && (flags&SIMPLE)) {
5704 reginsert(pRExC_state, PLUS, ret, depth+1);
5708 else if (op == '+') {
5712 else if (op == '?') {
5717 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
5719 "%.*s matches null string many times",
5720 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
5724 if (RExC_parse < RExC_end && *RExC_parse == '?') {
5725 nextchar(pRExC_state);
5726 reginsert(pRExC_state, MINMOD, ret, depth+1);
5727 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
5729 #ifndef REG_ALLOW_MINMOD_SUSPEND
5732 if (RExC_parse < RExC_end && *RExC_parse == '+') {
5734 nextchar(pRExC_state);
5735 ender = reg_node(pRExC_state, SUCCEED);
5736 REGTAIL(pRExC_state, ret, ender);
5737 reginsert(pRExC_state, SUSPEND, ret, depth+1);
5739 ender = reg_node(pRExC_state, TAIL);
5740 REGTAIL(pRExC_state, ret, ender);
5744 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
5746 vFAIL("Nested quantifiers");
5753 /* reg_namedseq(pRExC_state,UVp)
5755 This is expected to be called by a parser routine that has
5756 recognized'\N' and needs to handle the rest. RExC_parse is
5757 expected to point at the first char following the N at the time
5760 If valuep is non-null then it is assumed that we are parsing inside
5761 of a charclass definition and the first codepoint in the resolved
5762 string is returned via *valuep and the routine will return NULL.
5763 In this mode if a multichar string is returned from the charnames
5764 handler a warning will be issued, and only the first char in the
5765 sequence will be examined. If the string returned is zero length
5766 then the value of *valuep is undefined and NON-NULL will
5767 be returned to indicate failure. (This will NOT be a valid pointer
5770 If value is null then it is assumed that we are parsing normal text
5771 and inserts a new EXACT node into the program containing the resolved
5772 string and returns a pointer to the new node. If the string is
5773 zerolength a NOTHING node is emitted.
5775 On success RExC_parse is set to the char following the endbrace.
5776 Parsing failures will generate a fatal errorvia vFAIL(...)
5778 NOTE: We cache all results from the charnames handler locally in
5779 the RExC_charnames hash (created on first use) to prevent a charnames
5780 handler from playing silly-buggers and returning a short string and
5781 then a long string for a given pattern. Since the regexp program
5782 size is calculated during an initial parse this would result
5783 in a buffer overrun so we cache to prevent the charname result from
5784 changing during the course of the parse.
5788 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep)
5790 char * name; /* start of the content of the name */
5791 char * endbrace; /* endbrace following the name */
5794 STRLEN len; /* this has various purposes throughout the code */
5795 bool cached = 0; /* if this is true then we shouldn't refcount dev sv_str */
5796 regnode *ret = NULL;
5798 if (*RExC_parse != '{') {
5799 vFAIL("Missing braces on \\N{}");
5801 name = RExC_parse+1;
5802 endbrace = strchr(RExC_parse, '}');
5805 vFAIL("Missing right brace on \\N{}");
5807 RExC_parse = endbrace + 1;
5810 /* RExC_parse points at the beginning brace,
5811 endbrace points at the last */
5812 if ( name[0]=='U' && name[1]=='+' ) {
5813 /* its a "unicode hex" notation {U+89AB} */
5814 I32 fl = PERL_SCAN_ALLOW_UNDERSCORES
5815 | PERL_SCAN_DISALLOW_PREFIX
5816 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
5818 len = (STRLEN)(endbrace - name - 2);
5819 cp = grok_hex(name + 2, &len, &fl, NULL);
5820 if ( len != (STRLEN)(endbrace - name - 2) ) {
5829 sv_str= Perl_newSVpvf_nocontext("%c",(int)cp);
5831 /* fetch the charnames handler for this scope */
5832 HV * const table = GvHV(PL_hintgv);
5834 hv_fetchs(table, "charnames", FALSE) :
5836 SV *cv= cvp ? *cvp : NULL;
5839 /* create an SV with the name as argument */
5840 sv_name = newSVpvn(name, endbrace - name);
5842 if (!table || !(PL_hints & HINT_LOCALIZE_HH)) {
5843 vFAIL2("Constant(\\N{%s}) unknown: "
5844 "(possibly a missing \"use charnames ...\")",
5847 if (!cvp || !SvOK(*cvp)) { /* when $^H{charnames} = undef; */
5848 vFAIL2("Constant(\\N{%s}): "
5849 "$^H{charnames} is not defined",SvPVX(sv_name));
5854 if (!RExC_charnames) {
5855 /* make sure our cache is allocated */
5856 RExC_charnames = newHV();
5857 sv_2mortal((SV*)RExC_charnames);
5859 /* see if we have looked this one up before */
5860 he_str = hv_fetch_ent( RExC_charnames, sv_name, 0, 0 );
5862 sv_str = HeVAL(he_str);
5875 count= call_sv(cv, G_SCALAR);
5877 if (count == 1) { /* XXXX is this right? dmq */
5879 SvREFCNT_inc_simple_void(sv_str);
5887 if ( !sv_str || !SvOK(sv_str) ) {
5888 vFAIL2("Constant(\\N{%s}): Call to &{$^H{charnames}} "
5889 "did not return a defined value",SvPVX(sv_name));
5891 if (hv_store_ent( RExC_charnames, sv_name, sv_str, 0))
5896 char *p = SvPV(sv_str, len);
5899 if ( SvUTF8(sv_str) ) {
5900 *valuep = utf8_to_uvchr((U8*)p, &numlen);
5904 We have to turn on utf8 for high bit chars otherwise
5905 we get failures with
5907 "ss" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
5908 "SS" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
5910 This is different from what \x{} would do with the same
5911 codepoint, where the condition is > 0xFF.
5918 /* warn if we havent used the whole string? */
5920 if (numlen<len && SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5922 "Ignoring excess chars from \\N{%s} in character class",
5926 } else if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5928 "Ignoring zero length \\N{%s} in character class",
5933 SvREFCNT_dec(sv_name);
5935 SvREFCNT_dec(sv_str);
5936 return len ? NULL : (regnode *)&len;
5937 } else if(SvCUR(sv_str)) {
5942 char * parse_start = name-3; /* needed for the offsets */
5943 GET_RE_DEBUG_FLAGS_DECL; /* needed for the offsets */
5945 ret = reg_node(pRExC_state,
5946 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
5949 if ( RExC_utf8 && !SvUTF8(sv_str) ) {
5950 sv_utf8_upgrade(sv_str);
5951 } else if ( !RExC_utf8 && SvUTF8(sv_str) ) {
5955 p = SvPV(sv_str, len);
5957 /* len is the length written, charlen is the size the char read */
5958 for ( len = 0; p < pend; p += charlen ) {
5960 UV uvc = utf8_to_uvchr((U8*)p, &charlen);
5962 STRLEN foldlen,numlen;
5963 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
5964 uvc = toFOLD_uni(uvc, tmpbuf, &foldlen);
5965 /* Emit all the Unicode characters. */
5967 for (foldbuf = tmpbuf;
5971 uvc = utf8_to_uvchr(foldbuf, &numlen);
5973 const STRLEN unilen = reguni(pRExC_state, uvc, s);
5976 /* In EBCDIC the numlen
5977 * and unilen can differ. */
5979 if (numlen >= foldlen)
5983 break; /* "Can't happen." */
5986 const STRLEN unilen = reguni(pRExC_state, uvc, s);
5998 RExC_size += STR_SZ(len);
6001 RExC_emit += STR_SZ(len);
6003 Set_Node_Cur_Length(ret); /* MJD */
6005 nextchar(pRExC_state);
6007 ret = reg_node(pRExC_state,NOTHING);
6010 SvREFCNT_dec(sv_str);
6013 SvREFCNT_dec(sv_name);
6023 * It returns the code point in utf8 for the value in *encp.
6024 * value: a code value in the source encoding
6025 * encp: a pointer to an Encode object
6027 * If the result from Encode is not a single character,
6028 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
6031 S_reg_recode(pTHX_ const char value, SV **encp)
6034 SV * const sv = sv_2mortal(newSVpvn(&value, numlen));
6035 const char * const s = encp && *encp ? sv_recode_to_utf8(sv, *encp)
6037 const STRLEN newlen = SvCUR(sv);
6038 UV uv = UNICODE_REPLACEMENT;
6042 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
6045 if (!newlen || numlen != newlen) {
6046 uv = UNICODE_REPLACEMENT;
6055 - regatom - the lowest level
6057 * Optimization: gobbles an entire sequence of ordinary characters so that
6058 * it can turn them into a single node, which is smaller to store and
6059 * faster to run. Backslashed characters are exceptions, each becoming a
6060 * separate node; the code is simpler that way and it's not worth fixing.
6062 * [Yes, it is worth fixing, some scripts can run twice the speed.]
6063 * [It looks like its ok, as in S_study_chunk we merge adjacent EXACT nodes]
6066 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6069 register regnode *ret = NULL;
6071 char *parse_start = RExC_parse;
6072 GET_RE_DEBUG_FLAGS_DECL;
6073 DEBUG_PARSE("atom");
6074 *flagp = WORST; /* Tentatively. */
6077 switch (*RExC_parse) {
6079 RExC_seen_zerolen++;
6080 nextchar(pRExC_state);
6081 if (RExC_flags & RXf_PMf_MULTILINE)
6082 ret = reg_node(pRExC_state, MBOL);
6083 else if (RExC_flags & RXf_PMf_SINGLELINE)
6084 ret = reg_node(pRExC_state, SBOL);
6086 ret = reg_node(pRExC_state, BOL);
6087 Set_Node_Length(ret, 1); /* MJD */
6090 nextchar(pRExC_state);
6092 RExC_seen_zerolen++;
6093 if (RExC_flags & RXf_PMf_MULTILINE)
6094 ret = reg_node(pRExC_state, MEOL);
6095 else if (RExC_flags & RXf_PMf_SINGLELINE)
6096 ret = reg_node(pRExC_state, SEOL);
6098 ret = reg_node(pRExC_state, EOL);
6099 Set_Node_Length(ret, 1); /* MJD */
6102 nextchar(pRExC_state);
6103 if (RExC_flags & RXf_PMf_SINGLELINE)
6104 ret = reg_node(pRExC_state, SANY);
6106 ret = reg_node(pRExC_state, REG_ANY);
6107 *flagp |= HASWIDTH|SIMPLE;
6109 Set_Node_Length(ret, 1); /* MJD */
6113 char * const oregcomp_parse = ++RExC_parse;
6114 ret = regclass(pRExC_state,depth+1);
6115 if (*RExC_parse != ']') {
6116 RExC_parse = oregcomp_parse;
6117 vFAIL("Unmatched [");
6119 nextchar(pRExC_state);
6120 *flagp |= HASWIDTH|SIMPLE;
6121 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
6125 nextchar(pRExC_state);
6126 ret = reg(pRExC_state, 1, &flags,depth+1);
6128 if (flags & TRYAGAIN) {
6129 if (RExC_parse == RExC_end) {
6130 /* Make parent create an empty node if needed. */
6138 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
6142 if (flags & TRYAGAIN) {
6146 vFAIL("Internal urp");
6147 /* Supposed to be caught earlier. */
6150 if (!regcurly(RExC_parse)) {
6159 vFAIL("Quantifier follows nothing");
6162 switch (*++RExC_parse) {
6164 RExC_seen_zerolen++;
6165 ret = reg_node(pRExC_state, SBOL);
6167 nextchar(pRExC_state);
6168 Set_Node_Length(ret, 2); /* MJD */
6171 ret = reg_node(pRExC_state, GPOS);
6172 RExC_seen |= REG_SEEN_GPOS;
6174 nextchar(pRExC_state);
6175 Set_Node_Length(ret, 2); /* MJD */
6178 ret = reg_node(pRExC_state, SEOL);
6180 RExC_seen_zerolen++; /* Do not optimize RE away */
6181 nextchar(pRExC_state);
6184 ret = reg_node(pRExC_state, EOS);
6186 RExC_seen_zerolen++; /* Do not optimize RE away */
6187 nextchar(pRExC_state);
6188 Set_Node_Length(ret, 2); /* MJD */
6191 ret = reg_node(pRExC_state, CANY);
6192 RExC_seen |= REG_SEEN_CANY;
6193 *flagp |= HASWIDTH|SIMPLE;
6194 nextchar(pRExC_state);
6195 Set_Node_Length(ret, 2); /* MJD */
6198 ret = reg_node(pRExC_state, CLUMP);
6200 nextchar(pRExC_state);
6201 Set_Node_Length(ret, 2); /* MJD */
6204 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
6205 *flagp |= HASWIDTH|SIMPLE;
6206 nextchar(pRExC_state);
6207 Set_Node_Length(ret, 2); /* MJD */
6210 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
6211 *flagp |= HASWIDTH|SIMPLE;
6212 nextchar(pRExC_state);
6213 Set_Node_Length(ret, 2); /* MJD */
6216 RExC_seen_zerolen++;
6217 RExC_seen |= REG_SEEN_LOOKBEHIND;
6218 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
6220 nextchar(pRExC_state);
6221 Set_Node_Length(ret, 2); /* MJD */
6224 RExC_seen_zerolen++;
6225 RExC_seen |= REG_SEEN_LOOKBEHIND;
6226 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
6228 nextchar(pRExC_state);
6229 Set_Node_Length(ret, 2); /* MJD */
6232 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
6233 *flagp |= HASWIDTH|SIMPLE;
6234 nextchar(pRExC_state);
6235 Set_Node_Length(ret, 2); /* MJD */
6238 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
6239 *flagp |= HASWIDTH|SIMPLE;
6240 nextchar(pRExC_state);
6241 Set_Node_Length(ret, 2); /* MJD */
6244 ret = reg_node(pRExC_state, DIGIT);
6245 *flagp |= HASWIDTH|SIMPLE;
6246 nextchar(pRExC_state);
6247 Set_Node_Length(ret, 2); /* MJD */
6250 ret = reg_node(pRExC_state, NDIGIT);
6251 *flagp |= HASWIDTH|SIMPLE;
6252 nextchar(pRExC_state);
6253 Set_Node_Length(ret, 2); /* MJD */
6258 char* const oldregxend = RExC_end;
6259 char* parse_start = RExC_parse - 2;
6261 if (RExC_parse[1] == '{') {
6262 /* a lovely hack--pretend we saw [\pX] instead */
6263 RExC_end = strchr(RExC_parse, '}');
6265 const U8 c = (U8)*RExC_parse;
6267 RExC_end = oldregxend;
6268 vFAIL2("Missing right brace on \\%c{}", c);
6273 RExC_end = RExC_parse + 2;
6274 if (RExC_end > oldregxend)
6275 RExC_end = oldregxend;
6279 ret = regclass(pRExC_state,depth+1);
6281 RExC_end = oldregxend;
6284 Set_Node_Offset(ret, parse_start + 2);
6285 Set_Node_Cur_Length(ret);
6286 nextchar(pRExC_state);
6287 *flagp |= HASWIDTH|SIMPLE;
6291 /* Handle \N{NAME} here and not below because it can be
6292 multicharacter. join_exact() will join them up later on.
6293 Also this makes sure that things like /\N{BLAH}+/ and
6294 \N{BLAH} being multi char Just Happen. dmq*/
6296 ret= reg_namedseq(pRExC_state, NULL);
6298 case 'k': /* Handle \k<NAME> and \k'NAME' */
6300 char ch= RExC_parse[1];
6301 if (ch != '<' && ch != '\'') {
6303 vWARN( RExC_parse + 1,
6304 "Possible broken named back reference treated as literal k");
6308 char* name_start = (RExC_parse += 2);
6310 SV *sv_dat = reg_scan_name(pRExC_state,
6311 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6312 ch= (ch == '<') ? '>' : '\'';
6314 if (RExC_parse == name_start || *RExC_parse != ch)
6315 vFAIL2("Sequence \\k%c... not terminated",
6316 (ch == '>' ? '<' : ch));
6319 ret = reganode(pRExC_state,
6320 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
6326 num = add_data( pRExC_state, 1, "S" );
6328 RExC_rxi->data->data[num]=(void*)sv_dat;
6329 SvREFCNT_inc(sv_dat);
6331 /* override incorrect value set in reganode MJD */
6332 Set_Node_Offset(ret, parse_start+1);
6333 Set_Node_Cur_Length(ret); /* MJD */
6334 nextchar(pRExC_state);
6350 case '1': case '2': case '3': case '4':
6351 case '5': case '6': case '7': case '8': case '9':
6354 bool isrel=(*RExC_parse=='R');
6357 num = atoi(RExC_parse);
6359 num = RExC_cpar - num;
6361 vFAIL("Reference to nonexistent or unclosed group");
6363 if (num > 9 && num >= RExC_npar)
6366 char * const parse_start = RExC_parse - 1; /* MJD */
6367 while (isDIGIT(*RExC_parse))
6371 if (num > (I32)RExC_rx->nparens)
6372 vFAIL("Reference to nonexistent group");
6373 /* People make this error all the time apparently.
6374 So we cant fail on it, even though we should
6376 else if (num >= RExC_cpar)
6377 vFAIL("Reference to unclosed group will always match");
6381 ret = reganode(pRExC_state,
6382 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
6386 /* override incorrect value set in reganode MJD */
6387 Set_Node_Offset(ret, parse_start+1);
6388 Set_Node_Cur_Length(ret); /* MJD */
6390 nextchar(pRExC_state);
6395 if (RExC_parse >= RExC_end)
6396 FAIL("Trailing \\");
6399 /* Do not generate "unrecognized" warnings here, we fall
6400 back into the quick-grab loop below */
6407 if (RExC_flags & RXf_PMf_EXTENDED) {
6408 while (RExC_parse < RExC_end && *RExC_parse != '\n')
6410 if (RExC_parse < RExC_end)
6416 register STRLEN len;
6421 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
6423 parse_start = RExC_parse - 1;
6429 ret = reg_node(pRExC_state,
6430 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
6432 for (len = 0, p = RExC_parse - 1;
6433 len < 127 && p < RExC_end;
6436 char * const oldp = p;
6438 if (RExC_flags & RXf_PMf_EXTENDED)
6439 p = regwhite(p, RExC_end);
6488 ender = ASCII_TO_NATIVE('\033');
6492 ender = ASCII_TO_NATIVE('\007');
6497 char* const e = strchr(p, '}');
6501 vFAIL("Missing right brace on \\x{}");
6504 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6505 | PERL_SCAN_DISALLOW_PREFIX;
6506 STRLEN numlen = e - p - 1;
6507 ender = grok_hex(p + 1, &numlen, &flags, NULL);
6514 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
6516 ender = grok_hex(p, &numlen, &flags, NULL);
6519 if (PL_encoding && ender < 0x100)
6520 goto recode_encoding;
6524 ender = UCHARAT(p++);
6525 ender = toCTRL(ender);
6527 case '0': case '1': case '2': case '3':case '4':
6528 case '5': case '6': case '7': case '8':case '9':
6530 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
6533 ender = grok_oct(p, &numlen, &flags, NULL);
6540 if (PL_encoding && ender < 0x100)
6541 goto recode_encoding;
6545 SV* enc = PL_encoding;
6546 ender = reg_recode((const char)(U8)ender, &enc);
6547 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
6548 vWARN(p, "Invalid escape in the specified encoding");
6554 FAIL("Trailing \\");
6557 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
6558 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
6559 goto normal_default;
6564 if (UTF8_IS_START(*p) && UTF) {
6566 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
6567 &numlen, UTF8_ALLOW_DEFAULT);
6574 if (RExC_flags & RXf_PMf_EXTENDED)
6575 p = regwhite(p, RExC_end);
6577 /* Prime the casefolded buffer. */
6578 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
6580 if (ISMULT2(p)) { /* Back off on ?+*. */
6585 /* Emit all the Unicode characters. */
6587 for (foldbuf = tmpbuf;
6589 foldlen -= numlen) {
6590 ender = utf8_to_uvchr(foldbuf, &numlen);
6592 const STRLEN unilen = reguni(pRExC_state, ender, s);
6595 /* In EBCDIC the numlen
6596 * and unilen can differ. */
6598 if (numlen >= foldlen)
6602 break; /* "Can't happen." */
6606 const STRLEN unilen = reguni(pRExC_state, ender, s);
6615 REGC((char)ender, s++);
6621 /* Emit all the Unicode characters. */
6623 for (foldbuf = tmpbuf;
6625 foldlen -= numlen) {
6626 ender = utf8_to_uvchr(foldbuf, &numlen);
6628 const STRLEN unilen = reguni(pRExC_state, ender, s);
6631 /* In EBCDIC the numlen
6632 * and unilen can differ. */
6634 if (numlen >= foldlen)
6642 const STRLEN unilen = reguni(pRExC_state, ender, s);
6651 REGC((char)ender, s++);
6655 Set_Node_Cur_Length(ret); /* MJD */
6656 nextchar(pRExC_state);
6658 /* len is STRLEN which is unsigned, need to copy to signed */
6661 vFAIL("Internal disaster");
6665 if (len == 1 && UNI_IS_INVARIANT(ender))
6669 RExC_size += STR_SZ(len);
6672 RExC_emit += STR_SZ(len);
6682 S_regwhite(char *p, const char *e)
6687 else if (*p == '#') {
6690 } while (p < e && *p != '\n');
6698 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
6699 Character classes ([:foo:]) can also be negated ([:^foo:]).
6700 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
6701 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
6702 but trigger failures because they are currently unimplemented. */
6704 #define POSIXCC_DONE(c) ((c) == ':')
6705 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
6706 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
6709 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
6712 I32 namedclass = OOB_NAMEDCLASS;
6714 if (value == '[' && RExC_parse + 1 < RExC_end &&
6715 /* I smell either [: or [= or [. -- POSIX has been here, right? */
6716 POSIXCC(UCHARAT(RExC_parse))) {
6717 const char c = UCHARAT(RExC_parse);
6718 char* const s = RExC_parse++;
6720 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
6722 if (RExC_parse == RExC_end)
6723 /* Grandfather lone [:, [=, [. */
6726 const char* const t = RExC_parse++; /* skip over the c */
6729 if (UCHARAT(RExC_parse) == ']') {
6730 const char *posixcc = s + 1;
6731 RExC_parse++; /* skip over the ending ] */
6734 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
6735 const I32 skip = t - posixcc;
6737 /* Initially switch on the length of the name. */
6740 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
6741 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
6744 /* Names all of length 5. */
6745 /* alnum alpha ascii blank cntrl digit graph lower
6746 print punct space upper */
6747 /* Offset 4 gives the best switch position. */
6748 switch (posixcc[4]) {
6750 if (memEQ(posixcc, "alph", 4)) /* alpha */
6751 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
6754 if (memEQ(posixcc, "spac", 4)) /* space */
6755 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
6758 if (memEQ(posixcc, "grap", 4)) /* graph */
6759 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
6762 if (memEQ(posixcc, "asci", 4)) /* ascii */
6763 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
6766 if (memEQ(posixcc, "blan", 4)) /* blank */
6767 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
6770 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
6771 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
6774 if (memEQ(posixcc, "alnu", 4)) /* alnum */
6775 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
6778 if (memEQ(posixcc, "lowe", 4)) /* lower */
6779 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
6780 else if (memEQ(posixcc, "uppe", 4)) /* upper */
6781 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
6784 if (memEQ(posixcc, "digi", 4)) /* digit */
6785 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
6786 else if (memEQ(posixcc, "prin", 4)) /* print */
6787 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
6788 else if (memEQ(posixcc, "punc", 4)) /* punct */
6789 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
6794 if (memEQ(posixcc, "xdigit", 6))
6795 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
6799 if (namedclass == OOB_NAMEDCLASS)
6800 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
6802 assert (posixcc[skip] == ':');
6803 assert (posixcc[skip+1] == ']');
6804 } else if (!SIZE_ONLY) {
6805 /* [[=foo=]] and [[.foo.]] are still future. */
6807 /* adjust RExC_parse so the warning shows after
6809 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
6811 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
6814 /* Maternal grandfather:
6815 * "[:" ending in ":" but not in ":]" */
6825 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
6828 if (POSIXCC(UCHARAT(RExC_parse))) {
6829 const char *s = RExC_parse;
6830 const char c = *s++;
6834 if (*s && c == *s && s[1] == ']') {
6835 if (ckWARN(WARN_REGEXP))
6837 "POSIX syntax [%c %c] belongs inside character classes",
6840 /* [[=foo=]] and [[.foo.]] are still future. */
6841 if (POSIXCC_NOTYET(c)) {
6842 /* adjust RExC_parse so the error shows after
6844 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
6846 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
6854 parse a class specification and produce either an ANYOF node that
6855 matches the pattern. If the pattern matches a single char only and
6856 that char is < 256 then we produce an EXACT node instead.
6859 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
6862 register UV value = 0;
6863 register UV nextvalue;
6864 register IV prevvalue = OOB_UNICODE;
6865 register IV range = 0;
6866 register regnode *ret;
6869 char *rangebegin = NULL;
6870 bool need_class = 0;
6873 bool optimize_invert = TRUE;
6874 AV* unicode_alternate = NULL;
6876 UV literal_endpoint = 0;
6878 UV stored = 0; /* number of chars stored in the class */
6880 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
6881 case we need to change the emitted regop to an EXACT. */
6882 const char * orig_parse = RExC_parse;
6883 GET_RE_DEBUG_FLAGS_DECL;
6885 PERL_UNUSED_ARG(depth);
6888 DEBUG_PARSE("clas");
6890 /* Assume we are going to generate an ANYOF node. */
6891 ret = reganode(pRExC_state, ANYOF, 0);
6894 ANYOF_FLAGS(ret) = 0;
6896 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
6900 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
6904 RExC_size += ANYOF_SKIP;
6905 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
6908 RExC_emit += ANYOF_SKIP;
6910 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
6912 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
6913 ANYOF_BITMAP_ZERO(ret);
6914 listsv = newSVpvs("# comment\n");
6917 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
6919 if (!SIZE_ONLY && POSIXCC(nextvalue))
6920 checkposixcc(pRExC_state);
6922 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
6923 if (UCHARAT(RExC_parse) == ']')
6927 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
6931 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
6934 rangebegin = RExC_parse;
6936 value = utf8n_to_uvchr((U8*)RExC_parse,
6937 RExC_end - RExC_parse,
6938 &numlen, UTF8_ALLOW_DEFAULT);
6939 RExC_parse += numlen;
6942 value = UCHARAT(RExC_parse++);
6944 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
6945 if (value == '[' && POSIXCC(nextvalue))
6946 namedclass = regpposixcc(pRExC_state, value);
6947 else if (value == '\\') {
6949 value = utf8n_to_uvchr((U8*)RExC_parse,
6950 RExC_end - RExC_parse,
6951 &numlen, UTF8_ALLOW_DEFAULT);
6952 RExC_parse += numlen;
6955 value = UCHARAT(RExC_parse++);
6956 /* Some compilers cannot handle switching on 64-bit integer
6957 * values, therefore value cannot be an UV. Yes, this will
6958 * be a problem later if we want switch on Unicode.
6959 * A similar issue a little bit later when switching on
6960 * namedclass. --jhi */
6961 switch ((I32)value) {
6962 case 'w': namedclass = ANYOF_ALNUM; break;
6963 case 'W': namedclass = ANYOF_NALNUM; break;
6964 case 's': namedclass = ANYOF_SPACE; break;
6965 case 'S': namedclass = ANYOF_NSPACE; break;
6966 case 'd': namedclass = ANYOF_DIGIT; break;
6967 case 'D': namedclass = ANYOF_NDIGIT; break;
6968 case 'N': /* Handle \N{NAME} in class */
6970 /* We only pay attention to the first char of
6971 multichar strings being returned. I kinda wonder
6972 if this makes sense as it does change the behaviour
6973 from earlier versions, OTOH that behaviour was broken
6975 UV v; /* value is register so we cant & it /grrr */
6976 if (reg_namedseq(pRExC_state, &v)) {
6986 if (RExC_parse >= RExC_end)
6987 vFAIL2("Empty \\%c{}", (U8)value);
6988 if (*RExC_parse == '{') {
6989 const U8 c = (U8)value;
6990 e = strchr(RExC_parse++, '}');
6992 vFAIL2("Missing right brace on \\%c{}", c);
6993 while (isSPACE(UCHARAT(RExC_parse)))
6995 if (e == RExC_parse)
6996 vFAIL2("Empty \\%c{}", c);
6998 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
7006 if (UCHARAT(RExC_parse) == '^') {
7009 value = value == 'p' ? 'P' : 'p'; /* toggle */
7010 while (isSPACE(UCHARAT(RExC_parse))) {
7015 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
7016 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
7019 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
7020 namedclass = ANYOF_MAX; /* no official name, but it's named */
7023 case 'n': value = '\n'; break;
7024 case 'r': value = '\r'; break;
7025 case 't': value = '\t'; break;
7026 case 'f': value = '\f'; break;
7027 case 'b': value = '\b'; break;
7028 case 'e': value = ASCII_TO_NATIVE('\033');break;
7029 case 'a': value = ASCII_TO_NATIVE('\007');break;
7031 if (*RExC_parse == '{') {
7032 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7033 | PERL_SCAN_DISALLOW_PREFIX;
7034 char * const e = strchr(RExC_parse++, '}');
7036 vFAIL("Missing right brace on \\x{}");
7038 numlen = e - RExC_parse;
7039 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
7043 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
7045 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
7046 RExC_parse += numlen;
7048 if (PL_encoding && value < 0x100)
7049 goto recode_encoding;
7052 value = UCHARAT(RExC_parse++);
7053 value = toCTRL(value);
7055 case '0': case '1': case '2': case '3': case '4':
7056 case '5': case '6': case '7': case '8': case '9':
7060 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
7061 RExC_parse += numlen;
7062 if (PL_encoding && value < 0x100)
7063 goto recode_encoding;
7068 SV* enc = PL_encoding;
7069 value = reg_recode((const char)(U8)value, &enc);
7070 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
7072 "Invalid escape in the specified encoding");
7076 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
7078 "Unrecognized escape \\%c in character class passed through",
7082 } /* end of \blah */
7088 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
7090 if (!SIZE_ONLY && !need_class)
7091 ANYOF_CLASS_ZERO(ret);
7095 /* a bad range like a-\d, a-[:digit:] ? */
7098 if (ckWARN(WARN_REGEXP)) {
7100 RExC_parse >= rangebegin ?
7101 RExC_parse - rangebegin : 0;
7103 "False [] range \"%*.*s\"",
7106 if (prevvalue < 256) {
7107 ANYOF_BITMAP_SET(ret, prevvalue);
7108 ANYOF_BITMAP_SET(ret, '-');
7111 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
7112 Perl_sv_catpvf(aTHX_ listsv,
7113 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
7117 range = 0; /* this was not a true range */
7121 const char *what = NULL;
7124 if (namedclass > OOB_NAMEDCLASS)
7125 optimize_invert = FALSE;
7126 /* Possible truncation here but in some 64-bit environments
7127 * the compiler gets heartburn about switch on 64-bit values.
7128 * A similar issue a little earlier when switching on value.
7130 switch ((I32)namedclass) {
7133 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
7135 for (value = 0; value < 256; value++)
7137 ANYOF_BITMAP_SET(ret, value);
7144 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
7146 for (value = 0; value < 256; value++)
7147 if (!isALNUM(value))
7148 ANYOF_BITMAP_SET(ret, value);
7155 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
7157 for (value = 0; value < 256; value++)
7158 if (isALNUMC(value))
7159 ANYOF_BITMAP_SET(ret, value);
7166 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
7168 for (value = 0; value < 256; value++)
7169 if (!isALNUMC(value))
7170 ANYOF_BITMAP_SET(ret, value);
7177 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
7179 for (value = 0; value < 256; value++)
7181 ANYOF_BITMAP_SET(ret, value);
7188 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
7190 for (value = 0; value < 256; value++)
7191 if (!isALPHA(value))
7192 ANYOF_BITMAP_SET(ret, value);
7199 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
7202 for (value = 0; value < 128; value++)
7203 ANYOF_BITMAP_SET(ret, value);
7205 for (value = 0; value < 256; value++) {
7207 ANYOF_BITMAP_SET(ret, value);
7216 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
7219 for (value = 128; value < 256; value++)
7220 ANYOF_BITMAP_SET(ret, value);
7222 for (value = 0; value < 256; value++) {
7223 if (!isASCII(value))
7224 ANYOF_BITMAP_SET(ret, value);
7233 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
7235 for (value = 0; value < 256; value++)
7237 ANYOF_BITMAP_SET(ret, value);
7244 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
7246 for (value = 0; value < 256; value++)
7247 if (!isBLANK(value))
7248 ANYOF_BITMAP_SET(ret, value);
7255 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
7257 for (value = 0; value < 256; value++)
7259 ANYOF_BITMAP_SET(ret, value);
7266 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
7268 for (value = 0; value < 256; value++)
7269 if (!isCNTRL(value))
7270 ANYOF_BITMAP_SET(ret, value);
7277 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
7279 /* consecutive digits assumed */
7280 for (value = '0'; value <= '9'; value++)
7281 ANYOF_BITMAP_SET(ret, value);
7288 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
7290 /* consecutive digits assumed */
7291 for (value = 0; value < '0'; value++)
7292 ANYOF_BITMAP_SET(ret, value);
7293 for (value = '9' + 1; value < 256; value++)
7294 ANYOF_BITMAP_SET(ret, value);
7301 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
7303 for (value = 0; value < 256; value++)
7305 ANYOF_BITMAP_SET(ret, value);
7312 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
7314 for (value = 0; value < 256; value++)
7315 if (!isGRAPH(value))
7316 ANYOF_BITMAP_SET(ret, value);
7323 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
7325 for (value = 0; value < 256; value++)
7327 ANYOF_BITMAP_SET(ret, value);
7334 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
7336 for (value = 0; value < 256; value++)
7337 if (!isLOWER(value))
7338 ANYOF_BITMAP_SET(ret, value);
7345 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
7347 for (value = 0; value < 256; value++)
7349 ANYOF_BITMAP_SET(ret, value);
7356 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
7358 for (value = 0; value < 256; value++)
7359 if (!isPRINT(value))
7360 ANYOF_BITMAP_SET(ret, value);
7367 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
7369 for (value = 0; value < 256; value++)
7370 if (isPSXSPC(value))
7371 ANYOF_BITMAP_SET(ret, value);
7378 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
7380 for (value = 0; value < 256; value++)
7381 if (!isPSXSPC(value))
7382 ANYOF_BITMAP_SET(ret, value);
7389 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
7391 for (value = 0; value < 256; value++)
7393 ANYOF_BITMAP_SET(ret, value);
7400 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
7402 for (value = 0; value < 256; value++)
7403 if (!isPUNCT(value))
7404 ANYOF_BITMAP_SET(ret, value);
7411 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
7413 for (value = 0; value < 256; value++)
7415 ANYOF_BITMAP_SET(ret, value);
7422 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
7424 for (value = 0; value < 256; value++)
7425 if (!isSPACE(value))
7426 ANYOF_BITMAP_SET(ret, value);
7433 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
7435 for (value = 0; value < 256; value++)
7437 ANYOF_BITMAP_SET(ret, value);
7444 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
7446 for (value = 0; value < 256; value++)
7447 if (!isUPPER(value))
7448 ANYOF_BITMAP_SET(ret, value);
7455 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
7457 for (value = 0; value < 256; value++)
7458 if (isXDIGIT(value))
7459 ANYOF_BITMAP_SET(ret, value);
7466 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
7468 for (value = 0; value < 256; value++)
7469 if (!isXDIGIT(value))
7470 ANYOF_BITMAP_SET(ret, value);
7476 /* this is to handle \p and \P */
7479 vFAIL("Invalid [::] class");
7483 /* Strings such as "+utf8::isWord\n" */
7484 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
7487 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
7490 } /* end of namedclass \blah */
7493 if (prevvalue > (IV)value) /* b-a */ {
7494 const int w = RExC_parse - rangebegin;
7495 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
7496 range = 0; /* not a valid range */
7500 prevvalue = value; /* save the beginning of the range */
7501 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
7502 RExC_parse[1] != ']') {
7505 /* a bad range like \w-, [:word:]- ? */
7506 if (namedclass > OOB_NAMEDCLASS) {
7507 if (ckWARN(WARN_REGEXP)) {
7509 RExC_parse >= rangebegin ?
7510 RExC_parse - rangebegin : 0;
7512 "False [] range \"%*.*s\"",
7516 ANYOF_BITMAP_SET(ret, '-');
7518 range = 1; /* yeah, it's a range! */
7519 continue; /* but do it the next time */
7523 /* now is the next time */
7524 /*stored += (value - prevvalue + 1);*/
7526 if (prevvalue < 256) {
7527 const IV ceilvalue = value < 256 ? value : 255;
7530 /* In EBCDIC [\x89-\x91] should include
7531 * the \x8e but [i-j] should not. */
7532 if (literal_endpoint == 2 &&
7533 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
7534 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
7536 if (isLOWER(prevvalue)) {
7537 for (i = prevvalue; i <= ceilvalue; i++)
7539 ANYOF_BITMAP_SET(ret, i);
7541 for (i = prevvalue; i <= ceilvalue; i++)
7543 ANYOF_BITMAP_SET(ret, i);
7548 for (i = prevvalue; i <= ceilvalue; i++) {
7549 if (!ANYOF_BITMAP_TEST(ret,i)) {
7551 ANYOF_BITMAP_SET(ret, i);
7555 if (value > 255 || UTF) {
7556 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
7557 const UV natvalue = NATIVE_TO_UNI(value);
7558 stored+=2; /* can't optimize this class */
7559 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
7560 if (prevnatvalue < natvalue) { /* what about > ? */
7561 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
7562 prevnatvalue, natvalue);
7564 else if (prevnatvalue == natvalue) {
7565 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
7567 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
7569 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
7571 #ifdef EBCDIC /* RD t/uni/fold ff and 6b */
7572 if (RExC_precomp[0] == ':' &&
7573 RExC_precomp[1] == '[' &&
7574 (f == 0xDF || f == 0x92)) {
7575 f = NATIVE_TO_UNI(f);
7578 /* If folding and foldable and a single
7579 * character, insert also the folded version
7580 * to the charclass. */
7582 #ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
7583 if ((RExC_precomp[0] == ':' &&
7584 RExC_precomp[1] == '[' &&
7586 (value == 0xFB05 || value == 0xFB06))) ?
7587 foldlen == ((STRLEN)UNISKIP(f) - 1) :
7588 foldlen == (STRLEN)UNISKIP(f) )
7590 if (foldlen == (STRLEN)UNISKIP(f))
7592 Perl_sv_catpvf(aTHX_ listsv,
7595 /* Any multicharacter foldings
7596 * require the following transform:
7597 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
7598 * where E folds into "pq" and F folds
7599 * into "rst", all other characters
7600 * fold to single characters. We save
7601 * away these multicharacter foldings,
7602 * to be later saved as part of the
7603 * additional "s" data. */
7606 if (!unicode_alternate)
7607 unicode_alternate = newAV();
7608 sv = newSVpvn((char*)foldbuf, foldlen);
7610 av_push(unicode_alternate, sv);
7614 /* If folding and the value is one of the Greek
7615 * sigmas insert a few more sigmas to make the
7616 * folding rules of the sigmas to work right.
7617 * Note that not all the possible combinations
7618 * are handled here: some of them are handled
7619 * by the standard folding rules, and some of
7620 * them (literal or EXACTF cases) are handled
7621 * during runtime in regexec.c:S_find_byclass(). */
7622 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
7623 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
7624 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
7625 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
7626 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
7628 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
7629 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
7630 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
7635 literal_endpoint = 0;
7639 range = 0; /* this range (if it was one) is done now */
7643 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
7645 RExC_size += ANYOF_CLASS_ADD_SKIP;
7647 RExC_emit += ANYOF_CLASS_ADD_SKIP;
7653 /****** !SIZE_ONLY AFTER HERE *********/
7655 if( stored == 1 && value < 256
7656 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
7658 /* optimize single char class to an EXACT node
7659 but *only* when its not a UTF/high char */
7660 const char * cur_parse= RExC_parse;
7661 RExC_emit = (regnode *)orig_emit;
7662 RExC_parse = (char *)orig_parse;
7663 ret = reg_node(pRExC_state,
7664 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
7665 RExC_parse = (char *)cur_parse;
7666 *STRING(ret)= (char)value;
7668 RExC_emit += STR_SZ(1);
7671 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
7672 if ( /* If the only flag is folding (plus possibly inversion). */
7673 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
7675 for (value = 0; value < 256; ++value) {
7676 if (ANYOF_BITMAP_TEST(ret, value)) {
7677 UV fold = PL_fold[value];
7680 ANYOF_BITMAP_SET(ret, fold);
7683 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
7686 /* optimize inverted simple patterns (e.g. [^a-z]) */
7687 if (optimize_invert &&
7688 /* If the only flag is inversion. */
7689 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
7690 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
7691 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
7692 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
7695 AV * const av = newAV();
7697 /* The 0th element stores the character class description
7698 * in its textual form: used later (regexec.c:Perl_regclass_swash())
7699 * to initialize the appropriate swash (which gets stored in
7700 * the 1st element), and also useful for dumping the regnode.
7701 * The 2nd element stores the multicharacter foldings,
7702 * used later (regexec.c:S_reginclass()). */
7703 av_store(av, 0, listsv);
7704 av_store(av, 1, NULL);
7705 av_store(av, 2, (SV*)unicode_alternate);
7706 rv = newRV_noinc((SV*)av);
7707 n = add_data(pRExC_state, 1, "s");
7708 RExC_rxi->data->data[n] = (void*)rv;
7715 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
7717 char* const retval = RExC_parse++;
7720 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
7721 RExC_parse[2] == '#') {
7722 while (*RExC_parse != ')') {
7723 if (RExC_parse == RExC_end)
7724 FAIL("Sequence (?#... not terminated");
7730 if (RExC_flags & RXf_PMf_EXTENDED) {
7731 if (isSPACE(*RExC_parse)) {
7735 else if (*RExC_parse == '#') {
7736 while (RExC_parse < RExC_end)
7737 if (*RExC_parse++ == '\n') break;
7746 - reg_node - emit a node
7748 STATIC regnode * /* Location. */
7749 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
7752 register regnode *ptr;
7753 regnode * const ret = RExC_emit;
7754 GET_RE_DEBUG_FLAGS_DECL;
7757 SIZE_ALIGN(RExC_size);
7762 if (OP(RExC_emit) == 255)
7763 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %s: %d ",
7764 reg_name[op], OP(RExC_emit));
7766 NODE_ALIGN_FILL(ret);
7768 FILL_ADVANCE_NODE(ptr, op);
7769 if (RExC_offsets) { /* MJD */
7770 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
7771 "reg_node", __LINE__,
7773 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
7774 ? "Overwriting end of array!\n" : "OK",
7775 (UV)(RExC_emit - RExC_emit_start),
7776 (UV)(RExC_parse - RExC_start),
7777 (UV)RExC_offsets[0]));
7778 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
7786 - reganode - emit a node with an argument
7788 STATIC regnode * /* Location. */
7789 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
7792 register regnode *ptr;
7793 regnode * const ret = RExC_emit;
7794 GET_RE_DEBUG_FLAGS_DECL;
7797 SIZE_ALIGN(RExC_size);
7802 assert(2==regarglen[op]+1);
7804 Anything larger than this has to allocate the extra amount.
7805 If we changed this to be:
7807 RExC_size += (1 + regarglen[op]);
7809 then it wouldn't matter. Its not clear what side effect
7810 might come from that so its not done so far.
7816 if (OP(RExC_emit) == 255)
7817 Perl_croak(aTHX_ "panic: reganode overwriting end of allocated program space");
7819 NODE_ALIGN_FILL(ret);
7821 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
7822 if (RExC_offsets) { /* MJD */
7823 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
7827 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
7828 "Overwriting end of array!\n" : "OK",
7829 (UV)(RExC_emit - RExC_emit_start),
7830 (UV)(RExC_parse - RExC_start),
7831 (UV)RExC_offsets[0]));
7832 Set_Cur_Node_Offset;
7840 - reguni - emit (if appropriate) a Unicode character
7843 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
7846 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
7850 - reginsert - insert an operator in front of already-emitted operand
7852 * Means relocating the operand.
7855 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
7858 register regnode *src;
7859 register regnode *dst;
7860 register regnode *place;
7861 const int offset = regarglen[(U8)op];
7862 const int size = NODE_STEP_REGNODE + offset;
7863 GET_RE_DEBUG_FLAGS_DECL;
7864 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
7865 DEBUG_PARSE_FMT("inst"," - %s",reg_name[op]);
7874 if (RExC_open_parens) {
7876 DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);
7877 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
7878 if ( RExC_open_parens[paren] >= opnd ) {
7879 DEBUG_PARSE_FMT("open"," - %d",size);
7880 RExC_open_parens[paren] += size;
7882 DEBUG_PARSE_FMT("open"," - %s","ok");
7884 if ( RExC_close_parens[paren] >= opnd ) {
7885 DEBUG_PARSE_FMT("close"," - %d",size);
7886 RExC_close_parens[paren] += size;
7888 DEBUG_PARSE_FMT("close"," - %s","ok");
7893 while (src > opnd) {
7894 StructCopy(--src, --dst, regnode);
7895 if (RExC_offsets) { /* MJD 20010112 */
7896 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
7900 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
7901 ? "Overwriting end of array!\n" : "OK",
7902 (UV)(src - RExC_emit_start),
7903 (UV)(dst - RExC_emit_start),
7904 (UV)RExC_offsets[0]));
7905 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
7906 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
7911 place = opnd; /* Op node, where operand used to be. */
7912 if (RExC_offsets) { /* MJD */
7913 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
7917 (UV)(place - RExC_emit_start) > RExC_offsets[0]
7918 ? "Overwriting end of array!\n" : "OK",
7919 (UV)(place - RExC_emit_start),
7920 (UV)(RExC_parse - RExC_start),
7921 (UV)RExC_offsets[0]));
7922 Set_Node_Offset(place, RExC_parse);
7923 Set_Node_Length(place, 1);
7925 src = NEXTOPER(place);
7926 FILL_ADVANCE_NODE(place, op);
7927 Zero(src, offset, regnode);
7931 - regtail - set the next-pointer at the end of a node chain of p to val.
7932 - SEE ALSO: regtail_study
7934 /* TODO: All three parms should be const */
7936 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
7939 register regnode *scan;
7940 GET_RE_DEBUG_FLAGS_DECL;
7942 PERL_UNUSED_ARG(depth);
7948 /* Find last node. */
7951 regnode * const temp = regnext(scan);
7953 SV * const mysv=sv_newmortal();
7954 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
7955 regprop(RExC_rx, mysv, scan);
7956 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
7957 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
7958 (temp == NULL ? "->" : ""),
7959 (temp == NULL ? reg_name[OP(val)] : "")
7967 if (reg_off_by_arg[OP(scan)]) {
7968 ARG_SET(scan, val - scan);
7971 NEXT_OFF(scan) = val - scan;
7977 - regtail_study - set the next-pointer at the end of a node chain of p to val.
7978 - Look for optimizable sequences at the same time.
7979 - currently only looks for EXACT chains.
7981 This is expermental code. The idea is to use this routine to perform
7982 in place optimizations on branches and groups as they are constructed,
7983 with the long term intention of removing optimization from study_chunk so
7984 that it is purely analytical.
7986 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
7987 to control which is which.
7990 /* TODO: All four parms should be const */
7993 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
7996 register regnode *scan;
7998 #ifdef EXPERIMENTAL_INPLACESCAN
8002 GET_RE_DEBUG_FLAGS_DECL;
8008 /* Find last node. */
8012 regnode * const temp = regnext(scan);
8013 #ifdef EXPERIMENTAL_INPLACESCAN
8014 if (PL_regkind[OP(scan)] == EXACT)
8015 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
8023 if( exact == PSEUDO )
8025 else if ( exact != OP(scan) )
8034 SV * const mysv=sv_newmortal();
8035 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
8036 regprop(RExC_rx, mysv, scan);
8037 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
8038 SvPV_nolen_const(mysv),
8047 SV * const mysv_val=sv_newmortal();
8048 DEBUG_PARSE_MSG("");
8049 regprop(RExC_rx, mysv_val, val);
8050 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
8051 SvPV_nolen_const(mysv_val),
8052 (IV)REG_NODE_NUM(val),
8056 if (reg_off_by_arg[OP(scan)]) {
8057 ARG_SET(scan, val - scan);
8060 NEXT_OFF(scan) = val - scan;
8068 - regcurly - a little FSA that accepts {\d+,?\d*}
8071 S_regcurly(register const char *s)
8090 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
8093 Perl_regdump(pTHX_ const regexp *r)
8097 SV * const sv = sv_newmortal();
8098 SV *dsv= sv_newmortal();
8101 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
8103 /* Header fields of interest. */
8104 if (r->anchored_substr) {
8105 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
8106 RE_SV_DUMPLEN(r->anchored_substr), 30);
8107 PerlIO_printf(Perl_debug_log,
8108 "anchored %s%s at %"IVdf" ",
8109 s, RE_SV_TAIL(r->anchored_substr),
8110 (IV)r->anchored_offset);
8111 } else if (r->anchored_utf8) {
8112 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
8113 RE_SV_DUMPLEN(r->anchored_utf8), 30);
8114 PerlIO_printf(Perl_debug_log,
8115 "anchored utf8 %s%s at %"IVdf" ",
8116 s, RE_SV_TAIL(r->anchored_utf8),
8117 (IV)r->anchored_offset);
8119 if (r->float_substr) {
8120 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
8121 RE_SV_DUMPLEN(r->float_substr), 30);
8122 PerlIO_printf(Perl_debug_log,
8123 "floating %s%s at %"IVdf"..%"UVuf" ",
8124 s, RE_SV_TAIL(r->float_substr),
8125 (IV)r->float_min_offset, (UV)r->float_max_offset);
8126 } else if (r->float_utf8) {
8127 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
8128 RE_SV_DUMPLEN(r->float_utf8), 30);
8129 PerlIO_printf(Perl_debug_log,
8130 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
8131 s, RE_SV_TAIL(r->float_utf8),
8132 (IV)r->float_min_offset, (UV)r->float_max_offset);
8134 if (r->check_substr || r->check_utf8)
8135 PerlIO_printf(Perl_debug_log,
8137 (r->check_substr == r->float_substr
8138 && r->check_utf8 == r->float_utf8
8139 ? "(checking floating" : "(checking anchored"));
8140 if (r->extflags & RXf_NOSCAN)
8141 PerlIO_printf(Perl_debug_log, " noscan");
8142 if (r->extflags & RXf_CHECK_ALL)
8143 PerlIO_printf(Perl_debug_log, " isall");
8144 if (r->check_substr || r->check_utf8)
8145 PerlIO_printf(Perl_debug_log, ") ");
8147 if (ri->regstclass) {
8148 regprop(r, sv, ri->regstclass);
8149 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
8151 if (r->extflags & RXf_ANCH) {
8152 PerlIO_printf(Perl_debug_log, "anchored");
8153 if (r->extflags & RXf_ANCH_BOL)
8154 PerlIO_printf(Perl_debug_log, "(BOL)");
8155 if (r->extflags & RXf_ANCH_MBOL)
8156 PerlIO_printf(Perl_debug_log, "(MBOL)");
8157 if (r->extflags & RXf_ANCH_SBOL)
8158 PerlIO_printf(Perl_debug_log, "(SBOL)");
8159 if (r->extflags & RXf_ANCH_GPOS)
8160 PerlIO_printf(Perl_debug_log, "(GPOS)");
8161 PerlIO_putc(Perl_debug_log, ' ');
8163 if (r->extflags & RXf_GPOS_SEEN)
8164 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
8165 if (r->intflags & PREGf_SKIP)
8166 PerlIO_printf(Perl_debug_log, "plus ");
8167 if (r->intflags & PREGf_IMPLICIT)
8168 PerlIO_printf(Perl_debug_log, "implicit ");
8169 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
8170 if (r->extflags & RXf_EVAL_SEEN)
8171 PerlIO_printf(Perl_debug_log, "with eval ");
8172 PerlIO_printf(Perl_debug_log, "\n");
8174 PERL_UNUSED_CONTEXT;
8176 #endif /* DEBUGGING */
8180 - regprop - printable representation of opcode
8183 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
8188 RXi_GET_DECL(prog,progi);
8189 GET_RE_DEBUG_FLAGS_DECL;
8192 sv_setpvn(sv, "", 0);
8194 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
8195 /* It would be nice to FAIL() here, but this may be called from
8196 regexec.c, and it would be hard to supply pRExC_state. */
8197 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
8198 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
8200 k = PL_regkind[OP(o)];
8203 SV * const dsv = sv_2mortal(newSVpvs(""));
8204 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
8205 * is a crude hack but it may be the best for now since
8206 * we have no flag "this EXACTish node was UTF-8"
8208 const char * const s =
8209 pv_pretty(dsv, STRING(o), STR_LEN(o), 60,
8210 PL_colors[0], PL_colors[1],
8211 PERL_PV_ESCAPE_UNI_DETECT |
8212 PERL_PV_PRETTY_ELIPSES |
8215 Perl_sv_catpvf(aTHX_ sv, " %s", s );
8216 } else if (k == TRIE) {
8217 /* print the details of the trie in dumpuntil instead, as
8218 * progi->data isn't available here */
8219 const char op = OP(o);
8220 const I32 n = ARG(o);
8221 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
8222 (reg_ac_data *)progi->data->data[n] :
8224 const reg_trie_data * const trie = !IS_TRIE_AC(op) ?
8225 (reg_trie_data*)progi->data->data[n] :
8228 Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]);
8229 DEBUG_TRIE_COMPILE_r(
8230 Perl_sv_catpvf(aTHX_ sv,
8231 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
8232 (UV)trie->startstate,
8233 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
8234 (UV)trie->wordcount,
8237 (UV)TRIE_CHARCOUNT(trie),
8238 (UV)trie->uniquecharcount
8241 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
8243 int rangestart = -1;
8244 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
8245 Perl_sv_catpvf(aTHX_ sv, "[");
8246 for (i = 0; i <= 256; i++) {
8247 if (i < 256 && BITMAP_TEST(bitmap,i)) {
8248 if (rangestart == -1)
8250 } else if (rangestart != -1) {
8251 if (i <= rangestart + 3)
8252 for (; rangestart < i; rangestart++)
8253 put_byte(sv, rangestart);
8255 put_byte(sv, rangestart);
8257 put_byte(sv, i - 1);
8262 Perl_sv_catpvf(aTHX_ sv, "]");
8265 } else if (k == CURLY) {
8266 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
8267 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
8268 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
8270 else if (k == WHILEM && o->flags) /* Ordinal/of */
8271 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
8272 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT)
8273 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
8274 else if (k == GOSUB)
8275 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
8276 else if (k == VERB) {
8278 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
8279 (SV*)progi->data->data[ ARG( o ) ]);
8280 } else if (k == LOGICAL)
8281 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
8282 else if (k == ANYOF) {
8283 int i, rangestart = -1;
8284 const U8 flags = ANYOF_FLAGS(o);
8286 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
8287 static const char * const anyofs[] = {
8320 if (flags & ANYOF_LOCALE)
8321 sv_catpvs(sv, "{loc}");
8322 if (flags & ANYOF_FOLD)
8323 sv_catpvs(sv, "{i}");
8324 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
8325 if (flags & ANYOF_INVERT)
8327 for (i = 0; i <= 256; i++) {
8328 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
8329 if (rangestart == -1)
8331 } else if (rangestart != -1) {
8332 if (i <= rangestart + 3)
8333 for (; rangestart < i; rangestart++)
8334 put_byte(sv, rangestart);
8336 put_byte(sv, rangestart);
8338 put_byte(sv, i - 1);
8344 if (o->flags & ANYOF_CLASS)
8345 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
8346 if (ANYOF_CLASS_TEST(o,i))
8347 sv_catpv(sv, anyofs[i]);
8349 if (flags & ANYOF_UNICODE)
8350 sv_catpvs(sv, "{unicode}");
8351 else if (flags & ANYOF_UNICODE_ALL)
8352 sv_catpvs(sv, "{unicode_all}");
8356 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
8360 U8 s[UTF8_MAXBYTES_CASE+1];
8362 for (i = 0; i <= 256; i++) { /* just the first 256 */
8363 uvchr_to_utf8(s, i);
8365 if (i < 256 && swash_fetch(sw, s, TRUE)) {
8366 if (rangestart == -1)
8368 } else if (rangestart != -1) {
8369 if (i <= rangestart + 3)
8370 for (; rangestart < i; rangestart++) {
8371 const U8 * const e = uvchr_to_utf8(s,rangestart);
8373 for(p = s; p < e; p++)
8377 const U8 *e = uvchr_to_utf8(s,rangestart);
8379 for (p = s; p < e; p++)
8382 e = uvchr_to_utf8(s, i-1);
8383 for (p = s; p < e; p++)
8390 sv_catpvs(sv, "..."); /* et cetera */
8394 char *s = savesvpv(lv);
8395 char * const origs = s;
8397 while (*s && *s != '\n')
8401 const char * const t = ++s;
8419 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
8421 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
8422 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
8424 PERL_UNUSED_CONTEXT;
8425 PERL_UNUSED_ARG(sv);
8427 PERL_UNUSED_ARG(prog);
8428 #endif /* DEBUGGING */
8432 Perl_re_intuit_string(pTHX_ regexp *prog)
8433 { /* Assume that RE_INTUIT is set */
8435 GET_RE_DEBUG_FLAGS_DECL;
8436 PERL_UNUSED_CONTEXT;
8440 const char * const s = SvPV_nolen_const(prog->check_substr
8441 ? prog->check_substr : prog->check_utf8);
8443 if (!PL_colorset) reginitcolors();
8444 PerlIO_printf(Perl_debug_log,
8445 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
8447 prog->check_substr ? "" : "utf8 ",
8448 PL_colors[5],PL_colors[0],
8451 (strlen(s) > 60 ? "..." : ""));
8454 return prog->check_substr ? prog->check_substr : prog->check_utf8;
8458 pregfree - free a regexp
8460 See regdupe below if you change anything here.
8464 Perl_pregfree(pTHX_ struct regexp *r)
8468 GET_RE_DEBUG_FLAGS_DECL;
8470 if (!r || (--r->refcnt > 0))
8476 SV *dsv= sv_newmortal();
8477 RE_PV_QUOTED_DECL(s, (r->extflags & RXf_UTF8),
8478 dsv, r->precomp, r->prelen, 60);
8479 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
8480 PL_colors[4],PL_colors[5],s);
8484 /* gcov results gave these as non-null 100% of the time, so there's no
8485 optimisation in checking them before calling Safefree */
8486 Safefree(r->precomp);
8487 Safefree(ri->offsets); /* 20010421 MJD */
8488 RX_MATCH_COPY_FREE(r);
8489 #ifdef PERL_OLD_COPY_ON_WRITE
8491 SvREFCNT_dec(r->saved_copy);
8494 if (r->anchored_substr)
8495 SvREFCNT_dec(r->anchored_substr);
8496 if (r->anchored_utf8)
8497 SvREFCNT_dec(r->anchored_utf8);
8498 if (r->float_substr)
8499 SvREFCNT_dec(r->float_substr);
8501 SvREFCNT_dec(r->float_utf8);
8502 Safefree(r->substrs);
8505 SvREFCNT_dec(r->paren_names);
8507 int n = ri->data->count;
8508 PAD* new_comppad = NULL;
8513 /* If you add a ->what type here, update the comment in regcomp.h */
8514 switch (ri->data->what[n]) {
8517 SvREFCNT_dec((SV*)ri->data->data[n]);
8520 Safefree(ri->data->data[n]);
8523 new_comppad = (AV*)ri->data->data[n];
8526 if (new_comppad == NULL)
8527 Perl_croak(aTHX_ "panic: pregfree comppad");
8528 PAD_SAVE_LOCAL(old_comppad,
8529 /* Watch out for global destruction's random ordering. */
8530 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
8533 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
8536 op_free((OP_4tree*)ri->data->data[n]);
8538 PAD_RESTORE_LOCAL(old_comppad);
8539 SvREFCNT_dec((SV*)new_comppad);
8545 { /* Aho Corasick add-on structure for a trie node.
8546 Used in stclass optimization only */
8548 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
8550 refcount = --aho->refcount;
8553 Safefree(aho->states);
8554 Safefree(aho->fail);
8555 aho->trie=NULL; /* not necessary to free this as it is
8556 handled by the 't' case */
8557 Safefree(ri->data->data[n]); /* do this last!!!! */
8558 Safefree(ri->regstclass);
8564 /* trie structure. */
8566 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
8568 refcount = --trie->refcount;
8571 Safefree(trie->charmap);
8572 if (trie->widecharmap)
8573 SvREFCNT_dec((SV*)trie->widecharmap);
8574 Safefree(trie->states);
8575 Safefree(trie->trans);
8577 Safefree(trie->bitmap);
8579 Safefree(trie->wordlen);
8581 Safefree(trie->jump);
8583 Safefree(trie->nextword);
8586 SvREFCNT_dec((SV*)trie->words);
8587 if (trie->revcharmap)
8588 SvREFCNT_dec((SV*)trie->revcharmap);
8590 Safefree(ri->data->data[n]); /* do this last!!!! */
8595 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
8598 Safefree(ri->data->what);
8601 Safefree(r->startp);
8604 Safefree(ri->swap->startp);
8605 Safefree(ri->swap->endp);
8612 #define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t))
8613 #define av_dup_inc(s,t) (AV*)SvREFCNT_inc(sv_dup((SV*)s,t))
8614 #define hv_dup_inc(s,t) (HV*)SvREFCNT_inc(sv_dup((SV*)s,t))
8615 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
8618 regdupe - duplicate a regexp.
8620 This routine is called by sv.c's re_dup and is expected to clone a
8621 given regexp structure. It is a no-op when not under USE_ITHREADS.
8622 (Originally this *was* re_dup() for change history see sv.c)
8624 See pregfree() above if you change anything here.
8626 #if defined(USE_ITHREADS)
8628 Perl_regdupe(pTHX_ const regexp *r, CLONE_PARAMS *param)
8632 regexp_internal *reti;
8634 struct reg_substr_datum *s;
8638 return (REGEXP *)NULL;
8640 if ((ret = (REGEXP *)ptr_table_fetch(PL_ptr_table, r)))
8643 len = ri->offsets[0];
8644 npar = r->nparens+1;
8646 Newxz(ret, 1, regexp);
8647 Newxc(reti, sizeof(regexp_internal) + (len+1)*sizeof(regnode), char, regexp_internal);
8649 Copy(ri->program, reti->program, len+1, regnode);
8651 Newx(ret->startp, npar, I32);
8652 Copy(r->startp, ret->startp, npar, I32);
8653 Newx(ret->endp, npar, I32);
8654 Copy(r->startp, ret->startp, npar, I32);
8656 Newx(reti->swap, 1, regexp_paren_ofs);
8657 /* no need to copy these */
8658 Newx(reti->swap->startp, npar, I32);
8659 Newx(reti->swap->endp, npar, I32);
8664 Newx(ret->substrs, 1, struct reg_substr_data);
8665 for (s = ret->substrs->data, i = 0; i < 3; i++, s++) {
8666 s->min_offset = r->substrs->data[i].min_offset;
8667 s->max_offset = r->substrs->data[i].max_offset;
8668 s->end_shift = r->substrs->data[i].end_shift;
8669 s->substr = sv_dup_inc(r->substrs->data[i].substr, param);
8670 s->utf8_substr = sv_dup_inc(r->substrs->data[i].utf8_substr, param);
8673 reti->regstclass = NULL;
8676 const int count = ri->data->count;
8679 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
8680 char, struct reg_data);
8681 Newx(d->what, count, U8);
8684 for (i = 0; i < count; i++) {
8685 d->what[i] = ri->data->what[i];
8686 switch (d->what[i]) {
8687 /* legal options are one of: sSfpontT
8688 see also regcomp.h and pregfree() */
8691 d->data[i] = sv_dup_inc((SV *)ri->data->data[i], param);
8694 d->data[i] = av_dup_inc((AV *)ri->data->data[i], param);
8697 /* This is cheating. */
8698 Newx(d->data[i], 1, struct regnode_charclass_class);
8699 StructCopy(ri->data->data[i], d->data[i],
8700 struct regnode_charclass_class);
8701 reti->regstclass = (regnode*)d->data[i];
8704 /* Compiled op trees are readonly and in shared memory,
8705 and can thus be shared without duplication. */
8707 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
8711 d->data[i] = ri->data->data[i];
8714 d->data[i] = ri->data->data[i];
8716 ((reg_trie_data*)d->data[i])->refcount++;
8720 d->data[i] = ri->data->data[i];
8722 ((reg_ac_data*)d->data[i])->refcount++;
8724 /* Trie stclasses are readonly and can thus be shared
8725 * without duplication. We free the stclass in pregfree
8726 * when the corresponding reg_ac_data struct is freed.
8728 reti->regstclass= ri->regstclass;
8731 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
8740 Newx(reti->offsets, 2*len+1, U32);
8741 Copy(ri->offsets, reti->offsets, 2*len+1, U32);
8743 ret->precomp = SAVEPVN(r->precomp, r->prelen);
8744 ret->refcnt = r->refcnt;
8745 ret->minlen = r->minlen;
8746 ret->minlenret = r->minlenret;
8747 ret->prelen = r->prelen;
8748 ret->nparens = r->nparens;
8749 ret->lastparen = r->lastparen;
8750 ret->lastcloseparen = r->lastcloseparen;
8751 ret->intflags = r->intflags;
8752 ret->extflags = r->extflags;
8754 ret->sublen = r->sublen;
8756 ret->engine = r->engine;
8758 ret->paren_names = hv_dup_inc(r->paren_names, param);
8760 if (RX_MATCH_COPIED(ret))
8761 ret->subbeg = SAVEPVN(r->subbeg, r->sublen);
8764 #ifdef PERL_OLD_COPY_ON_WRITE
8765 ret->saved_copy = NULL;
8768 ptr_table_store(PL_ptr_table, r, ret);
8776 converts a regexp embedded in a MAGIC struct to its stringified form,
8777 caching the converted form in the struct and returns the cached
8780 If lp is nonnull then it is used to return the length of the
8783 If flags is nonnull and the returned string contains UTF8 then
8784 (flags & 1) will be true.
8786 If haseval is nonnull then it is used to return whether the pattern
8789 Normally called via macro:
8791 CALLREG_STRINGIFY(mg,0,0);
8795 CALLREG_AS_STR(mg,lp,flags,haseval)
8797 See sv_2pv_flags() in sv.c for an example of internal usage.
8802 Perl_reg_stringify(pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval ) {
8804 const regexp * const re = (regexp *)mg->mg_obj;
8805 RXi_GET_DECL(re,ri);
8808 const char *fptr = "msix";
8813 bool need_newline = 0;
8814 U16 reganch = (U16)((re->extflags & RXf_PMf_COMPILETIME) >> 12);
8816 while((ch = *fptr++)) {
8818 reflags[left++] = ch;
8821 reflags[right--] = ch;
8826 reflags[left] = '-';
8830 mg->mg_len = re->prelen + 4 + left;
8832 * If /x was used, we have to worry about a regex ending with a
8833 * comment later being embedded within another regex. If so, we don't
8834 * want this regex's "commentization" to leak out to the right part of
8835 * the enclosing regex, we must cap it with a newline.
8837 * So, if /x was used, we scan backwards from the end of the regex. If
8838 * we find a '#' before we find a newline, we need to add a newline
8839 * ourself. If we find a '\n' first (or if we don't find '#' or '\n'),
8840 * we don't need to add anything. -jfriedl
8842 if (PMf_EXTENDED & re->extflags) {
8843 const char *endptr = re->precomp + re->prelen;
8844 while (endptr >= re->precomp) {
8845 const char c = *(endptr--);
8847 break; /* don't need another */
8849 /* we end while in a comment, so we need a newline */
8850 mg->mg_len++; /* save space for it */
8851 need_newline = 1; /* note to add it */
8857 Newx(mg->mg_ptr, mg->mg_len + 1 + left, char);
8858 mg->mg_ptr[0] = '(';
8859 mg->mg_ptr[1] = '?';
8860 Copy(reflags, mg->mg_ptr+2, left, char);
8861 *(mg->mg_ptr+left+2) = ':';
8862 Copy(re->precomp, mg->mg_ptr+3+left, re->prelen, char);
8864 mg->mg_ptr[mg->mg_len - 2] = '\n';
8865 mg->mg_ptr[mg->mg_len - 1] = ')';
8866 mg->mg_ptr[mg->mg_len] = 0;
8869 *haseval = ri->program[0].next_off;
8871 *flags = ((re->extflags & RXf_UTF8) ? 1 : 0);
8879 #ifndef PERL_IN_XSUB_RE
8881 - regnext - dig the "next" pointer out of a node
8884 Perl_regnext(pTHX_ register regnode *p)
8887 register I32 offset;
8892 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
8901 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
8904 STRLEN l1 = strlen(pat1);
8905 STRLEN l2 = strlen(pat2);
8908 const char *message;
8914 Copy(pat1, buf, l1 , char);
8915 Copy(pat2, buf + l1, l2 , char);
8916 buf[l1 + l2] = '\n';
8917 buf[l1 + l2 + 1] = '\0';
8919 /* ANSI variant takes additional second argument */
8920 va_start(args, pat2);
8924 msv = vmess(buf, &args);
8926 message = SvPV_const(msv,l1);
8929 Copy(message, buf, l1 , char);
8930 buf[l1-1] = '\0'; /* Overwrite \n */
8931 Perl_croak(aTHX_ "%s", buf);
8934 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
8936 #ifndef PERL_IN_XSUB_RE
8938 Perl_save_re_context(pTHX)
8942 struct re_save_state *state;
8944 SAVEVPTR(PL_curcop);
8945 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
8947 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
8948 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
8949 SSPUSHINT(SAVEt_RE_STATE);
8951 Copy(&PL_reg_state, state, 1, struct re_save_state);
8953 PL_reg_start_tmp = 0;
8954 PL_reg_start_tmpl = 0;
8955 PL_reg_oldsaved = NULL;
8956 PL_reg_oldsavedlen = 0;
8958 PL_reg_leftiter = 0;
8959 PL_reg_poscache = NULL;
8960 PL_reg_poscache_size = 0;
8961 #ifdef PERL_OLD_COPY_ON_WRITE
8965 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
8967 const REGEXP * const rx = PM_GETRE(PL_curpm);
8970 for (i = 1; i <= rx->nparens; i++) {
8971 char digits[TYPE_CHARS(long)];
8972 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
8973 GV *const *const gvp
8974 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
8977 GV * const gv = *gvp;
8978 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
8988 clear_re(pTHX_ void *r)
8991 ReREFCNT_dec((regexp *)r);
8997 S_put_byte(pTHX_ SV *sv, int c)
8999 if (isCNTRL(c) || c == 255 || !isPRINT(c))
9000 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
9001 else if (c == '-' || c == ']' || c == '\\' || c == '^')
9002 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
9004 Perl_sv_catpvf(aTHX_ sv, "%c", c);
9008 #define CLEAR_OPTSTART \
9009 if (optstart) STMT_START { \
9010 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
9014 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
9016 STATIC const regnode *
9017 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
9018 const regnode *last, const regnode *plast,
9019 SV* sv, I32 indent, U32 depth)
9022 register U8 op = PSEUDO; /* Arbitrary non-END op. */
9023 register const regnode *next;
9024 const regnode *optstart= NULL;
9026 GET_RE_DEBUG_FLAGS_DECL;
9028 #ifdef DEBUG_DUMPUNTIL
9029 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
9030 last ? last-start : 0,plast ? plast-start : 0);
9033 if (plast && plast < last)
9036 while (PL_regkind[op] != END && (!last || node < last)) {
9037 /* While that wasn't END last time... */
9041 if (op == CLOSE || op == WHILEM)
9043 next = regnext((regnode *)node);
9046 if (OP(node) == OPTIMIZED) {
9047 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
9054 regprop(r, sv, node);
9055 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
9056 (int)(2*indent + 1), "", SvPVX_const(sv));
9058 if (OP(node) != OPTIMIZED) {
9059 if (next == NULL) /* Next ptr. */
9060 PerlIO_printf(Perl_debug_log, "(0)");
9061 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
9062 PerlIO_printf(Perl_debug_log, "(FAIL)");
9064 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
9066 /*if (PL_regkind[(U8)op] != TRIE)*/
9067 (void)PerlIO_putc(Perl_debug_log, '\n');
9071 if (PL_regkind[(U8)op] == BRANCHJ) {
9074 register const regnode *nnode = (OP(next) == LONGJMP
9075 ? regnext((regnode *)next)
9077 if (last && nnode > last)
9079 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
9082 else if (PL_regkind[(U8)op] == BRANCH) {
9084 DUMPUNTIL(NEXTOPER(node), next);
9086 else if ( PL_regkind[(U8)op] == TRIE ) {
9087 const regnode *this_trie = node;
9088 const char op = OP(node);
9089 const I32 n = ARG(node);
9090 const reg_ac_data * const ac = op>=AHOCORASICK ?
9091 (reg_ac_data *)ri->data->data[n] :
9093 const reg_trie_data * const trie = op<AHOCORASICK ?
9094 (reg_trie_data*)ri->data->data[n] :
9096 const regnode *nextbranch= NULL;
9098 sv_setpvn(sv, "", 0);
9099 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
9100 SV ** const elem_ptr = av_fetch(trie->words,word_idx,0);
9102 PerlIO_printf(Perl_debug_log, "%*s%s ",
9103 (int)(2*(indent+3)), "",
9104 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
9105 PL_colors[0], PL_colors[1],
9106 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
9107 PERL_PV_PRETTY_ELIPSES |
9113 U16 dist= trie->jump[word_idx+1];
9114 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
9115 (UV)((dist ? this_trie + dist : next) - start));
9118 nextbranch= this_trie + trie->jump[0];
9119 DUMPUNTIL(this_trie + dist, nextbranch);
9121 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
9122 nextbranch= regnext((regnode *)nextbranch);
9124 PerlIO_printf(Perl_debug_log, "\n");
9127 if (last && next > last)
9132 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
9133 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
9134 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
9136 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
9138 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
9140 else if ( op == PLUS || op == STAR) {
9141 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
9143 else if (op == ANYOF) {
9144 /* arglen 1 + class block */
9145 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
9146 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
9147 node = NEXTOPER(node);
9149 else if (PL_regkind[(U8)op] == EXACT) {
9150 /* Literal string, where present. */
9151 node += NODE_SZ_STR(node) - 1;
9152 node = NEXTOPER(node);
9155 node = NEXTOPER(node);
9156 node += regarglen[(U8)op];
9158 if (op == CURLYX || op == OPEN)
9162 #ifdef DEBUG_DUMPUNTIL
9163 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
9168 #endif /* DEBUGGING */
9172 * c-indentation-style: bsd
9174 * indent-tabs-mode: t
9177 * ex: set ts=8 sts=4 sw=4 noet: