5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
8 /* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
37 * pregcomp and pregexec -- regsub and regerror are not used in perl
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
57 **** Alterations to Henry's code are...
59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
71 #define PERL_IN_REGCOMP_C
74 #ifndef PERL_IN_XSUB_RE
79 #ifdef PERL_IN_XSUB_RE
90 # if defined(BUGGY_MSC6)
91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
92 # pragma optimize("a",off)
93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
94 # pragma optimize("w",on )
95 # endif /* BUGGY_MSC6 */
102 typedef struct RExC_state_t {
103 U32 flags; /* are we folding, multilining? */
104 char *precomp; /* uncompiled string. */
105 regexp *rx; /* perl core regexp structure */
106 regexp_internal *rxi; /* internal data for regexp object pprivate field */
107 char *start; /* Start of input for compile */
108 char *end; /* End of input for compile */
109 char *parse; /* Input-scan pointer. */
110 I32 whilem_seen; /* number of WHILEM in this expr */
111 regnode *emit_start; /* Start of emitted-code area */
112 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
113 I32 naughty; /* How bad is this pattern? */
114 I32 sawback; /* Did we see \1, ...? */
116 I32 size; /* Code size. */
117 I32 npar; /* Capture buffer count, (OPEN). */
118 I32 cpar; /* Capture buffer count, (CLOSE). */
119 I32 nestroot; /* root parens we are in - used by accept */
123 regnode **open_parens; /* pointers to open parens */
124 regnode **close_parens; /* pointers to close parens */
125 regnode *opend; /* END node in program */
127 HV *charnames; /* cache of named sequences */
128 HV *paren_names; /* Paren names */
129 regnode **recurse; /* Recurse regops */
130 I32 recurse_count; /* Number of recurse regops */
132 char *starttry; /* -Dr: where regtry was called. */
133 #define RExC_starttry (pRExC_state->starttry)
136 const char *lastparse;
138 #define RExC_lastparse (pRExC_state->lastparse)
139 #define RExC_lastnum (pRExC_state->lastnum)
143 #define RExC_flags (pRExC_state->flags)
144 #define RExC_precomp (pRExC_state->precomp)
145 #define RExC_rx (pRExC_state->rx)
146 #define RExC_rxi (pRExC_state->rxi)
147 #define RExC_start (pRExC_state->start)
148 #define RExC_end (pRExC_state->end)
149 #define RExC_parse (pRExC_state->parse)
150 #define RExC_whilem_seen (pRExC_state->whilem_seen)
151 #define RExC_offsets (pRExC_state->rxi->offsets) /* I am not like the others */
152 #define RExC_emit (pRExC_state->emit)
153 #define RExC_emit_start (pRExC_state->emit_start)
154 #define RExC_naughty (pRExC_state->naughty)
155 #define RExC_sawback (pRExC_state->sawback)
156 #define RExC_seen (pRExC_state->seen)
157 #define RExC_size (pRExC_state->size)
158 #define RExC_npar (pRExC_state->npar)
159 #define RExC_cpar (pRExC_state->cpar)
160 #define RExC_nestroot (pRExC_state->nestroot)
161 #define RExC_extralen (pRExC_state->extralen)
162 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
163 #define RExC_seen_evals (pRExC_state->seen_evals)
164 #define RExC_utf8 (pRExC_state->utf8)
165 #define RExC_charnames (pRExC_state->charnames)
166 #define RExC_open_parens (pRExC_state->open_parens)
167 #define RExC_close_parens (pRExC_state->close_parens)
168 #define RExC_opend (pRExC_state->opend)
169 #define RExC_paren_names (pRExC_state->paren_names)
170 #define RExC_recurse (pRExC_state->recurse)
171 #define RExC_recurse_count (pRExC_state->recurse_count)
173 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
174 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
175 ((*s) == '{' && regcurly(s)))
178 #undef SPSTART /* dratted cpp namespace... */
181 * Flags to be passed up and down.
183 #define WORST 0 /* Worst case. */
184 #define HASWIDTH 0x1 /* Known to match non-null strings. */
185 #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
186 #define SPSTART 0x4 /* Starts with * or +. */
187 #define TRYAGAIN 0x8 /* Weeded out a declaration. */
189 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
191 /* whether trie related optimizations are enabled */
192 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
193 #define TRIE_STUDY_OPT
194 #define FULL_TRIE_STUDY
200 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
201 #define PBITVAL(paren) (1 << ((paren) & 7))
202 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
203 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
204 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
207 /* About scan_data_t.
209 During optimisation we recurse through the regexp program performing
210 various inplace (keyhole style) optimisations. In addition study_chunk
211 and scan_commit populate this data structure with information about
212 what strings MUST appear in the pattern. We look for the longest
213 string that must appear for at a fixed location, and we look for the
214 longest string that may appear at a floating location. So for instance
219 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
220 strings (because they follow a .* construct). study_chunk will identify
221 both FOO and BAR as being the longest fixed and floating strings respectively.
223 The strings can be composites, for instance
227 will result in a composite fixed substring 'foo'.
229 For each string some basic information is maintained:
231 - offset or min_offset
232 This is the position the string must appear at, or not before.
233 It also implicitly (when combined with minlenp) tells us how many
234 character must match before the string we are searching.
235 Likewise when combined with minlenp and the length of the string
236 tells us how many characters must appear after the string we have
240 Only used for floating strings. This is the rightmost point that
241 the string can appear at. Ifset to I32 max it indicates that the
242 string can occur infinitely far to the right.
245 A pointer to the minimum length of the pattern that the string
246 was found inside. This is important as in the case of positive
247 lookahead or positive lookbehind we can have multiple patterns
252 The minimum length of the pattern overall is 3, the minimum length
253 of the lookahead part is 3, but the minimum length of the part that
254 will actually match is 1. So 'FOO's minimum length is 3, but the
255 minimum length for the F is 1. This is important as the minimum length
256 is used to determine offsets in front of and behind the string being
257 looked for. Since strings can be composites this is the length of the
258 pattern at the time it was commited with a scan_commit. Note that
259 the length is calculated by study_chunk, so that the minimum lengths
260 are not known until the full pattern has been compiled, thus the
261 pointer to the value.
265 In the case of lookbehind the string being searched for can be
266 offset past the start point of the final matching string.
267 If this value was just blithely removed from the min_offset it would
268 invalidate some of the calculations for how many chars must match
269 before or after (as they are derived from min_offset and minlen and
270 the length of the string being searched for).
271 When the final pattern is compiled and the data is moved from the
272 scan_data_t structure into the regexp structure the information
273 about lookbehind is factored in, with the information that would
274 have been lost precalculated in the end_shift field for the
277 The fields pos_min and pos_delta are used to store the minimum offset
278 and the delta to the maximum offset at the current point in the pattern.
282 typedef struct scan_data_t {
283 /*I32 len_min; unused */
284 /*I32 len_delta; unused */
288 I32 last_end; /* min value, <0 unless valid. */
291 SV **longest; /* Either &l_fixed, or &l_float. */
292 SV *longest_fixed; /* longest fixed string found in pattern */
293 I32 offset_fixed; /* offset where it starts */
294 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
295 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
296 SV *longest_float; /* longest floating string found in pattern */
297 I32 offset_float_min; /* earliest point in string it can appear */
298 I32 offset_float_max; /* latest point in string it can appear */
299 I32 *minlen_float; /* pointer to the minlen relevent to the string */
300 I32 lookbehind_float; /* is the position of the string modified by LB */
304 struct regnode_charclass_class *start_class;
308 * Forward declarations for pregcomp()'s friends.
311 static const scan_data_t zero_scan_data =
312 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
314 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
315 #define SF_BEFORE_SEOL 0x0001
316 #define SF_BEFORE_MEOL 0x0002
317 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
318 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
321 # define SF_FIX_SHIFT_EOL (0+2)
322 # define SF_FL_SHIFT_EOL (0+4)
324 # define SF_FIX_SHIFT_EOL (+2)
325 # define SF_FL_SHIFT_EOL (+4)
328 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
329 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
331 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
332 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
333 #define SF_IS_INF 0x0040
334 #define SF_HAS_PAR 0x0080
335 #define SF_IN_PAR 0x0100
336 #define SF_HAS_EVAL 0x0200
337 #define SCF_DO_SUBSTR 0x0400
338 #define SCF_DO_STCLASS_AND 0x0800
339 #define SCF_DO_STCLASS_OR 0x1000
340 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
341 #define SCF_WHILEM_VISITED_POS 0x2000
343 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
344 #define SCF_SEEN_ACCEPT 0x8000
346 #define UTF (RExC_utf8 != 0)
347 #define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0)
348 #define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0)
350 #define OOB_UNICODE 12345678
351 #define OOB_NAMEDCLASS -1
353 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
354 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
357 /* length of regex to show in messages that don't mark a position within */
358 #define RegexLengthToShowInErrorMessages 127
361 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
362 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
363 * op/pragma/warn/regcomp.
365 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
366 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
368 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
371 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
372 * arg. Show regex, up to a maximum length. If it's too long, chop and add
375 #define _FAIL(code) STMT_START { \
376 const char *ellipses = ""; \
377 IV len = RExC_end - RExC_precomp; \
380 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
381 if (len > RegexLengthToShowInErrorMessages) { \
382 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
383 len = RegexLengthToShowInErrorMessages - 10; \
389 #define FAIL(msg) _FAIL( \
390 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
391 msg, (int)len, RExC_precomp, ellipses))
393 #define FAIL2(msg,arg) _FAIL( \
394 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
395 arg, (int)len, RExC_precomp, ellipses))
398 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
400 #define Simple_vFAIL(m) STMT_START { \
401 const IV offset = RExC_parse - RExC_precomp; \
402 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
403 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
407 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
409 #define vFAIL(m) STMT_START { \
411 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
416 * Like Simple_vFAIL(), but accepts two arguments.
418 #define Simple_vFAIL2(m,a1) STMT_START { \
419 const IV offset = RExC_parse - RExC_precomp; \
420 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
421 (int)offset, RExC_precomp, RExC_precomp + offset); \
425 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
427 #define vFAIL2(m,a1) STMT_START { \
429 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
430 Simple_vFAIL2(m, a1); \
435 * Like Simple_vFAIL(), but accepts three arguments.
437 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
438 const IV offset = RExC_parse - RExC_precomp; \
439 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
440 (int)offset, RExC_precomp, RExC_precomp + offset); \
444 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
446 #define vFAIL3(m,a1,a2) STMT_START { \
448 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
449 Simple_vFAIL3(m, a1, a2); \
453 * Like Simple_vFAIL(), but accepts four arguments.
455 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
456 const IV offset = RExC_parse - RExC_precomp; \
457 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
458 (int)offset, RExC_precomp, RExC_precomp + offset); \
461 #define vWARN(loc,m) STMT_START { \
462 const IV offset = loc - RExC_precomp; \
463 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
464 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
467 #define vWARNdep(loc,m) STMT_START { \
468 const IV offset = loc - RExC_precomp; \
469 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
470 "%s" REPORT_LOCATION, \
471 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
475 #define vWARN2(loc, m, a1) STMT_START { \
476 const IV offset = loc - RExC_precomp; \
477 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
478 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
481 #define vWARN3(loc, m, a1, a2) STMT_START { \
482 const IV offset = loc - RExC_precomp; \
483 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
484 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
487 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
488 const IV offset = loc - RExC_precomp; \
489 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
490 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
493 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
494 const IV offset = loc - RExC_precomp; \
495 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
496 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
500 /* Allow for side effects in s */
501 #define REGC(c,s) STMT_START { \
502 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
505 /* Macros for recording node offsets. 20001227 mjd@plover.com
506 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
507 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
508 * Element 0 holds the number n.
509 * Position is 1 indexed.
512 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
514 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
515 __LINE__, (int)(node), (int)(byte))); \
517 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
519 RExC_offsets[2*(node)-1] = (byte); \
524 #define Set_Node_Offset(node,byte) \
525 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
526 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
528 #define Set_Node_Length_To_R(node,len) STMT_START { \
530 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
531 __LINE__, (int)(node), (int)(len))); \
533 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
535 RExC_offsets[2*(node)] = (len); \
540 #define Set_Node_Length(node,len) \
541 Set_Node_Length_To_R((node)-RExC_emit_start, len)
542 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
543 #define Set_Node_Cur_Length(node) \
544 Set_Node_Length(node, RExC_parse - parse_start)
546 /* Get offsets and lengths */
547 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
548 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
550 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
551 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
552 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
556 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
557 #define EXPERIMENTAL_INPLACESCAN
560 #define DEBUG_STUDYDATA(data,depth) \
561 DEBUG_OPTIMISE_MORE_r(if(data){ \
562 PerlIO_printf(Perl_debug_log, \
563 "%*s"/* Len:%"IVdf"/%"IVdf" */"Pos:%"IVdf"/%"IVdf \
564 " Flags: %"IVdf" Whilem_c: %"IVdf" Lcp: %"IVdf" ", \
565 (int)(depth)*2, "", \
566 (IV)((data)->pos_min), \
567 (IV)((data)->pos_delta), \
568 (IV)((data)->flags), \
569 (IV)((data)->whilem_c), \
570 (IV)((data)->last_closep ? *((data)->last_closep) : -1) \
572 if ((data)->last_found) \
573 PerlIO_printf(Perl_debug_log, \
574 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
575 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
576 SvPVX_const((data)->last_found), \
577 (IV)((data)->last_end), \
578 (IV)((data)->last_start_min), \
579 (IV)((data)->last_start_max), \
580 ((data)->longest && \
581 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
582 SvPVX_const((data)->longest_fixed), \
583 (IV)((data)->offset_fixed), \
584 ((data)->longest && \
585 (data)->longest==&((data)->longest_float)) ? "*" : "", \
586 SvPVX_const((data)->longest_float), \
587 (IV)((data)->offset_float_min), \
588 (IV)((data)->offset_float_max) \
590 PerlIO_printf(Perl_debug_log,"\n"); \
593 static void clear_re(pTHX_ void *r);
595 /* Mark that we cannot extend a found fixed substring at this point.
596 Update the longest found anchored substring and the longest found
597 floating substrings if needed. */
600 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp)
602 const STRLEN l = CHR_SVLEN(data->last_found);
603 const STRLEN old_l = CHR_SVLEN(*data->longest);
604 GET_RE_DEBUG_FLAGS_DECL;
606 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
607 SvSetMagicSV(*data->longest, data->last_found);
608 if (*data->longest == data->longest_fixed) {
609 data->offset_fixed = l ? data->last_start_min : data->pos_min;
610 if (data->flags & SF_BEFORE_EOL)
612 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
614 data->flags &= ~SF_FIX_BEFORE_EOL;
615 data->minlen_fixed=minlenp;
616 data->lookbehind_fixed=0;
619 data->offset_float_min = l ? data->last_start_min : data->pos_min;
620 data->offset_float_max = (l
621 ? data->last_start_max
622 : data->pos_min + data->pos_delta);
623 if ((U32)data->offset_float_max > (U32)I32_MAX)
624 data->offset_float_max = I32_MAX;
625 if (data->flags & SF_BEFORE_EOL)
627 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
629 data->flags &= ~SF_FL_BEFORE_EOL;
630 data->minlen_float=minlenp;
631 data->lookbehind_float=0;
634 SvCUR_set(data->last_found, 0);
636 SV * const sv = data->last_found;
637 if (SvUTF8(sv) && SvMAGICAL(sv)) {
638 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
644 data->flags &= ~SF_BEFORE_EOL;
645 DEBUG_STUDYDATA(data,0);
648 /* Can match anything (initialization) */
650 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
652 ANYOF_CLASS_ZERO(cl);
653 ANYOF_BITMAP_SETALL(cl);
654 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
656 cl->flags |= ANYOF_LOCALE;
659 /* Can match anything (initialization) */
661 S_cl_is_anything(const struct regnode_charclass_class *cl)
665 for (value = 0; value <= ANYOF_MAX; value += 2)
666 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
668 if (!(cl->flags & ANYOF_UNICODE_ALL))
670 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
675 /* Can match anything (initialization) */
677 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
679 Zero(cl, 1, struct regnode_charclass_class);
681 cl_anything(pRExC_state, cl);
685 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
687 Zero(cl, 1, struct regnode_charclass_class);
689 cl_anything(pRExC_state, cl);
691 cl->flags |= ANYOF_LOCALE;
694 /* 'And' a given class with another one. Can create false positives */
695 /* We assume that cl is not inverted */
697 S_cl_and(struct regnode_charclass_class *cl,
698 const struct regnode_charclass_class *and_with)
701 assert(and_with->type == ANYOF);
702 if (!(and_with->flags & ANYOF_CLASS)
703 && !(cl->flags & ANYOF_CLASS)
704 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
705 && !(and_with->flags & ANYOF_FOLD)
706 && !(cl->flags & ANYOF_FOLD)) {
709 if (and_with->flags & ANYOF_INVERT)
710 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
711 cl->bitmap[i] &= ~and_with->bitmap[i];
713 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
714 cl->bitmap[i] &= and_with->bitmap[i];
715 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
716 if (!(and_with->flags & ANYOF_EOS))
717 cl->flags &= ~ANYOF_EOS;
719 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
720 !(and_with->flags & ANYOF_INVERT)) {
721 cl->flags &= ~ANYOF_UNICODE_ALL;
722 cl->flags |= ANYOF_UNICODE;
723 ARG_SET(cl, ARG(and_with));
725 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
726 !(and_with->flags & ANYOF_INVERT))
727 cl->flags &= ~ANYOF_UNICODE_ALL;
728 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
729 !(and_with->flags & ANYOF_INVERT))
730 cl->flags &= ~ANYOF_UNICODE;
733 /* 'OR' a given class with another one. Can create false positives */
734 /* We assume that cl is not inverted */
736 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
738 if (or_with->flags & ANYOF_INVERT) {
740 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
741 * <= (B1 | !B2) | (CL1 | !CL2)
742 * which is wasteful if CL2 is small, but we ignore CL2:
743 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
744 * XXXX Can we handle case-fold? Unclear:
745 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
746 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
748 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
749 && !(or_with->flags & ANYOF_FOLD)
750 && !(cl->flags & ANYOF_FOLD) ) {
753 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
754 cl->bitmap[i] |= ~or_with->bitmap[i];
755 } /* XXXX: logic is complicated otherwise */
757 cl_anything(pRExC_state, cl);
760 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
761 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
762 && (!(or_with->flags & ANYOF_FOLD)
763 || (cl->flags & ANYOF_FOLD)) ) {
766 /* OR char bitmap and class bitmap separately */
767 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
768 cl->bitmap[i] |= or_with->bitmap[i];
769 if (or_with->flags & ANYOF_CLASS) {
770 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
771 cl->classflags[i] |= or_with->classflags[i];
772 cl->flags |= ANYOF_CLASS;
775 else { /* XXXX: logic is complicated, leave it along for a moment. */
776 cl_anything(pRExC_state, cl);
779 if (or_with->flags & ANYOF_EOS)
780 cl->flags |= ANYOF_EOS;
782 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
783 ARG(cl) != ARG(or_with)) {
784 cl->flags |= ANYOF_UNICODE_ALL;
785 cl->flags &= ~ANYOF_UNICODE;
787 if (or_with->flags & ANYOF_UNICODE_ALL) {
788 cl->flags |= ANYOF_UNICODE_ALL;
789 cl->flags &= ~ANYOF_UNICODE;
793 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
794 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
795 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
796 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
801 dump_trie(trie,widecharmap,revcharmap)
802 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
803 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
805 These routines dump out a trie in a somewhat readable format.
806 The _interim_ variants are used for debugging the interim
807 tables that are used to generate the final compressed
808 representation which is what dump_trie expects.
810 Part of the reason for their existance is to provide a form
811 of documentation as to how the different representations function.
816 Dumps the final compressed table form of the trie to Perl_debug_log.
817 Used for debugging make_trie().
821 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
822 AV *revcharmap, U32 depth)
825 SV *sv=sv_newmortal();
826 int colwidth= widecharmap ? 6 : 4;
827 GET_RE_DEBUG_FLAGS_DECL;
830 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
831 (int)depth * 2 + 2,"",
832 "Match","Base","Ofs" );
834 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
835 SV ** const tmp = av_fetch( revcharmap, state, 0);
837 PerlIO_printf( Perl_debug_log, "%*s",
839 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
840 PL_colors[0], PL_colors[1],
841 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
842 PERL_PV_ESCAPE_FIRSTCHAR
847 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
848 (int)depth * 2 + 2,"");
850 for( state = 0 ; state < trie->uniquecharcount ; state++ )
851 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
852 PerlIO_printf( Perl_debug_log, "\n");
854 for( state = 1 ; state < trie->statecount ; state++ ) {
855 const U32 base = trie->states[ state ].trans.base;
857 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
859 if ( trie->states[ state ].wordnum ) {
860 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
862 PerlIO_printf( Perl_debug_log, "%6s", "" );
865 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
870 while( ( base + ofs < trie->uniquecharcount ) ||
871 ( base + ofs - trie->uniquecharcount < trie->lasttrans
872 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
875 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
877 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
878 if ( ( base + ofs >= trie->uniquecharcount ) &&
879 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
880 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
882 PerlIO_printf( Perl_debug_log, "%*"UVXf,
884 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
886 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
890 PerlIO_printf( Perl_debug_log, "]");
893 PerlIO_printf( Perl_debug_log, "\n" );
897 Dumps a fully constructed but uncompressed trie in list form.
898 List tries normally only are used for construction when the number of
899 possible chars (trie->uniquecharcount) is very high.
900 Used for debugging make_trie().
903 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
904 HV *widecharmap, AV *revcharmap, U32 next_alloc,
908 SV *sv=sv_newmortal();
909 int colwidth= widecharmap ? 6 : 4;
910 GET_RE_DEBUG_FLAGS_DECL;
911 /* print out the table precompression. */
912 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
913 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
914 "------:-----+-----------------\n" );
916 for( state=1 ; state < next_alloc ; state ++ ) {
919 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
920 (int)depth * 2 + 2,"", (UV)state );
921 if ( ! trie->states[ state ].wordnum ) {
922 PerlIO_printf( Perl_debug_log, "%5s| ","");
924 PerlIO_printf( Perl_debug_log, "W%4x| ",
925 trie->states[ state ].wordnum
928 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
929 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
931 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
933 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
934 PL_colors[0], PL_colors[1],
935 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
936 PERL_PV_ESCAPE_FIRSTCHAR
938 TRIE_LIST_ITEM(state,charid).forid,
939 (UV)TRIE_LIST_ITEM(state,charid).newstate
942 PerlIO_printf(Perl_debug_log, "\n%*s| ",
943 (int)((depth * 2) + 14), "");
946 PerlIO_printf( Perl_debug_log, "\n");
951 Dumps a fully constructed but uncompressed trie in table form.
952 This is the normal DFA style state transition table, with a few
953 twists to facilitate compression later.
954 Used for debugging make_trie().
957 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
958 HV *widecharmap, AV *revcharmap, U32 next_alloc,
963 SV *sv=sv_newmortal();
964 int colwidth= widecharmap ? 6 : 4;
965 GET_RE_DEBUG_FLAGS_DECL;
968 print out the table precompression so that we can do a visual check
969 that they are identical.
972 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
974 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
975 SV ** const tmp = av_fetch( revcharmap, charid, 0);
977 PerlIO_printf( Perl_debug_log, "%*s",
979 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
980 PL_colors[0], PL_colors[1],
981 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
982 PERL_PV_ESCAPE_FIRSTCHAR
988 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
990 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
991 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
994 PerlIO_printf( Perl_debug_log, "\n" );
996 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
998 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
999 (int)depth * 2 + 2,"",
1000 (UV)TRIE_NODENUM( state ) );
1002 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1003 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1005 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1007 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1009 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1010 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1012 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1013 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1020 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1021 startbranch: the first branch in the whole branch sequence
1022 first : start branch of sequence of branch-exact nodes.
1023 May be the same as startbranch
1024 last : Thing following the last branch.
1025 May be the same as tail.
1026 tail : item following the branch sequence
1027 count : words in the sequence
1028 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1029 depth : indent depth
1031 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1033 A trie is an N'ary tree where the branches are determined by digital
1034 decomposition of the key. IE, at the root node you look up the 1st character and
1035 follow that branch repeat until you find the end of the branches. Nodes can be
1036 marked as "accepting" meaning they represent a complete word. Eg:
1040 would convert into the following structure. Numbers represent states, letters
1041 following numbers represent valid transitions on the letter from that state, if
1042 the number is in square brackets it represents an accepting state, otherwise it
1043 will be in parenthesis.
1045 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1049 (1) +-i->(6)-+-s->[7]
1051 +-s->(3)-+-h->(4)-+-e->[5]
1053 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1055 This shows that when matching against the string 'hers' we will begin at state 1
1056 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1057 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1058 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1059 single traverse. We store a mapping from accepting to state to which word was
1060 matched, and then when we have multiple possibilities we try to complete the
1061 rest of the regex in the order in which they occured in the alternation.
1063 The only prior NFA like behaviour that would be changed by the TRIE support is
1064 the silent ignoring of duplicate alternations which are of the form:
1066 / (DUPE|DUPE) X? (?{ ... }) Y /x
1068 Thus EVAL blocks follwing a trie may be called a different number of times with
1069 and without the optimisation. With the optimisations dupes will be silently
1070 ignored. This inconsistant behaviour of EVAL type nodes is well established as
1071 the following demonstrates:
1073 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1075 which prints out 'word' three times, but
1077 'words'=~/(word|word|word)(?{ print $1 })S/
1079 which doesnt print it out at all. This is due to other optimisations kicking in.
1081 Example of what happens on a structural level:
1083 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1085 1: CURLYM[1] {1,32767}(18)
1096 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1097 and should turn into:
1099 1: CURLYM[1] {1,32767}(18)
1101 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1109 Cases where tail != last would be like /(?foo|bar)baz/:
1119 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1120 and would end up looking like:
1123 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1130 d = uvuni_to_utf8_flags(d, uv, 0);
1132 is the recommended Unicode-aware way of saying
1137 #define TRIE_STORE_REVCHAR \
1139 SV *tmp = newSVpvs(""); \
1140 if (UTF) SvUTF8_on(tmp); \
1141 Perl_sv_catpvf( aTHX_ tmp, "%c", (int)uvc ); \
1142 av_push( revcharmap, tmp ); \
1145 #define TRIE_READ_CHAR STMT_START { \
1149 if ( foldlen > 0 ) { \
1150 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1155 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1156 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1157 foldlen -= UNISKIP( uvc ); \
1158 scan = foldbuf + UNISKIP( uvc ); \
1161 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1171 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1172 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1173 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1174 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1176 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1177 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1178 TRIE_LIST_CUR( state )++; \
1181 #define TRIE_LIST_NEW(state) STMT_START { \
1182 Newxz( trie->states[ state ].trans.list, \
1183 4, reg_trie_trans_le ); \
1184 TRIE_LIST_CUR( state ) = 1; \
1185 TRIE_LIST_LEN( state ) = 4; \
1188 #define TRIE_HANDLE_WORD(state) STMT_START { \
1189 U16 dupe= trie->states[ state ].wordnum; \
1190 regnode * const noper_next = regnext( noper ); \
1192 if (trie->wordlen) \
1193 trie->wordlen[ curword ] = wordlen; \
1195 /* store the word for dumping */ \
1197 if (OP(noper) != NOTHING) \
1198 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
1200 tmp = newSVpvn( "", 0 ); \
1201 if ( UTF ) SvUTF8_on( tmp ); \
1202 av_push( trie_words, tmp ); \
1207 if ( noper_next < tail ) { \
1209 trie->jump = PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1210 trie->jump[curword] = (U16)(noper_next - convert); \
1212 jumper = noper_next; \
1214 nextbranch= regnext(cur); \
1218 /* So it's a dupe. This means we need to maintain a */\
1219 /* linked-list from the first to the next. */\
1220 /* we only allocate the nextword buffer when there */\
1221 /* a dupe, so first time we have to do the allocation */\
1222 if (!trie->nextword) \
1224 PerlMemShared_calloc( word_count + 1, sizeof(U16)); \
1225 while ( trie->nextword[dupe] ) \
1226 dupe= trie->nextword[dupe]; \
1227 trie->nextword[dupe]= curword; \
1229 /* we haven't inserted this word yet. */ \
1230 trie->states[ state ].wordnum = curword; \
1235 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1236 ( ( base + charid >= ucharcount \
1237 && base + charid < ubound \
1238 && state == trie->trans[ base - ucharcount + charid ].check \
1239 && trie->trans[ base - ucharcount + charid ].next ) \
1240 ? trie->trans[ base - ucharcount + charid ].next \
1241 : ( state==1 ? special : 0 ) \
1245 #define MADE_JUMP_TRIE 2
1246 #define MADE_EXACT_TRIE 4
1249 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1252 /* first pass, loop through and scan words */
1253 reg_trie_data *trie;
1254 HV *widecharmap = NULL;
1255 AV *revcharmap = newAV();
1257 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1262 regnode *jumper = NULL;
1263 regnode *nextbranch = NULL;
1264 regnode *convert = NULL;
1265 /* we just use folder as a flag in utf8 */
1266 const U8 * const folder = ( flags == EXACTF
1268 : ( flags == EXACTFL
1275 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1276 AV *trie_words = NULL;
1277 /* along with revcharmap, this only used during construction but both are
1278 * useful during debugging so we store them in the struct when debugging.
1281 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1282 STRLEN trie_charcount=0;
1284 SV *re_trie_maxbuff;
1285 GET_RE_DEBUG_FLAGS_DECL;
1287 PERL_UNUSED_ARG(depth);
1290 trie = PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1292 trie->startstate = 1;
1293 trie->wordcount = word_count;
1294 RExC_rxi->data->data[ data_slot ] = (void*)trie;
1295 trie->charmap = PerlMemShared_calloc( 256, sizeof(U16) );
1296 if (!(UTF && folder))
1297 trie->bitmap = PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1299 trie_words = newAV();
1302 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1303 if (!SvIOK(re_trie_maxbuff)) {
1304 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1307 PerlIO_printf( Perl_debug_log,
1308 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1309 (int)depth * 2 + 2, "",
1310 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1311 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1315 /* Find the node we are going to overwrite */
1316 if ( first == startbranch && OP( last ) != BRANCH ) {
1317 /* whole branch chain */
1320 /* branch sub-chain */
1321 convert = NEXTOPER( first );
1324 /* -- First loop and Setup --
1326 We first traverse the branches and scan each word to determine if it
1327 contains widechars, and how many unique chars there are, this is
1328 important as we have to build a table with at least as many columns as we
1331 We use an array of integers to represent the character codes 0..255
1332 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1333 native representation of the character value as the key and IV's for the
1336 *TODO* If we keep track of how many times each character is used we can
1337 remap the columns so that the table compression later on is more
1338 efficient in terms of memory by ensuring most common value is in the
1339 middle and the least common are on the outside. IMO this would be better
1340 than a most to least common mapping as theres a decent chance the most
1341 common letter will share a node with the least common, meaning the node
1342 will not be compressable. With a middle is most common approach the worst
1343 case is when we have the least common nodes twice.
1347 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1348 regnode * const noper = NEXTOPER( cur );
1349 const U8 *uc = (U8*)STRING( noper );
1350 const U8 * const e = uc + STR_LEN( noper );
1352 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1353 const U8 *scan = (U8*)NULL;
1354 U32 wordlen = 0; /* required init */
1357 if (OP(noper) == NOTHING) {
1362 TRIE_BITMAP_SET(trie,*uc);
1363 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1365 for ( ; uc < e ; uc += len ) {
1366 TRIE_CHARCOUNT(trie)++;
1370 if ( !trie->charmap[ uvc ] ) {
1371 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1373 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1379 widecharmap = newHV();
1381 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1384 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1386 if ( !SvTRUE( *svpp ) ) {
1387 sv_setiv( *svpp, ++trie->uniquecharcount );
1392 if( cur == first ) {
1395 } else if (chars < trie->minlen) {
1397 } else if (chars > trie->maxlen) {
1401 } /* end first pass */
1402 DEBUG_TRIE_COMPILE_r(
1403 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1404 (int)depth * 2 + 2,"",
1405 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1406 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1407 (int)trie->minlen, (int)trie->maxlen )
1409 trie->wordlen = PerlMemShared_calloc( word_count, sizeof(U32) );
1412 We now know what we are dealing with in terms of unique chars and
1413 string sizes so we can calculate how much memory a naive
1414 representation using a flat table will take. If it's over a reasonable
1415 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1416 conservative but potentially much slower representation using an array
1419 At the end we convert both representations into the same compressed
1420 form that will be used in regexec.c for matching with. The latter
1421 is a form that cannot be used to construct with but has memory
1422 properties similar to the list form and access properties similar
1423 to the table form making it both suitable for fast searches and
1424 small enough that its feasable to store for the duration of a program.
1426 See the comment in the code where the compressed table is produced
1427 inplace from the flat tabe representation for an explanation of how
1428 the compression works.
1433 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1435 Second Pass -- Array Of Lists Representation
1437 Each state will be represented by a list of charid:state records
1438 (reg_trie_trans_le) the first such element holds the CUR and LEN
1439 points of the allocated array. (See defines above).
1441 We build the initial structure using the lists, and then convert
1442 it into the compressed table form which allows faster lookups
1443 (but cant be modified once converted).
1446 STRLEN transcount = 1;
1448 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1449 "%*sCompiling trie using list compiler\n",
1450 (int)depth * 2 + 2, ""));
1452 trie->states = PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1453 sizeof(reg_trie_state) );
1457 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1459 regnode * const noper = NEXTOPER( cur );
1460 U8 *uc = (U8*)STRING( noper );
1461 const U8 * const e = uc + STR_LEN( noper );
1462 U32 state = 1; /* required init */
1463 U16 charid = 0; /* sanity init */
1464 U8 *scan = (U8*)NULL; /* sanity init */
1465 STRLEN foldlen = 0; /* required init */
1466 U32 wordlen = 0; /* required init */
1467 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1469 if (OP(noper) != NOTHING) {
1470 for ( ; uc < e ; uc += len ) {
1475 charid = trie->charmap[ uvc ];
1477 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1481 charid=(U16)SvIV( *svpp );
1484 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1491 if ( !trie->states[ state ].trans.list ) {
1492 TRIE_LIST_NEW( state );
1494 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1495 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1496 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1501 newstate = next_alloc++;
1502 TRIE_LIST_PUSH( state, charid, newstate );
1507 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1511 TRIE_HANDLE_WORD(state);
1513 } /* end second pass */
1515 /* next alloc is the NEXT state to be allocated */
1516 trie->statecount = next_alloc;
1517 trie->states = PerlMemShared_realloc( trie->states, next_alloc
1518 * sizeof(reg_trie_state) );
1520 /* and now dump it out before we compress it */
1521 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1522 revcharmap, next_alloc,
1527 = PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1534 for( state=1 ; state < next_alloc ; state ++ ) {
1538 DEBUG_TRIE_COMPILE_MORE_r(
1539 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1543 if (trie->states[state].trans.list) {
1544 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1548 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1549 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1550 if ( forid < minid ) {
1552 } else if ( forid > maxid ) {
1556 if ( transcount < tp + maxid - minid + 1) {
1559 = PerlMemShared_realloc( trie->trans,
1561 * sizeof(reg_trie_trans) );
1562 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1564 base = trie->uniquecharcount + tp - minid;
1565 if ( maxid == minid ) {
1567 for ( ; zp < tp ; zp++ ) {
1568 if ( ! trie->trans[ zp ].next ) {
1569 base = trie->uniquecharcount + zp - minid;
1570 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1571 trie->trans[ zp ].check = state;
1577 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1578 trie->trans[ tp ].check = state;
1583 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1584 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1585 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1586 trie->trans[ tid ].check = state;
1588 tp += ( maxid - minid + 1 );
1590 Safefree(trie->states[ state ].trans.list);
1593 DEBUG_TRIE_COMPILE_MORE_r(
1594 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1597 trie->states[ state ].trans.base=base;
1599 trie->lasttrans = tp + 1;
1603 Second Pass -- Flat Table Representation.
1605 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1606 We know that we will need Charcount+1 trans at most to store the data
1607 (one row per char at worst case) So we preallocate both structures
1608 assuming worst case.
1610 We then construct the trie using only the .next slots of the entry
1613 We use the .check field of the first entry of the node temporarily to
1614 make compression both faster and easier by keeping track of how many non
1615 zero fields are in the node.
1617 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1620 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1621 number representing the first entry of the node, and state as a
1622 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1623 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1624 are 2 entrys per node. eg:
1632 The table is internally in the right hand, idx form. However as we also
1633 have to deal with the states array which is indexed by nodenum we have to
1634 use TRIE_NODENUM() to convert.
1637 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1638 "%*sCompiling trie using table compiler\n",
1639 (int)depth * 2 + 2, ""));
1641 trie->trans = PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1642 * trie->uniquecharcount + 1,
1643 sizeof(reg_trie_trans) );
1644 trie->states = PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1645 sizeof(reg_trie_state) );
1646 next_alloc = trie->uniquecharcount + 1;
1649 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1651 regnode * const noper = NEXTOPER( cur );
1652 const U8 *uc = (U8*)STRING( noper );
1653 const U8 * const e = uc + STR_LEN( noper );
1655 U32 state = 1; /* required init */
1657 U16 charid = 0; /* sanity init */
1658 U32 accept_state = 0; /* sanity init */
1659 U8 *scan = (U8*)NULL; /* sanity init */
1661 STRLEN foldlen = 0; /* required init */
1662 U32 wordlen = 0; /* required init */
1663 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1665 if ( OP(noper) != NOTHING ) {
1666 for ( ; uc < e ; uc += len ) {
1671 charid = trie->charmap[ uvc ];
1673 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1674 charid = svpp ? (U16)SvIV(*svpp) : 0;
1678 if ( !trie->trans[ state + charid ].next ) {
1679 trie->trans[ state + charid ].next = next_alloc;
1680 trie->trans[ state ].check++;
1681 next_alloc += trie->uniquecharcount;
1683 state = trie->trans[ state + charid ].next;
1685 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1687 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1690 accept_state = TRIE_NODENUM( state );
1691 TRIE_HANDLE_WORD(accept_state);
1693 } /* end second pass */
1695 /* and now dump it out before we compress it */
1696 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1698 next_alloc, depth+1));
1702 * Inplace compress the table.*
1704 For sparse data sets the table constructed by the trie algorithm will
1705 be mostly 0/FAIL transitions or to put it another way mostly empty.
1706 (Note that leaf nodes will not contain any transitions.)
1708 This algorithm compresses the tables by eliminating most such
1709 transitions, at the cost of a modest bit of extra work during lookup:
1711 - Each states[] entry contains a .base field which indicates the
1712 index in the state[] array wheres its transition data is stored.
1714 - If .base is 0 there are no valid transitions from that node.
1716 - If .base is nonzero then charid is added to it to find an entry in
1719 -If trans[states[state].base+charid].check!=state then the
1720 transition is taken to be a 0/Fail transition. Thus if there are fail
1721 transitions at the front of the node then the .base offset will point
1722 somewhere inside the previous nodes data (or maybe even into a node
1723 even earlier), but the .check field determines if the transition is
1727 The following process inplace converts the table to the compressed
1728 table: We first do not compress the root node 1,and mark its all its
1729 .check pointers as 1 and set its .base pointer as 1 as well. This
1730 allows to do a DFA construction from the compressed table later, and
1731 ensures that any .base pointers we calculate later are greater than
1734 - We set 'pos' to indicate the first entry of the second node.
1736 - We then iterate over the columns of the node, finding the first and
1737 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1738 and set the .check pointers accordingly, and advance pos
1739 appropriately and repreat for the next node. Note that when we copy
1740 the next pointers we have to convert them from the original
1741 NODEIDX form to NODENUM form as the former is not valid post
1744 - If a node has no transitions used we mark its base as 0 and do not
1745 advance the pos pointer.
1747 - If a node only has one transition we use a second pointer into the
1748 structure to fill in allocated fail transitions from other states.
1749 This pointer is independent of the main pointer and scans forward
1750 looking for null transitions that are allocated to a state. When it
1751 finds one it writes the single transition into the "hole". If the
1752 pointer doesnt find one the single transition is appended as normal.
1754 - Once compressed we can Renew/realloc the structures to release the
1757 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1758 specifically Fig 3.47 and the associated pseudocode.
1762 const U32 laststate = TRIE_NODENUM( next_alloc );
1765 trie->statecount = laststate;
1767 for ( state = 1 ; state < laststate ; state++ ) {
1769 const U32 stateidx = TRIE_NODEIDX( state );
1770 const U32 o_used = trie->trans[ stateidx ].check;
1771 U32 used = trie->trans[ stateidx ].check;
1772 trie->trans[ stateidx ].check = 0;
1774 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1775 if ( flag || trie->trans[ stateidx + charid ].next ) {
1776 if ( trie->trans[ stateidx + charid ].next ) {
1778 for ( ; zp < pos ; zp++ ) {
1779 if ( ! trie->trans[ zp ].next ) {
1783 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1784 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1785 trie->trans[ zp ].check = state;
1786 if ( ++zp > pos ) pos = zp;
1793 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1795 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1796 trie->trans[ pos ].check = state;
1801 trie->lasttrans = pos + 1;
1802 trie->states = PerlMemShared_realloc( trie->states, laststate
1803 * sizeof(reg_trie_state) );
1804 DEBUG_TRIE_COMPILE_MORE_r(
1805 PerlIO_printf( Perl_debug_log,
1806 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1807 (int)depth * 2 + 2,"",
1808 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1811 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1814 } /* end table compress */
1816 DEBUG_TRIE_COMPILE_MORE_r(
1817 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1818 (int)depth * 2 + 2, "",
1819 (UV)trie->statecount,
1820 (UV)trie->lasttrans)
1822 /* resize the trans array to remove unused space */
1823 trie->trans = PerlMemShared_realloc( trie->trans, trie->lasttrans
1824 * sizeof(reg_trie_trans) );
1826 /* and now dump out the compressed format */
1827 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
1829 { /* Modify the program and insert the new TRIE node*/
1830 U8 nodetype =(U8)(flags & 0xFF);
1834 regnode *optimize = NULL;
1836 U32 mjd_nodelen = 0;
1839 This means we convert either the first branch or the first Exact,
1840 depending on whether the thing following (in 'last') is a branch
1841 or not and whther first is the startbranch (ie is it a sub part of
1842 the alternation or is it the whole thing.)
1843 Assuming its a sub part we conver the EXACT otherwise we convert
1844 the whole branch sequence, including the first.
1846 /* Find the node we are going to overwrite */
1847 if ( first != startbranch || OP( last ) == BRANCH ) {
1848 /* branch sub-chain */
1849 NEXT_OFF( first ) = (U16)(last - first);
1851 mjd_offset= Node_Offset((convert));
1852 mjd_nodelen= Node_Length((convert));
1854 /* whole branch chain */
1857 const regnode *nop = NEXTOPER( convert );
1858 mjd_offset= Node_Offset((nop));
1859 mjd_nodelen= Node_Length((nop));
1864 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1865 (int)depth * 2 + 2, "",
1866 (UV)mjd_offset, (UV)mjd_nodelen)
1869 /* But first we check to see if there is a common prefix we can
1870 split out as an EXACT and put in front of the TRIE node. */
1871 trie->startstate= 1;
1872 if ( trie->bitmap && !widecharmap && !trie->jump ) {
1874 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
1878 const U32 base = trie->states[ state ].trans.base;
1880 if ( trie->states[state].wordnum )
1883 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1884 if ( ( base + ofs >= trie->uniquecharcount ) &&
1885 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1886 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1888 if ( ++count > 1 ) {
1889 SV **tmp = av_fetch( revcharmap, ofs, 0);
1890 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
1891 if ( state == 1 ) break;
1893 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1895 PerlIO_printf(Perl_debug_log,
1896 "%*sNew Start State=%"UVuf" Class: [",
1897 (int)depth * 2 + 2, "",
1900 SV ** const tmp = av_fetch( revcharmap, idx, 0);
1901 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
1903 TRIE_BITMAP_SET(trie,*ch);
1905 TRIE_BITMAP_SET(trie, folder[ *ch ]);
1907 PerlIO_printf(Perl_debug_log, (char*)ch)
1911 TRIE_BITMAP_SET(trie,*ch);
1913 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1914 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1920 SV **tmp = av_fetch( revcharmap, idx, 0);
1921 char *ch = SvPV_nolen( *tmp );
1923 SV *sv=sv_newmortal();
1924 PerlIO_printf( Perl_debug_log,
1925 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1926 (int)depth * 2 + 2, "",
1928 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
1929 PL_colors[0], PL_colors[1],
1930 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1931 PERL_PV_ESCAPE_FIRSTCHAR
1936 OP( convert ) = nodetype;
1937 str=STRING(convert);
1948 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
1954 regnode *n = convert+NODE_SZ_STR(convert);
1955 NEXT_OFF(convert) = NODE_SZ_STR(convert);
1956 trie->startstate = state;
1957 trie->minlen -= (state - 1);
1958 trie->maxlen -= (state - 1);
1960 regnode *fix = convert;
1961 U32 word = trie->wordcount;
1963 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1964 while( ++fix < n ) {
1965 Set_Node_Offset_Length(fix, 0, 0);
1968 SV ** const tmp = av_fetch( trie_words, word, 0 );
1970 if ( STR_LEN(convert) <= SvCUR(*tmp) )
1971 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
1973 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
1980 NEXT_OFF(convert) = (U16)(tail - convert);
1981 DEBUG_r(optimize= n);
1987 if ( trie->maxlen ) {
1988 NEXT_OFF( convert ) = (U16)(tail - convert);
1989 ARG_SET( convert, data_slot );
1990 /* Store the offset to the first unabsorbed branch in
1991 jump[0], which is otherwise unused by the jump logic.
1992 We use this when dumping a trie and during optimisation. */
1994 trie->jump[0] = (U16)(nextbranch - convert);
1997 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1998 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2000 OP( convert ) = TRIEC;
2001 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2002 PerlMemShared_free(trie->bitmap);
2005 OP( convert ) = TRIE;
2007 /* store the type in the flags */
2008 convert->flags = nodetype;
2012 + regarglen[ OP( convert ) ];
2014 /* XXX We really should free up the resource in trie now,
2015 as we won't use them - (which resources?) dmq */
2017 /* needed for dumping*/
2018 DEBUG_r(if (optimize) {
2019 regnode *opt = convert;
2020 while ( ++opt < optimize) {
2021 Set_Node_Offset_Length(opt,0,0);
2024 Try to clean up some of the debris left after the
2027 while( optimize < jumper ) {
2028 mjd_nodelen += Node_Length((optimize));
2029 OP( optimize ) = OPTIMIZED;
2030 Set_Node_Offset_Length(optimize,0,0);
2033 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2035 } /* end node insert */
2036 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2038 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2039 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2041 SvREFCNT_dec(revcharmap);
2045 : trie->startstate>1
2051 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2053 /* The Trie is constructed and compressed now so we can build a fail array now if its needed
2055 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2056 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2059 We find the fail state for each state in the trie, this state is the longest proper
2060 suffix of the current states 'word' that is also a proper prefix of another word in our
2061 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2062 the DFA not to have to restart after its tried and failed a word at a given point, it
2063 simply continues as though it had been matching the other word in the first place.
2065 'abcdgu'=~/abcdefg|cdgu/
2066 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2067 fail, which would bring use to the state representing 'd' in the second word where we would
2068 try 'g' and succeed, prodceding to match 'cdgu'.
2070 /* add a fail transition */
2071 const U32 trie_offset = ARG(source);
2072 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2074 const U32 ucharcount = trie->uniquecharcount;
2075 const U32 numstates = trie->statecount;
2076 const U32 ubound = trie->lasttrans + ucharcount;
2080 U32 base = trie->states[ 1 ].trans.base;
2083 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2084 GET_RE_DEBUG_FLAGS_DECL;
2086 PERL_UNUSED_ARG(depth);
2090 ARG_SET( stclass, data_slot );
2091 aho = PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2092 RExC_rxi->data->data[ data_slot ] = (void*)aho;
2093 aho->trie=trie_offset;
2094 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2095 Copy( trie->states, aho->states, numstates, reg_trie_state );
2096 Newxz( q, numstates, U32);
2097 aho->fail = PerlMemShared_calloc( numstates, sizeof(U32) );
2100 /* initialize fail[0..1] to be 1 so that we always have
2101 a valid final fail state */
2102 fail[ 0 ] = fail[ 1 ] = 1;
2104 for ( charid = 0; charid < ucharcount ; charid++ ) {
2105 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2107 q[ q_write ] = newstate;
2108 /* set to point at the root */
2109 fail[ q[ q_write++ ] ]=1;
2112 while ( q_read < q_write) {
2113 const U32 cur = q[ q_read++ % numstates ];
2114 base = trie->states[ cur ].trans.base;
2116 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2117 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2119 U32 fail_state = cur;
2122 fail_state = fail[ fail_state ];
2123 fail_base = aho->states[ fail_state ].trans.base;
2124 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2126 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2127 fail[ ch_state ] = fail_state;
2128 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2130 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2132 q[ q_write++ % numstates] = ch_state;
2136 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2137 when we fail in state 1, this allows us to use the
2138 charclass scan to find a valid start char. This is based on the principle
2139 that theres a good chance the string being searched contains lots of stuff
2140 that cant be a start char.
2142 fail[ 0 ] = fail[ 1 ] = 0;
2143 DEBUG_TRIE_COMPILE_r({
2144 PerlIO_printf(Perl_debug_log,
2145 "%*sStclass Failtable (%"UVuf" states): 0",
2146 (int)(depth * 2), "", (UV)numstates
2148 for( q_read=1; q_read<numstates; q_read++ ) {
2149 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2151 PerlIO_printf(Perl_debug_log, "\n");
2154 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2159 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2160 * These need to be revisited when a newer toolchain becomes available.
2162 #if defined(__sparc64__) && defined(__GNUC__)
2163 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2164 # undef SPARC64_GCC_WORKAROUND
2165 # define SPARC64_GCC_WORKAROUND 1
2169 #define DEBUG_PEEP(str,scan,depth) \
2170 DEBUG_OPTIMISE_r({if (scan){ \
2171 SV * const mysv=sv_newmortal(); \
2172 regnode *Next = regnext(scan); \
2173 regprop(RExC_rx, mysv, scan); \
2174 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2175 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2176 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2183 #define JOIN_EXACT(scan,min,flags) \
2184 if (PL_regkind[OP(scan)] == EXACT) \
2185 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2188 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2189 /* Merge several consecutive EXACTish nodes into one. */
2190 regnode *n = regnext(scan);
2192 regnode *next = scan + NODE_SZ_STR(scan);
2196 regnode *stop = scan;
2197 GET_RE_DEBUG_FLAGS_DECL;
2199 PERL_UNUSED_ARG(depth);
2201 #ifndef EXPERIMENTAL_INPLACESCAN
2202 PERL_UNUSED_ARG(flags);
2203 PERL_UNUSED_ARG(val);
2205 DEBUG_PEEP("join",scan,depth);
2207 /* Skip NOTHING, merge EXACT*. */
2209 ( PL_regkind[OP(n)] == NOTHING ||
2210 (stringok && (OP(n) == OP(scan))))
2212 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2214 if (OP(n) == TAIL || n > next)
2216 if (PL_regkind[OP(n)] == NOTHING) {
2217 DEBUG_PEEP("skip:",n,depth);
2218 NEXT_OFF(scan) += NEXT_OFF(n);
2219 next = n + NODE_STEP_REGNODE;
2226 else if (stringok) {
2227 const unsigned int oldl = STR_LEN(scan);
2228 regnode * const nnext = regnext(n);
2230 DEBUG_PEEP("merg",n,depth);
2233 if (oldl + STR_LEN(n) > U8_MAX)
2235 NEXT_OFF(scan) += NEXT_OFF(n);
2236 STR_LEN(scan) += STR_LEN(n);
2237 next = n + NODE_SZ_STR(n);
2238 /* Now we can overwrite *n : */
2239 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2247 #ifdef EXPERIMENTAL_INPLACESCAN
2248 if (flags && !NEXT_OFF(n)) {
2249 DEBUG_PEEP("atch", val, depth);
2250 if (reg_off_by_arg[OP(n)]) {
2251 ARG_SET(n, val - n);
2254 NEXT_OFF(n) = val - n;
2261 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2263 Two problematic code points in Unicode casefolding of EXACT nodes:
2265 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2266 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2272 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2273 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2275 This means that in case-insensitive matching (or "loose matching",
2276 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2277 length of the above casefolded versions) can match a target string
2278 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2279 This would rather mess up the minimum length computation.
2281 What we'll do is to look for the tail four bytes, and then peek
2282 at the preceding two bytes to see whether we need to decrease
2283 the minimum length by four (six minus two).
2285 Thanks to the design of UTF-8, there cannot be false matches:
2286 A sequence of valid UTF-8 bytes cannot be a subsequence of
2287 another valid sequence of UTF-8 bytes.
2290 char * const s0 = STRING(scan), *s, *t;
2291 char * const s1 = s0 + STR_LEN(scan) - 1;
2292 char * const s2 = s1 - 4;
2293 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2294 const char t0[] = "\xaf\x49\xaf\x42";
2296 const char t0[] = "\xcc\x88\xcc\x81";
2298 const char * const t1 = t0 + 3;
2301 s < s2 && (t = ninstr(s, s1, t0, t1));
2304 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2305 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2307 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2308 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2316 n = scan + NODE_SZ_STR(scan);
2318 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2325 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2329 /* REx optimizer. Converts nodes into quickier variants "in place".
2330 Finds fixed substrings. */
2332 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2333 to the position after last scanned or to NULL. */
2335 #define INIT_AND_WITHP \
2336 assert(!and_withp); \
2337 Newx(and_withp,1,struct regnode_charclass_class); \
2338 SAVEFREEPV(and_withp)
2340 /* this is a chain of data about sub patterns we are processing that
2341 need to be handled seperately/specially in study_chunk. Its so
2342 we can simulate recursion without losing state. */
2344 typedef struct scan_frame {
2345 regnode *last; /* last node to process in this frame */
2346 regnode *next; /* next node to process when last is reached */
2347 struct scan_frame *prev; /*previous frame*/
2348 I32 stop; /* what stopparen do we use */
2352 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2353 I32 *minlenp, I32 *deltap,
2358 struct regnode_charclass_class *and_withp,
2359 U32 flags, U32 depth)
2360 /* scanp: Start here (read-write). */
2361 /* deltap: Write maxlen-minlen here. */
2362 /* last: Stop before this one. */
2363 /* data: string data about the pattern */
2364 /* stopparen: treat close N as END */
2365 /* recursed: which subroutines have we recursed into */
2366 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2369 I32 min = 0, pars = 0, code;
2370 regnode *scan = *scanp, *next;
2372 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2373 int is_inf_internal = 0; /* The studied chunk is infinite */
2374 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2375 scan_data_t data_fake;
2376 SV *re_trie_maxbuff = NULL;
2377 regnode *first_non_open = scan;
2378 I32 stopmin = I32_MAX;
2379 scan_frame *frame = NULL;
2381 GET_RE_DEBUG_FLAGS_DECL;
2384 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2388 while (first_non_open && OP(first_non_open) == OPEN)
2389 first_non_open=regnext(first_non_open);
2394 while ( scan && OP(scan) != END && scan < last ){
2395 /* Peephole optimizer: */
2396 DEBUG_STUDYDATA(data,depth);
2397 DEBUG_PEEP("Peep",scan,depth);
2398 JOIN_EXACT(scan,&min,0);
2400 /* Follow the next-chain of the current node and optimize
2401 away all the NOTHINGs from it. */
2402 if (OP(scan) != CURLYX) {
2403 const int max = (reg_off_by_arg[OP(scan)]
2405 /* I32 may be smaller than U16 on CRAYs! */
2406 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2407 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2411 /* Skip NOTHING and LONGJMP. */
2412 while ((n = regnext(n))
2413 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2414 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2415 && off + noff < max)
2417 if (reg_off_by_arg[OP(scan)])
2420 NEXT_OFF(scan) = off;
2425 /* The principal pseudo-switch. Cannot be a switch, since we
2426 look into several different things. */
2427 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2428 || OP(scan) == IFTHEN) {
2429 next = regnext(scan);
2431 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2433 if (OP(next) == code || code == IFTHEN) {
2434 /* NOTE - There is similar code to this block below for handling
2435 TRIE nodes on a re-study. If you change stuff here check there
2437 I32 max1 = 0, min1 = I32_MAX, num = 0;
2438 struct regnode_charclass_class accum;
2439 regnode * const startbranch=scan;
2441 if (flags & SCF_DO_SUBSTR)
2442 scan_commit(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2443 if (flags & SCF_DO_STCLASS)
2444 cl_init_zero(pRExC_state, &accum);
2446 while (OP(scan) == code) {
2447 I32 deltanext, minnext, f = 0, fake;
2448 struct regnode_charclass_class this_class;
2451 data_fake.flags = 0;
2453 data_fake.whilem_c = data->whilem_c;
2454 data_fake.last_closep = data->last_closep;
2457 data_fake.last_closep = &fake;
2459 data_fake.pos_delta = delta;
2460 next = regnext(scan);
2461 scan = NEXTOPER(scan);
2463 scan = NEXTOPER(scan);
2464 if (flags & SCF_DO_STCLASS) {
2465 cl_init(pRExC_state, &this_class);
2466 data_fake.start_class = &this_class;
2467 f = SCF_DO_STCLASS_AND;
2469 if (flags & SCF_WHILEM_VISITED_POS)
2470 f |= SCF_WHILEM_VISITED_POS;
2472 /* we suppose the run is continuous, last=next...*/
2473 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2475 stopparen, recursed, NULL, f,depth+1);
2478 if (max1 < minnext + deltanext)
2479 max1 = minnext + deltanext;
2480 if (deltanext == I32_MAX)
2481 is_inf = is_inf_internal = 1;
2483 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2485 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2486 if ( stopmin > minnext)
2487 stopmin = min + min1;
2488 flags &= ~SCF_DO_SUBSTR;
2490 data->flags |= SCF_SEEN_ACCEPT;
2493 if (data_fake.flags & SF_HAS_EVAL)
2494 data->flags |= SF_HAS_EVAL;
2495 data->whilem_c = data_fake.whilem_c;
2497 if (flags & SCF_DO_STCLASS)
2498 cl_or(pRExC_state, &accum, &this_class);
2500 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2502 if (flags & SCF_DO_SUBSTR) {
2503 data->pos_min += min1;
2504 data->pos_delta += max1 - min1;
2505 if (max1 != min1 || is_inf)
2506 data->longest = &(data->longest_float);
2509 delta += max1 - min1;
2510 if (flags & SCF_DO_STCLASS_OR) {
2511 cl_or(pRExC_state, data->start_class, &accum);
2513 cl_and(data->start_class, and_withp);
2514 flags &= ~SCF_DO_STCLASS;
2517 else if (flags & SCF_DO_STCLASS_AND) {
2519 cl_and(data->start_class, &accum);
2520 flags &= ~SCF_DO_STCLASS;
2523 /* Switch to OR mode: cache the old value of
2524 * data->start_class */
2526 StructCopy(data->start_class, and_withp,
2527 struct regnode_charclass_class);
2528 flags &= ~SCF_DO_STCLASS_AND;
2529 StructCopy(&accum, data->start_class,
2530 struct regnode_charclass_class);
2531 flags |= SCF_DO_STCLASS_OR;
2532 data->start_class->flags |= ANYOF_EOS;
2536 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2539 Assuming this was/is a branch we are dealing with: 'scan' now
2540 points at the item that follows the branch sequence, whatever
2541 it is. We now start at the beginning of the sequence and look
2548 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2550 If we can find such a subseqence we need to turn the first
2551 element into a trie and then add the subsequent branch exact
2552 strings to the trie.
2556 1. patterns where the whole set of branch can be converted.
2558 2. patterns where only a subset can be converted.
2560 In case 1 we can replace the whole set with a single regop
2561 for the trie. In case 2 we need to keep the start and end
2564 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2565 becomes BRANCH TRIE; BRANCH X;
2567 There is an additional case, that being where there is a
2568 common prefix, which gets split out into an EXACT like node
2569 preceding the TRIE node.
2571 If x(1..n)==tail then we can do a simple trie, if not we make
2572 a "jump" trie, such that when we match the appropriate word
2573 we "jump" to the appopriate tail node. Essentailly we turn
2574 a nested if into a case structure of sorts.
2579 if (!re_trie_maxbuff) {
2580 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2581 if (!SvIOK(re_trie_maxbuff))
2582 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2584 if ( SvIV(re_trie_maxbuff)>=0 ) {
2586 regnode *first = (regnode *)NULL;
2587 regnode *last = (regnode *)NULL;
2588 regnode *tail = scan;
2593 SV * const mysv = sv_newmortal(); /* for dumping */
2595 /* var tail is used because there may be a TAIL
2596 regop in the way. Ie, the exacts will point to the
2597 thing following the TAIL, but the last branch will
2598 point at the TAIL. So we advance tail. If we
2599 have nested (?:) we may have to move through several
2603 while ( OP( tail ) == TAIL ) {
2604 /* this is the TAIL generated by (?:) */
2605 tail = regnext( tail );
2610 regprop(RExC_rx, mysv, tail );
2611 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2612 (int)depth * 2 + 2, "",
2613 "Looking for TRIE'able sequences. Tail node is: ",
2614 SvPV_nolen_const( mysv )
2620 step through the branches, cur represents each
2621 branch, noper is the first thing to be matched
2622 as part of that branch and noper_next is the
2623 regnext() of that node. if noper is an EXACT
2624 and noper_next is the same as scan (our current
2625 position in the regex) then the EXACT branch is
2626 a possible optimization target. Once we have
2627 two or more consequetive such branches we can
2628 create a trie of the EXACT's contents and stich
2629 it in place. If the sequence represents all of
2630 the branches we eliminate the whole thing and
2631 replace it with a single TRIE. If it is a
2632 subsequence then we need to stitch it in. This
2633 means the first branch has to remain, and needs
2634 to be repointed at the item on the branch chain
2635 following the last branch optimized. This could
2636 be either a BRANCH, in which case the
2637 subsequence is internal, or it could be the
2638 item following the branch sequence in which
2639 case the subsequence is at the end.
2643 /* dont use tail as the end marker for this traverse */
2644 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2645 regnode * const noper = NEXTOPER( cur );
2646 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
2647 regnode * const noper_next = regnext( noper );
2651 regprop(RExC_rx, mysv, cur);
2652 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2653 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2655 regprop(RExC_rx, mysv, noper);
2656 PerlIO_printf( Perl_debug_log, " -> %s",
2657 SvPV_nolen_const(mysv));
2660 regprop(RExC_rx, mysv, noper_next );
2661 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2662 SvPV_nolen_const(mysv));
2664 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2665 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2667 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2668 : PL_regkind[ OP( noper ) ] == EXACT )
2669 || OP(noper) == NOTHING )
2671 && noper_next == tail
2676 if ( !first || optype == NOTHING ) {
2677 if (!first) first = cur;
2678 optype = OP( noper );
2684 make_trie( pRExC_state,
2685 startbranch, first, cur, tail, count,
2688 if ( PL_regkind[ OP( noper ) ] == EXACT
2690 && noper_next == tail
2695 optype = OP( noper );
2705 regprop(RExC_rx, mysv, cur);
2706 PerlIO_printf( Perl_debug_log,
2707 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2708 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2712 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
2713 #ifdef TRIE_STUDY_OPT
2714 if ( ((made == MADE_EXACT_TRIE &&
2715 startbranch == first)
2716 || ( first_non_open == first )) &&
2718 flags |= SCF_TRIE_RESTUDY;
2719 if ( startbranch == first
2722 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2732 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2733 scan = NEXTOPER(NEXTOPER(scan));
2734 } else /* single branch is optimized. */
2735 scan = NEXTOPER(scan);
2737 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2738 scan_frame *newframe = NULL;
2743 if (OP(scan) != SUSPEND) {
2744 /* set the pointer */
2745 if (OP(scan) == GOSUB) {
2747 RExC_recurse[ARG2L(scan)] = scan;
2748 start = RExC_open_parens[paren-1];
2749 end = RExC_close_parens[paren-1];
2752 start = RExC_rxi->program + 1;
2756 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2757 SAVEFREEPV(recursed);
2759 if (!PAREN_TEST(recursed,paren+1)) {
2760 PAREN_SET(recursed,paren+1);
2761 Newx(newframe,1,scan_frame);
2763 if (flags & SCF_DO_SUBSTR) {
2764 scan_commit(pRExC_state,data,minlenp);
2765 data->longest = &(data->longest_float);
2767 is_inf = is_inf_internal = 1;
2768 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2769 cl_anything(pRExC_state, data->start_class);
2770 flags &= ~SCF_DO_STCLASS;
2773 Newx(newframe,1,scan_frame);
2776 end = regnext(scan);
2781 SAVEFREEPV(newframe);
2782 newframe->next = regnext(scan);
2783 newframe->last = last;
2784 newframe->stop = stopparen;
2785 newframe->prev = frame;
2795 else if (OP(scan) == EXACT) {
2796 I32 l = STR_LEN(scan);
2799 const U8 * const s = (U8*)STRING(scan);
2800 l = utf8_length(s, s + l);
2801 uc = utf8_to_uvchr(s, NULL);
2803 uc = *((U8*)STRING(scan));
2806 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2807 /* The code below prefers earlier match for fixed
2808 offset, later match for variable offset. */
2809 if (data->last_end == -1) { /* Update the start info. */
2810 data->last_start_min = data->pos_min;
2811 data->last_start_max = is_inf
2812 ? I32_MAX : data->pos_min + data->pos_delta;
2814 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2816 SvUTF8_on(data->last_found);
2818 SV * const sv = data->last_found;
2819 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2820 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2821 if (mg && mg->mg_len >= 0)
2822 mg->mg_len += utf8_length((U8*)STRING(scan),
2823 (U8*)STRING(scan)+STR_LEN(scan));
2825 data->last_end = data->pos_min + l;
2826 data->pos_min += l; /* As in the first entry. */
2827 data->flags &= ~SF_BEFORE_EOL;
2829 if (flags & SCF_DO_STCLASS_AND) {
2830 /* Check whether it is compatible with what we know already! */
2834 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2835 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2836 && (!(data->start_class->flags & ANYOF_FOLD)
2837 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2840 ANYOF_CLASS_ZERO(data->start_class);
2841 ANYOF_BITMAP_ZERO(data->start_class);
2843 ANYOF_BITMAP_SET(data->start_class, uc);
2844 data->start_class->flags &= ~ANYOF_EOS;
2846 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2848 else if (flags & SCF_DO_STCLASS_OR) {
2849 /* false positive possible if the class is case-folded */
2851 ANYOF_BITMAP_SET(data->start_class, uc);
2853 data->start_class->flags |= ANYOF_UNICODE_ALL;
2854 data->start_class->flags &= ~ANYOF_EOS;
2855 cl_and(data->start_class, and_withp);
2857 flags &= ~SCF_DO_STCLASS;
2859 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
2860 I32 l = STR_LEN(scan);
2861 UV uc = *((U8*)STRING(scan));
2863 /* Search for fixed substrings supports EXACT only. */
2864 if (flags & SCF_DO_SUBSTR) {
2866 scan_commit(pRExC_state, data, minlenp);
2869 const U8 * const s = (U8 *)STRING(scan);
2870 l = utf8_length(s, s + l);
2871 uc = utf8_to_uvchr(s, NULL);
2874 if (flags & SCF_DO_SUBSTR)
2876 if (flags & SCF_DO_STCLASS_AND) {
2877 /* Check whether it is compatible with what we know already! */
2881 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2882 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2883 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2885 ANYOF_CLASS_ZERO(data->start_class);
2886 ANYOF_BITMAP_ZERO(data->start_class);
2888 ANYOF_BITMAP_SET(data->start_class, uc);
2889 data->start_class->flags &= ~ANYOF_EOS;
2890 data->start_class->flags |= ANYOF_FOLD;
2891 if (OP(scan) == EXACTFL)
2892 data->start_class->flags |= ANYOF_LOCALE;
2895 else if (flags & SCF_DO_STCLASS_OR) {
2896 if (data->start_class->flags & ANYOF_FOLD) {
2897 /* false positive possible if the class is case-folded.
2898 Assume that the locale settings are the same... */
2900 ANYOF_BITMAP_SET(data->start_class, uc);
2901 data->start_class->flags &= ~ANYOF_EOS;
2903 cl_and(data->start_class, and_withp);
2905 flags &= ~SCF_DO_STCLASS;
2907 else if (strchr((const char*)PL_varies,OP(scan))) {
2908 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2909 I32 f = flags, pos_before = 0;
2910 regnode * const oscan = scan;
2911 struct regnode_charclass_class this_class;
2912 struct regnode_charclass_class *oclass = NULL;
2913 I32 next_is_eval = 0;
2915 switch (PL_regkind[OP(scan)]) {
2916 case WHILEM: /* End of (?:...)* . */
2917 scan = NEXTOPER(scan);
2920 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2921 next = NEXTOPER(scan);
2922 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2924 maxcount = REG_INFTY;
2925 next = regnext(scan);
2926 scan = NEXTOPER(scan);
2930 if (flags & SCF_DO_SUBSTR)
2935 if (flags & SCF_DO_STCLASS) {
2937 maxcount = REG_INFTY;
2938 next = regnext(scan);
2939 scan = NEXTOPER(scan);
2942 is_inf = is_inf_internal = 1;
2943 scan = regnext(scan);
2944 if (flags & SCF_DO_SUBSTR) {
2945 scan_commit(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
2946 data->longest = &(data->longest_float);
2948 goto optimize_curly_tail;
2950 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
2951 && (scan->flags == stopparen))
2956 mincount = ARG1(scan);
2957 maxcount = ARG2(scan);
2959 next = regnext(scan);
2960 if (OP(scan) == CURLYX) {
2961 I32 lp = (data ? *(data->last_closep) : 0);
2962 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
2964 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2965 next_is_eval = (OP(scan) == EVAL);
2967 if (flags & SCF_DO_SUBSTR) {
2968 if (mincount == 0) scan_commit(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
2969 pos_before = data->pos_min;
2973 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2975 data->flags |= SF_IS_INF;
2977 if (flags & SCF_DO_STCLASS) {
2978 cl_init(pRExC_state, &this_class);
2979 oclass = data->start_class;
2980 data->start_class = &this_class;
2981 f |= SCF_DO_STCLASS_AND;
2982 f &= ~SCF_DO_STCLASS_OR;
2984 /* These are the cases when once a subexpression
2985 fails at a particular position, it cannot succeed
2986 even after backtracking at the enclosing scope.
2988 XXXX what if minimal match and we are at the
2989 initial run of {n,m}? */
2990 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2991 f &= ~SCF_WHILEM_VISITED_POS;
2993 /* This will finish on WHILEM, setting scan, or on NULL: */
2994 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2995 last, data, stopparen, recursed, NULL,
2997 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
2999 if (flags & SCF_DO_STCLASS)
3000 data->start_class = oclass;
3001 if (mincount == 0 || minnext == 0) {
3002 if (flags & SCF_DO_STCLASS_OR) {
3003 cl_or(pRExC_state, data->start_class, &this_class);
3005 else if (flags & SCF_DO_STCLASS_AND) {
3006 /* Switch to OR mode: cache the old value of
3007 * data->start_class */
3009 StructCopy(data->start_class, and_withp,
3010 struct regnode_charclass_class);
3011 flags &= ~SCF_DO_STCLASS_AND;
3012 StructCopy(&this_class, data->start_class,
3013 struct regnode_charclass_class);
3014 flags |= SCF_DO_STCLASS_OR;
3015 data->start_class->flags |= ANYOF_EOS;
3017 } else { /* Non-zero len */
3018 if (flags & SCF_DO_STCLASS_OR) {
3019 cl_or(pRExC_state, data->start_class, &this_class);
3020 cl_and(data->start_class, and_withp);
3022 else if (flags & SCF_DO_STCLASS_AND)
3023 cl_and(data->start_class, &this_class);
3024 flags &= ~SCF_DO_STCLASS;
3026 if (!scan) /* It was not CURLYX, but CURLY. */
3028 if ( /* ? quantifier ok, except for (?{ ... }) */
3029 (next_is_eval || !(mincount == 0 && maxcount == 1))
3030 && (minnext == 0) && (deltanext == 0)
3031 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3032 && maxcount <= REG_INFTY/3 /* Complement check for big count */
3033 && ckWARN(WARN_REGEXP))
3036 "Quantifier unexpected on zero-length expression");
3039 min += minnext * mincount;
3040 is_inf_internal |= ((maxcount == REG_INFTY
3041 && (minnext + deltanext) > 0)
3042 || deltanext == I32_MAX);
3043 is_inf |= is_inf_internal;
3044 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3046 /* Try powerful optimization CURLYX => CURLYN. */
3047 if ( OP(oscan) == CURLYX && data
3048 && data->flags & SF_IN_PAR
3049 && !(data->flags & SF_HAS_EVAL)
3050 && !deltanext && minnext == 1 ) {
3051 /* Try to optimize to CURLYN. */
3052 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3053 regnode * const nxt1 = nxt;
3060 if (!strchr((const char*)PL_simple,OP(nxt))
3061 && !(PL_regkind[OP(nxt)] == EXACT
3062 && STR_LEN(nxt) == 1))
3068 if (OP(nxt) != CLOSE)
3070 if (RExC_open_parens) {
3071 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3072 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3074 /* Now we know that nxt2 is the only contents: */
3075 oscan->flags = (U8)ARG(nxt);
3077 OP(nxt1) = NOTHING; /* was OPEN. */
3080 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3081 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
3082 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
3083 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3084 OP(nxt + 1) = OPTIMIZED; /* was count. */
3085 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
3090 /* Try optimization CURLYX => CURLYM. */
3091 if ( OP(oscan) == CURLYX && data
3092 && !(data->flags & SF_HAS_PAR)
3093 && !(data->flags & SF_HAS_EVAL)
3094 && !deltanext /* atom is fixed width */
3095 && minnext != 0 /* CURLYM can't handle zero width */
3097 /* XXXX How to optimize if data == 0? */
3098 /* Optimize to a simpler form. */
3099 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3103 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3104 && (OP(nxt2) != WHILEM))
3106 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3107 /* Need to optimize away parenths. */
3108 if (data->flags & SF_IN_PAR) {
3109 /* Set the parenth number. */
3110 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3112 if (OP(nxt) != CLOSE)
3113 FAIL("Panic opt close");
3114 oscan->flags = (U8)ARG(nxt);
3115 if (RExC_open_parens) {
3116 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3117 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3119 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3120 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3123 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3124 OP(nxt + 1) = OPTIMIZED; /* was count. */
3125 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3126 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
3129 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3130 regnode *nnxt = regnext(nxt1);
3133 if (reg_off_by_arg[OP(nxt1)])
3134 ARG_SET(nxt1, nxt2 - nxt1);
3135 else if (nxt2 - nxt1 < U16_MAX)
3136 NEXT_OFF(nxt1) = nxt2 - nxt1;
3138 OP(nxt) = NOTHING; /* Cannot beautify */
3143 /* Optimize again: */
3144 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3145 NULL, stopparen, recursed, NULL, 0,depth+1);
3150 else if ((OP(oscan) == CURLYX)
3151 && (flags & SCF_WHILEM_VISITED_POS)
3152 /* See the comment on a similar expression above.
3153 However, this time it not a subexpression
3154 we care about, but the expression itself. */
3155 && (maxcount == REG_INFTY)
3156 && data && ++data->whilem_c < 16) {
3157 /* This stays as CURLYX, we can put the count/of pair. */
3158 /* Find WHILEM (as in regexec.c) */
3159 regnode *nxt = oscan + NEXT_OFF(oscan);
3161 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3163 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3164 | (RExC_whilem_seen << 4)); /* On WHILEM */
3166 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3168 if (flags & SCF_DO_SUBSTR) {
3169 SV *last_str = NULL;
3170 int counted = mincount != 0;
3172 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3173 #if defined(SPARC64_GCC_WORKAROUND)
3176 const char *s = NULL;
3179 if (pos_before >= data->last_start_min)
3182 b = data->last_start_min;
3185 s = SvPV_const(data->last_found, l);
3186 old = b - data->last_start_min;
3189 I32 b = pos_before >= data->last_start_min
3190 ? pos_before : data->last_start_min;
3192 const char * const s = SvPV_const(data->last_found, l);
3193 I32 old = b - data->last_start_min;
3197 old = utf8_hop((U8*)s, old) - (U8*)s;
3200 /* Get the added string: */
3201 last_str = newSVpvn(s + old, l);
3203 SvUTF8_on(last_str);
3204 if (deltanext == 0 && pos_before == b) {
3205 /* What was added is a constant string */
3207 SvGROW(last_str, (mincount * l) + 1);
3208 repeatcpy(SvPVX(last_str) + l,
3209 SvPVX_const(last_str), l, mincount - 1);
3210 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3211 /* Add additional parts. */
3212 SvCUR_set(data->last_found,
3213 SvCUR(data->last_found) - l);
3214 sv_catsv(data->last_found, last_str);
3216 SV * sv = data->last_found;
3218 SvUTF8(sv) && SvMAGICAL(sv) ?
3219 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3220 if (mg && mg->mg_len >= 0)
3221 mg->mg_len += CHR_SVLEN(last_str);
3223 data->last_end += l * (mincount - 1);
3226 /* start offset must point into the last copy */
3227 data->last_start_min += minnext * (mincount - 1);
3228 data->last_start_max += is_inf ? I32_MAX
3229 : (maxcount - 1) * (minnext + data->pos_delta);
3232 /* It is counted once already... */
3233 data->pos_min += minnext * (mincount - counted);
3234 data->pos_delta += - counted * deltanext +
3235 (minnext + deltanext) * maxcount - minnext * mincount;
3236 if (mincount != maxcount) {
3237 /* Cannot extend fixed substrings found inside
3239 scan_commit(pRExC_state,data,minlenp);
3240 if (mincount && last_str) {
3241 SV * const sv = data->last_found;
3242 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3243 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3247 sv_setsv(sv, last_str);
3248 data->last_end = data->pos_min;
3249 data->last_start_min =
3250 data->pos_min - CHR_SVLEN(last_str);
3251 data->last_start_max = is_inf
3253 : data->pos_min + data->pos_delta
3254 - CHR_SVLEN(last_str);
3256 data->longest = &(data->longest_float);
3258 SvREFCNT_dec(last_str);
3260 if (data && (fl & SF_HAS_EVAL))
3261 data->flags |= SF_HAS_EVAL;
3262 optimize_curly_tail:
3263 if (OP(oscan) != CURLYX) {
3264 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3266 NEXT_OFF(oscan) += NEXT_OFF(next);
3269 default: /* REF and CLUMP only? */
3270 if (flags & SCF_DO_SUBSTR) {
3271 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
3272 data->longest = &(data->longest_float);
3274 is_inf = is_inf_internal = 1;
3275 if (flags & SCF_DO_STCLASS_OR)
3276 cl_anything(pRExC_state, data->start_class);
3277 flags &= ~SCF_DO_STCLASS;
3281 else if (strchr((const char*)PL_simple,OP(scan))) {
3284 if (flags & SCF_DO_SUBSTR) {
3285 scan_commit(pRExC_state,data,minlenp);
3289 if (flags & SCF_DO_STCLASS) {
3290 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3292 /* Some of the logic below assumes that switching
3293 locale on will only add false positives. */
3294 switch (PL_regkind[OP(scan)]) {
3298 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3299 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3300 cl_anything(pRExC_state, data->start_class);
3303 if (OP(scan) == SANY)
3305 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3306 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3307 || (data->start_class->flags & ANYOF_CLASS));
3308 cl_anything(pRExC_state, data->start_class);
3310 if (flags & SCF_DO_STCLASS_AND || !value)
3311 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3314 if (flags & SCF_DO_STCLASS_AND)
3315 cl_and(data->start_class,
3316 (struct regnode_charclass_class*)scan);
3318 cl_or(pRExC_state, data->start_class,
3319 (struct regnode_charclass_class*)scan);
3322 if (flags & SCF_DO_STCLASS_AND) {
3323 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3324 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3325 for (value = 0; value < 256; value++)
3326 if (!isALNUM(value))
3327 ANYOF_BITMAP_CLEAR(data->start_class, value);
3331 if (data->start_class->flags & ANYOF_LOCALE)
3332 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3334 for (value = 0; value < 256; value++)
3336 ANYOF_BITMAP_SET(data->start_class, value);
3341 if (flags & SCF_DO_STCLASS_AND) {
3342 if (data->start_class->flags & ANYOF_LOCALE)
3343 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3346 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3347 data->start_class->flags |= ANYOF_LOCALE;
3351 if (flags & SCF_DO_STCLASS_AND) {
3352 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3353 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3354 for (value = 0; value < 256; value++)
3356 ANYOF_BITMAP_CLEAR(data->start_class, value);
3360 if (data->start_class->flags & ANYOF_LOCALE)
3361 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3363 for (value = 0; value < 256; value++)
3364 if (!isALNUM(value))
3365 ANYOF_BITMAP_SET(data->start_class, value);
3370 if (flags & SCF_DO_STCLASS_AND) {
3371 if (data->start_class->flags & ANYOF_LOCALE)
3372 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3375 data->start_class->flags |= ANYOF_LOCALE;
3376 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3380 if (flags & SCF_DO_STCLASS_AND) {
3381 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3382 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3383 for (value = 0; value < 256; value++)
3384 if (!isSPACE(value))
3385 ANYOF_BITMAP_CLEAR(data->start_class, value);
3389 if (data->start_class->flags & ANYOF_LOCALE)
3390 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3392 for (value = 0; value < 256; value++)
3394 ANYOF_BITMAP_SET(data->start_class, value);
3399 if (flags & SCF_DO_STCLASS_AND) {
3400 if (data->start_class->flags & ANYOF_LOCALE)
3401 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3404 data->start_class->flags |= ANYOF_LOCALE;
3405 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3409 if (flags & SCF_DO_STCLASS_AND) {
3410 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3411 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3412 for (value = 0; value < 256; value++)
3414 ANYOF_BITMAP_CLEAR(data->start_class, value);
3418 if (data->start_class->flags & ANYOF_LOCALE)
3419 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3421 for (value = 0; value < 256; value++)
3422 if (!isSPACE(value))
3423 ANYOF_BITMAP_SET(data->start_class, value);
3428 if (flags & SCF_DO_STCLASS_AND) {
3429 if (data->start_class->flags & ANYOF_LOCALE) {
3430 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3431 for (value = 0; value < 256; value++)
3432 if (!isSPACE(value))
3433 ANYOF_BITMAP_CLEAR(data->start_class, value);
3437 data->start_class->flags |= ANYOF_LOCALE;
3438 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3442 if (flags & SCF_DO_STCLASS_AND) {
3443 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3444 for (value = 0; value < 256; value++)
3445 if (!isDIGIT(value))
3446 ANYOF_BITMAP_CLEAR(data->start_class, value);
3449 if (data->start_class->flags & ANYOF_LOCALE)
3450 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3452 for (value = 0; value < 256; value++)
3454 ANYOF_BITMAP_SET(data->start_class, value);
3459 if (flags & SCF_DO_STCLASS_AND) {
3460 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3461 for (value = 0; value < 256; value++)
3463 ANYOF_BITMAP_CLEAR(data->start_class, value);
3466 if (data->start_class->flags & ANYOF_LOCALE)
3467 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3469 for (value = 0; value < 256; value++)
3470 if (!isDIGIT(value))
3471 ANYOF_BITMAP_SET(data->start_class, value);
3476 if (flags & SCF_DO_STCLASS_OR)
3477 cl_and(data->start_class, and_withp);
3478 flags &= ~SCF_DO_STCLASS;
3481 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3482 data->flags |= (OP(scan) == MEOL
3486 else if ( PL_regkind[OP(scan)] == BRANCHJ
3487 /* Lookbehind, or need to calculate parens/evals/stclass: */
3488 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3489 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3490 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3491 || OP(scan) == UNLESSM )
3493 /* Negative Lookahead/lookbehind
3494 In this case we can't do fixed string optimisation.
3497 I32 deltanext, minnext, fake = 0;
3499 struct regnode_charclass_class intrnl;
3502 data_fake.flags = 0;
3504 data_fake.whilem_c = data->whilem_c;
3505 data_fake.last_closep = data->last_closep;
3508 data_fake.last_closep = &fake;
3509 data_fake.pos_delta = delta;
3510 if ( flags & SCF_DO_STCLASS && !scan->flags
3511 && OP(scan) == IFMATCH ) { /* Lookahead */
3512 cl_init(pRExC_state, &intrnl);
3513 data_fake.start_class = &intrnl;
3514 f |= SCF_DO_STCLASS_AND;
3516 if (flags & SCF_WHILEM_VISITED_POS)
3517 f |= SCF_WHILEM_VISITED_POS;
3518 next = regnext(scan);
3519 nscan = NEXTOPER(NEXTOPER(scan));
3520 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3521 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3524 FAIL("Variable length lookbehind not implemented");
3526 else if (minnext > (I32)U8_MAX) {
3527 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3529 scan->flags = (U8)minnext;
3532 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3534 if (data_fake.flags & SF_HAS_EVAL)
3535 data->flags |= SF_HAS_EVAL;
3536 data->whilem_c = data_fake.whilem_c;
3538 if (f & SCF_DO_STCLASS_AND) {
3539 const int was = (data->start_class->flags & ANYOF_EOS);
3541 cl_and(data->start_class, &intrnl);
3543 data->start_class->flags |= ANYOF_EOS;
3546 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3548 /* Positive Lookahead/lookbehind
3549 In this case we can do fixed string optimisation,
3550 but we must be careful about it. Note in the case of
3551 lookbehind the positions will be offset by the minimum
3552 length of the pattern, something we won't know about
3553 until after the recurse.
3555 I32 deltanext, fake = 0;
3557 struct regnode_charclass_class intrnl;
3559 /* We use SAVEFREEPV so that when the full compile
3560 is finished perl will clean up the allocated
3561 minlens when its all done. This was we don't
3562 have to worry about freeing them when we know
3563 they wont be used, which would be a pain.
3566 Newx( minnextp, 1, I32 );
3567 SAVEFREEPV(minnextp);
3570 StructCopy(data, &data_fake, scan_data_t);
3571 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3574 scan_commit(pRExC_state, &data_fake,minlenp);
3575 data_fake.last_found=newSVsv(data->last_found);
3579 data_fake.last_closep = &fake;
3580 data_fake.flags = 0;
3581 data_fake.pos_delta = delta;
3583 data_fake.flags |= SF_IS_INF;
3584 if ( flags & SCF_DO_STCLASS && !scan->flags
3585 && OP(scan) == IFMATCH ) { /* Lookahead */
3586 cl_init(pRExC_state, &intrnl);
3587 data_fake.start_class = &intrnl;
3588 f |= SCF_DO_STCLASS_AND;
3590 if (flags & SCF_WHILEM_VISITED_POS)
3591 f |= SCF_WHILEM_VISITED_POS;
3592 next = regnext(scan);
3593 nscan = NEXTOPER(NEXTOPER(scan));
3595 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3596 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3599 FAIL("Variable length lookbehind not implemented");
3601 else if (*minnextp > (I32)U8_MAX) {
3602 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3604 scan->flags = (U8)*minnextp;
3609 if (f & SCF_DO_STCLASS_AND) {
3610 const int was = (data->start_class->flags & ANYOF_EOS);
3612 cl_and(data->start_class, &intrnl);
3614 data->start_class->flags |= ANYOF_EOS;
3617 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3619 if (data_fake.flags & SF_HAS_EVAL)
3620 data->flags |= SF_HAS_EVAL;
3621 data->whilem_c = data_fake.whilem_c;
3622 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3623 if (RExC_rx->minlen<*minnextp)
3624 RExC_rx->minlen=*minnextp;
3625 scan_commit(pRExC_state, &data_fake, minnextp);
3626 SvREFCNT_dec(data_fake.last_found);
3628 if ( data_fake.minlen_fixed != minlenp )
3630 data->offset_fixed= data_fake.offset_fixed;
3631 data->minlen_fixed= data_fake.minlen_fixed;
3632 data->lookbehind_fixed+= scan->flags;
3634 if ( data_fake.minlen_float != minlenp )
3636 data->minlen_float= data_fake.minlen_float;
3637 data->offset_float_min=data_fake.offset_float_min;
3638 data->offset_float_max=data_fake.offset_float_max;
3639 data->lookbehind_float+= scan->flags;
3648 else if (OP(scan) == OPEN) {
3649 if (stopparen != (I32)ARG(scan))
3652 else if (OP(scan) == CLOSE) {
3653 if (stopparen == (I32)ARG(scan)) {
3656 if ((I32)ARG(scan) == is_par) {
3657 next = regnext(scan);
3659 if ( next && (OP(next) != WHILEM) && next < last)
3660 is_par = 0; /* Disable optimization */
3663 *(data->last_closep) = ARG(scan);
3665 else if (OP(scan) == EVAL) {
3667 data->flags |= SF_HAS_EVAL;
3669 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3670 if (flags & SCF_DO_SUBSTR) {
3671 scan_commit(pRExC_state,data,minlenp);
3672 flags &= ~SCF_DO_SUBSTR;
3674 if (data && OP(scan)==ACCEPT) {
3675 data->flags |= SCF_SEEN_ACCEPT;
3680 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3682 if (flags & SCF_DO_SUBSTR) {
3683 scan_commit(pRExC_state,data,minlenp);
3684 data->longest = &(data->longest_float);
3686 is_inf = is_inf_internal = 1;
3687 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3688 cl_anything(pRExC_state, data->start_class);
3689 flags &= ~SCF_DO_STCLASS;
3691 else if (OP(scan) == GPOS) {
3692 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
3693 !(delta || is_inf || (data && data->pos_delta)))
3695 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
3696 RExC_rx->extflags |= RXf_ANCH_GPOS;
3697 if (RExC_rx->gofs < (U32)min)
3698 RExC_rx->gofs = min;
3700 RExC_rx->extflags |= RXf_GPOS_FLOAT;
3704 #ifdef TRIE_STUDY_OPT
3705 #ifdef FULL_TRIE_STUDY
3706 else if (PL_regkind[OP(scan)] == TRIE) {
3707 /* NOTE - There is similar code to this block above for handling
3708 BRANCH nodes on the initial study. If you change stuff here
3710 regnode *trie_node= scan;
3711 regnode *tail= regnext(scan);
3712 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
3713 I32 max1 = 0, min1 = I32_MAX;
3714 struct regnode_charclass_class accum;
3716 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
3717 scan_commit(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
3718 if (flags & SCF_DO_STCLASS)
3719 cl_init_zero(pRExC_state, &accum);
3725 const regnode *nextbranch= NULL;
3728 for ( word=1 ; word <= trie->wordcount ; word++)
3730 I32 deltanext=0, minnext=0, f = 0, fake;
3731 struct regnode_charclass_class this_class;
3733 data_fake.flags = 0;
3735 data_fake.whilem_c = data->whilem_c;
3736 data_fake.last_closep = data->last_closep;
3739 data_fake.last_closep = &fake;
3740 data_fake.pos_delta = delta;
3741 if (flags & SCF_DO_STCLASS) {
3742 cl_init(pRExC_state, &this_class);
3743 data_fake.start_class = &this_class;
3744 f = SCF_DO_STCLASS_AND;
3746 if (flags & SCF_WHILEM_VISITED_POS)
3747 f |= SCF_WHILEM_VISITED_POS;
3749 if (trie->jump[word]) {
3751 nextbranch = trie_node + trie->jump[0];
3752 scan= trie_node + trie->jump[word];
3753 /* We go from the jump point to the branch that follows
3754 it. Note this means we need the vestigal unused branches
3755 even though they arent otherwise used.
3757 minnext = study_chunk(pRExC_state, &scan, minlenp,
3758 &deltanext, (regnode *)nextbranch, &data_fake,
3759 stopparen, recursed, NULL, f,depth+1);
3761 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3762 nextbranch= regnext((regnode*)nextbranch);
3764 if (min1 > (I32)(minnext + trie->minlen))
3765 min1 = minnext + trie->minlen;
3766 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3767 max1 = minnext + deltanext + trie->maxlen;
3768 if (deltanext == I32_MAX)
3769 is_inf = is_inf_internal = 1;
3771 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3773 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3774 if ( stopmin > min + min1)
3775 stopmin = min + min1;
3776 flags &= ~SCF_DO_SUBSTR;
3778 data->flags |= SCF_SEEN_ACCEPT;
3781 if (data_fake.flags & SF_HAS_EVAL)
3782 data->flags |= SF_HAS_EVAL;
3783 data->whilem_c = data_fake.whilem_c;
3785 if (flags & SCF_DO_STCLASS)
3786 cl_or(pRExC_state, &accum, &this_class);
3789 if (flags & SCF_DO_SUBSTR) {
3790 data->pos_min += min1;
3791 data->pos_delta += max1 - min1;
3792 if (max1 != min1 || is_inf)
3793 data->longest = &(data->longest_float);
3796 delta += max1 - min1;
3797 if (flags & SCF_DO_STCLASS_OR) {
3798 cl_or(pRExC_state, data->start_class, &accum);
3800 cl_and(data->start_class, and_withp);
3801 flags &= ~SCF_DO_STCLASS;
3804 else if (flags & SCF_DO_STCLASS_AND) {
3806 cl_and(data->start_class, &accum);
3807 flags &= ~SCF_DO_STCLASS;
3810 /* Switch to OR mode: cache the old value of
3811 * data->start_class */
3813 StructCopy(data->start_class, and_withp,
3814 struct regnode_charclass_class);
3815 flags &= ~SCF_DO_STCLASS_AND;
3816 StructCopy(&accum, data->start_class,
3817 struct regnode_charclass_class);
3818 flags |= SCF_DO_STCLASS_OR;
3819 data->start_class->flags |= ANYOF_EOS;
3826 else if (PL_regkind[OP(scan)] == TRIE) {
3827 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
3830 min += trie->minlen;
3831 delta += (trie->maxlen - trie->minlen);
3832 flags &= ~SCF_DO_STCLASS; /* xxx */
3833 if (flags & SCF_DO_SUBSTR) {
3834 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
3835 data->pos_min += trie->minlen;
3836 data->pos_delta += (trie->maxlen - trie->minlen);
3837 if (trie->maxlen != trie->minlen)
3838 data->longest = &(data->longest_float);
3840 if (trie->jump) /* no more substrings -- for now /grr*/
3841 flags &= ~SCF_DO_SUBSTR;
3843 #endif /* old or new */
3844 #endif /* TRIE_STUDY_OPT */
3845 /* Else: zero-length, ignore. */
3846 scan = regnext(scan);
3851 stopparen = frame->stop;
3852 frame = frame->prev;
3853 goto fake_study_recurse;
3860 *deltap = is_inf_internal ? I32_MAX : delta;
3861 if (flags & SCF_DO_SUBSTR && is_inf)
3862 data->pos_delta = I32_MAX - data->pos_min;
3863 if (is_par > (I32)U8_MAX)
3865 if (is_par && pars==1 && data) {
3866 data->flags |= SF_IN_PAR;
3867 data->flags &= ~SF_HAS_PAR;
3869 else if (pars && data) {
3870 data->flags |= SF_HAS_PAR;
3871 data->flags &= ~SF_IN_PAR;
3873 if (flags & SCF_DO_STCLASS_OR)
3874 cl_and(data->start_class, and_withp);
3875 if (flags & SCF_TRIE_RESTUDY)
3876 data->flags |= SCF_TRIE_RESTUDY;
3878 DEBUG_STUDYDATA(data,depth);
3880 return min < stopmin ? min : stopmin;
3884 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
3886 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
3888 Renewc(RExC_rxi->data,
3889 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
3890 char, struct reg_data);
3892 Renew(RExC_rxi->data->what, count + n, U8);
3894 Newx(RExC_rxi->data->what, n, U8);
3895 RExC_rxi->data->count = count + n;
3896 Copy(s, RExC_rxi->data->what + count, n, U8);
3900 #ifndef PERL_IN_XSUB_RE
3902 Perl_reginitcolors(pTHX)
3905 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
3907 char *t = savepv(s);
3911 t = strchr(t, '\t');
3917 PL_colors[i] = t = (char *)"";
3922 PL_colors[i++] = (char *)"";
3929 #ifdef TRIE_STUDY_OPT
3930 #define CHECK_RESTUDY_GOTO \
3932 (data.flags & SCF_TRIE_RESTUDY) \
3936 #define CHECK_RESTUDY_GOTO
3940 - pregcomp - compile a regular expression into internal code
3942 * We can't allocate space until we know how big the compiled form will be,
3943 * but we can't compile it (and thus know how big it is) until we've got a
3944 * place to put the code. So we cheat: we compile it twice, once with code
3945 * generation turned off and size counting turned on, and once "for real".
3946 * This also means that we don't allocate space until we are sure that the
3947 * thing really will compile successfully, and we never have to move the
3948 * code and thus invalidate pointers into it. (Note that it has to be in
3949 * one piece because free() must be able to free it all.) [NB: not true in perl]
3951 * Beware that the optimization-preparation code in here knows about some
3952 * of the structure of the compiled regexp. [I'll say.]
3957 #ifndef PERL_IN_XSUB_RE
3958 #define RE_ENGINE_PTR &PL_core_reg_engine
3960 extern const struct regexp_engine my_reg_engine;
3961 #define RE_ENGINE_PTR &my_reg_engine
3963 /* these make a few things look better, to avoid indentation */
3964 #define BEGIN_BLOCK {
3968 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
3971 GET_RE_DEBUG_FLAGS_DECL;
3972 DEBUG_r(if (!PL_colorset) reginitcolors());
3973 #ifndef PERL_IN_XSUB_RE
3975 /* Dispatch a request to compile a regexp to correct
3977 HV * const table = GvHV(PL_hintgv);
3979 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
3980 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
3981 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
3983 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
3986 return CALLREGCOMP_ENG(eng, exp, xend, pm);
3993 register regexp_internal *ri;
4001 RExC_state_t RExC_state;
4002 RExC_state_t * const pRExC_state = &RExC_state;
4003 #ifdef TRIE_STUDY_OPT
4005 RExC_state_t copyRExC_state;
4008 FAIL("NULL regexp argument");
4010 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
4014 SV *dsv= sv_newmortal();
4015 RE_PV_QUOTED_DECL(s, RExC_utf8,
4016 dsv, RExC_precomp, (xend - exp), 60);
4017 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4018 PL_colors[4],PL_colors[5],s);
4020 RExC_flags = pm->op_pmflags;
4024 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4025 RExC_seen_evals = 0;
4028 /* First pass: determine size, legality. */
4037 RExC_emit = &PL_regdummy;
4038 RExC_whilem_seen = 0;
4039 RExC_charnames = NULL;
4040 RExC_open_parens = NULL;
4041 RExC_close_parens = NULL;
4043 RExC_paren_names = NULL;
4044 RExC_recurse = NULL;
4045 RExC_recurse_count = 0;
4047 #if 0 /* REGC() is (currently) a NOP at the first pass.
4048 * Clever compilers notice this and complain. --jhi */
4049 REGC((U8)REG_MAGIC, (char*)RExC_emit);
4051 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4052 if (reg(pRExC_state, 0, &flags,1) == NULL) {
4053 RExC_precomp = NULL;
4057 PerlIO_printf(Perl_debug_log,
4058 "Required size %"IVdf" nodes\n"
4059 "Starting second pass (creation)\n",
4062 RExC_lastparse=NULL;
4064 /* Small enough for pointer-storage convention?
4065 If extralen==0, this means that we will not need long jumps. */
4066 if (RExC_size >= 0x10000L && RExC_extralen)
4067 RExC_size += RExC_extralen;
4070 if (RExC_whilem_seen > 15)
4071 RExC_whilem_seen = 15;
4074 /* Make room for a sentinel value at the end of the program */
4078 /* Allocate space and zero-initialize. Note, the two step process
4079 of zeroing when in debug mode, thus anything assigned has to
4080 happen after that */
4081 Newxz(r, 1, regexp);
4082 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4083 char, regexp_internal);
4084 if ( r == NULL || ri == NULL )
4085 FAIL("Regexp out of space");
4087 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
4088 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
4090 /* bulk initialize base fields with 0. */
4091 Zero(ri, sizeof(regexp_internal), char);
4094 /* non-zero initialization begins here */
4096 r->engine= RE_ENGINE_PTR;
4098 r->prelen = xend - exp;
4099 r->precomp = savepvn(RExC_precomp, r->prelen);
4100 r->extflags = pm->op_pmflags & RXf_PMf_COMPILETIME;
4102 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4104 if (RExC_seen & REG_SEEN_RECURSE) {
4105 Newxz(RExC_open_parens, RExC_npar,regnode *);
4106 SAVEFREEPV(RExC_open_parens);
4107 Newxz(RExC_close_parens,RExC_npar,regnode *);
4108 SAVEFREEPV(RExC_close_parens);
4111 /* Useful during FAIL. */
4112 Newxz(ri->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4114 ri->offsets[0] = RExC_size;
4116 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
4117 "%s %"UVuf" bytes for offset annotations.\n",
4118 ri->offsets ? "Got" : "Couldn't get",
4119 (UV)((2*RExC_size+1) * sizeof(U32))));
4124 /* Second pass: emit code. */
4125 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
4131 RExC_emit_start = ri->program;
4132 RExC_emit = ri->program;
4134 /* put a sentinal on the end of the program so we can check for
4136 ri->program[RExC_size].type = 255;
4138 /* Store the count of eval-groups for security checks: */
4139 RExC_emit->next_off = (RExC_seen_evals > (I32)U16_MAX) ? U16_MAX : (U16)RExC_seen_evals;
4140 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
4141 if (reg(pRExC_state, 0, &flags,1) == NULL)
4144 /* XXXX To minimize changes to RE engine we always allocate
4145 3-units-long substrs field. */
4146 Newx(r->substrs, 1, struct reg_substr_data);
4147 if (RExC_recurse_count) {
4148 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4149 SAVEFREEPV(RExC_recurse);
4153 r->minlen = minlen = sawplus = sawopen = 0;
4154 Zero(r->substrs, 1, struct reg_substr_data);
4156 #ifdef TRIE_STUDY_OPT
4159 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
4161 RExC_state = copyRExC_state;
4162 if (seen & REG_TOP_LEVEL_BRANCHES)
4163 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4165 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
4166 if (data.last_found) {
4167 SvREFCNT_dec(data.longest_fixed);
4168 SvREFCNT_dec(data.longest_float);
4169 SvREFCNT_dec(data.last_found);
4171 StructCopy(&zero_scan_data, &data, scan_data_t);
4173 StructCopy(&zero_scan_data, &data, scan_data_t);
4174 copyRExC_state = RExC_state;
4177 StructCopy(&zero_scan_data, &data, scan_data_t);
4180 /* Dig out information for optimizations. */
4181 r->extflags = pm->op_pmflags & RXf_PMf_COMPILETIME; /* Again? */
4182 pm->op_pmflags = RExC_flags;
4184 r->extflags |= RXf_UTF8; /* Unicode in it? */
4185 ri->regstclass = NULL;
4186 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
4187 r->intflags |= PREGf_NAUGHTY;
4188 scan = ri->program + 1; /* First BRANCH. */
4190 /* testing for BRANCH here tells us whether there is "must appear"
4191 data in the pattern. If there is then we can use it for optimisations */
4192 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
4194 STRLEN longest_float_length, longest_fixed_length;
4195 struct regnode_charclass_class ch_class; /* pointed to by data */
4197 I32 last_close = 0; /* pointed to by data */
4200 /* Skip introductions and multiplicators >= 1. */
4201 while ((OP(first) == OPEN && (sawopen = 1)) ||
4202 /* An OR of *one* alternative - should not happen now. */
4203 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
4204 /* for now we can't handle lookbehind IFMATCH*/
4205 (OP(first) == IFMATCH && !first->flags) ||
4206 (OP(first) == PLUS) ||
4207 (OP(first) == MINMOD) ||
4208 /* An {n,m} with n>0 */
4209 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) )
4212 if (OP(first) == PLUS)
4215 first += regarglen[OP(first)];
4216 if (OP(first) == IFMATCH) {
4217 first = NEXTOPER(first);
4218 first += EXTRA_STEP_2ARGS;
4219 } else /* XXX possible optimisation for /(?=)/ */
4220 first = NEXTOPER(first);
4223 /* Starting-point info. */
4225 DEBUG_PEEP("first:",first,0);
4226 /* Ignore EXACT as we deal with it later. */
4227 if (PL_regkind[OP(first)] == EXACT) {
4228 if (OP(first) == EXACT)
4229 NOOP; /* Empty, get anchored substr later. */
4230 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
4231 ri->regstclass = first;
4234 else if (PL_regkind[OP(first)] == TRIE &&
4235 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
4238 /* this can happen only on restudy */
4239 if ( OP(first) == TRIE ) {
4240 struct regnode_1 *trieop =
4241 PerlMemShared_calloc(1, sizeof(struct regnode_1));
4242 StructCopy(first,trieop,struct regnode_1);
4243 trie_op=(regnode *)trieop;
4245 struct regnode_charclass *trieop =
4246 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
4247 StructCopy(first,trieop,struct regnode_charclass);
4248 trie_op=(regnode *)trieop;
4251 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4252 ri->regstclass = trie_op;
4255 else if (strchr((const char*)PL_simple,OP(first)))
4256 ri->regstclass = first;
4257 else if (PL_regkind[OP(first)] == BOUND ||
4258 PL_regkind[OP(first)] == NBOUND)
4259 ri->regstclass = first;
4260 else if (PL_regkind[OP(first)] == BOL) {
4261 r->extflags |= (OP(first) == MBOL
4263 : (OP(first) == SBOL
4266 first = NEXTOPER(first);
4269 else if (OP(first) == GPOS) {
4270 r->extflags |= RXf_ANCH_GPOS;
4271 first = NEXTOPER(first);
4274 else if ((!sawopen || !RExC_sawback) &&
4275 (OP(first) == STAR &&
4276 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
4277 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
4279 /* turn .* into ^.* with an implied $*=1 */
4281 (OP(NEXTOPER(first)) == REG_ANY)
4284 r->extflags |= type;
4285 r->intflags |= PREGf_IMPLICIT;
4286 first = NEXTOPER(first);
4289 if (sawplus && (!sawopen || !RExC_sawback)
4290 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
4291 /* x+ must match at the 1st pos of run of x's */
4292 r->intflags |= PREGf_SKIP;
4294 /* Scan is after the zeroth branch, first is atomic matcher. */
4295 #ifdef TRIE_STUDY_OPT
4298 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4299 (IV)(first - scan + 1))
4303 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4304 (IV)(first - scan + 1))
4310 * If there's something expensive in the r.e., find the
4311 * longest literal string that must appear and make it the
4312 * regmust. Resolve ties in favor of later strings, since
4313 * the regstart check works with the beginning of the r.e.
4314 * and avoiding duplication strengthens checking. Not a
4315 * strong reason, but sufficient in the absence of others.
4316 * [Now we resolve ties in favor of the earlier string if
4317 * it happens that c_offset_min has been invalidated, since the
4318 * earlier string may buy us something the later one won't.]
4321 data.longest_fixed = newSVpvs("");
4322 data.longest_float = newSVpvs("");
4323 data.last_found = newSVpvs("");
4324 data.longest = &(data.longest_fixed);
4326 if (!ri->regstclass) {
4327 cl_init(pRExC_state, &ch_class);
4328 data.start_class = &ch_class;
4329 stclass_flag = SCF_DO_STCLASS_AND;
4330 } else /* XXXX Check for BOUND? */
4332 data.last_closep = &last_close;
4334 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
4335 &data, -1, NULL, NULL,
4336 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
4342 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
4343 && data.last_start_min == 0 && data.last_end > 0
4344 && !RExC_seen_zerolen
4345 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4346 r->extflags |= RXf_CHECK_ALL;
4347 scan_commit(pRExC_state, &data,&minlen);
4348 SvREFCNT_dec(data.last_found);
4350 /* Note that code very similar to this but for anchored string
4351 follows immediately below, changes may need to be made to both.
4354 longest_float_length = CHR_SVLEN(data.longest_float);
4355 if (longest_float_length
4356 || (data.flags & SF_FL_BEFORE_EOL
4357 && (!(data.flags & SF_FL_BEFORE_MEOL)
4358 || (RExC_flags & RXf_PMf_MULTILINE))))
4362 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
4363 && data.offset_fixed == data.offset_float_min
4364 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4365 goto remove_float; /* As in (a)+. */
4367 /* copy the information about the longest float from the reg_scan_data
4368 over to the program. */
4369 if (SvUTF8(data.longest_float)) {
4370 r->float_utf8 = data.longest_float;
4371 r->float_substr = NULL;
4373 r->float_substr = data.longest_float;
4374 r->float_utf8 = NULL;
4376 /* float_end_shift is how many chars that must be matched that
4377 follow this item. We calculate it ahead of time as once the
4378 lookbehind offset is added in we lose the ability to correctly
4380 ml = data.minlen_float ? *(data.minlen_float)
4381 : (I32)longest_float_length;
4382 r->float_end_shift = ml - data.offset_float_min
4383 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4384 + data.lookbehind_float;
4385 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
4386 r->float_max_offset = data.offset_float_max;
4387 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
4388 r->float_max_offset -= data.lookbehind_float;
4390 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4391 && (!(data.flags & SF_FL_BEFORE_MEOL)
4392 || (RExC_flags & RXf_PMf_MULTILINE)));
4393 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
4397 r->float_substr = r->float_utf8 = NULL;
4398 SvREFCNT_dec(data.longest_float);
4399 longest_float_length = 0;
4402 /* Note that code very similar to this but for floating string
4403 is immediately above, changes may need to be made to both.
4406 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
4407 if (longest_fixed_length
4408 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4409 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4410 || (RExC_flags & RXf_PMf_MULTILINE))))
4414 /* copy the information about the longest fixed
4415 from the reg_scan_data over to the program. */
4416 if (SvUTF8(data.longest_fixed)) {
4417 r->anchored_utf8 = data.longest_fixed;
4418 r->anchored_substr = NULL;
4420 r->anchored_substr = data.longest_fixed;
4421 r->anchored_utf8 = NULL;
4423 /* fixed_end_shift is how many chars that must be matched that
4424 follow this item. We calculate it ahead of time as once the
4425 lookbehind offset is added in we lose the ability to correctly
4427 ml = data.minlen_fixed ? *(data.minlen_fixed)
4428 : (I32)longest_fixed_length;
4429 r->anchored_end_shift = ml - data.offset_fixed
4430 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4431 + data.lookbehind_fixed;
4432 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4434 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4435 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4436 || (RExC_flags & RXf_PMf_MULTILINE)));
4437 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
4440 r->anchored_substr = r->anchored_utf8 = NULL;
4441 SvREFCNT_dec(data.longest_fixed);
4442 longest_fixed_length = 0;
4445 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4446 ri->regstclass = NULL;
4447 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4449 && !(data.start_class->flags & ANYOF_EOS)
4450 && !cl_is_anything(data.start_class))
4452 const U32 n = add_data(pRExC_state, 1, "f");
4454 Newx(RExC_rxi->data->data[n], 1,
4455 struct regnode_charclass_class);
4456 StructCopy(data.start_class,
4457 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4458 struct regnode_charclass_class);
4459 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
4460 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
4461 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
4462 regprop(r, sv, (regnode*)data.start_class);
4463 PerlIO_printf(Perl_debug_log,
4464 "synthetic stclass \"%s\".\n",
4465 SvPVX_const(sv));});
4468 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
4469 if (longest_fixed_length > longest_float_length) {
4470 r->check_end_shift = r->anchored_end_shift;
4471 r->check_substr = r->anchored_substr;
4472 r->check_utf8 = r->anchored_utf8;
4473 r->check_offset_min = r->check_offset_max = r->anchored_offset;
4474 if (r->extflags & RXf_ANCH_SINGLE)
4475 r->extflags |= RXf_NOSCAN;
4478 r->check_end_shift = r->float_end_shift;
4479 r->check_substr = r->float_substr;
4480 r->check_utf8 = r->float_utf8;
4481 r->check_offset_min = r->float_min_offset;
4482 r->check_offset_max = r->float_max_offset;
4484 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4485 This should be changed ASAP! */
4486 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
4487 r->extflags |= RXf_USE_INTUIT;
4488 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
4489 r->extflags |= RXf_INTUIT_TAIL;
4491 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4492 if ( (STRLEN)minlen < longest_float_length )
4493 minlen= longest_float_length;
4494 if ( (STRLEN)minlen < longest_fixed_length )
4495 minlen= longest_fixed_length;
4499 /* Several toplevels. Best we can is to set minlen. */
4501 struct regnode_charclass_class ch_class;
4504 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
4506 scan = ri->program + 1;
4507 cl_init(pRExC_state, &ch_class);
4508 data.start_class = &ch_class;
4509 data.last_closep = &last_close;
4512 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
4513 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
4517 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
4518 = r->float_substr = r->float_utf8 = NULL;
4519 if (!(data.start_class->flags & ANYOF_EOS)
4520 && !cl_is_anything(data.start_class))
4522 const U32 n = add_data(pRExC_state, 1, "f");
4524 Newx(RExC_rxi->data->data[n], 1,
4525 struct regnode_charclass_class);
4526 StructCopy(data.start_class,
4527 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4528 struct regnode_charclass_class);
4529 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
4530 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
4531 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
4532 regprop(r, sv, (regnode*)data.start_class);
4533 PerlIO_printf(Perl_debug_log,
4534 "synthetic stclass \"%s\".\n",
4535 SvPVX_const(sv));});
4539 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4540 the "real" pattern. */
4542 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
4543 (IV)minlen, (IV)r->minlen);
4545 r->minlenret = minlen;
4546 if (r->minlen < minlen)
4549 if (RExC_seen & REG_SEEN_GPOS)
4550 r->extflags |= RXf_GPOS_SEEN;
4551 if (RExC_seen & REG_SEEN_LOOKBEHIND)
4552 r->extflags |= RXf_LOOKBEHIND_SEEN;
4553 if (RExC_seen & REG_SEEN_EVAL)
4554 r->extflags |= RXf_EVAL_SEEN;
4555 if (RExC_seen & REG_SEEN_CANY)
4556 r->extflags |= RXf_CANY_SEEN;
4557 if (RExC_seen & REG_SEEN_VERBARG)
4558 r->intflags |= PREGf_VERBARG_SEEN;
4559 if (RExC_seen & REG_SEEN_CUTGROUP)
4560 r->intflags |= PREGf_CUTGROUP_SEEN;
4561 if (RExC_paren_names)
4562 r->paren_names = (HV*)SvREFCNT_inc(RExC_paren_names);
4564 r->paren_names = NULL;
4566 if (RExC_recurse_count) {
4567 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
4568 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
4569 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
4572 Newxz(r->startp, RExC_npar, I32);
4573 Newxz(r->endp, RExC_npar, I32);
4574 /* assume we don't need to swap parens around before we match */
4577 PerlIO_printf(Perl_debug_log,"Final program:\n");
4580 DEBUG_OFFSETS_r(if (ri->offsets) {
4581 const U32 len = ri->offsets[0];
4583 GET_RE_DEBUG_FLAGS_DECL;
4584 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->offsets[0]);
4585 for (i = 1; i <= len; i++) {
4586 if (ri->offsets[i*2-1] || ri->offsets[i*2])
4587 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
4588 (UV)i, (UV)ri->offsets[i*2-1], (UV)ri->offsets[i*2]);
4590 PerlIO_printf(Perl_debug_log, "\n");
4596 #undef CORE_ONLY_BLOCK
4598 #undef RE_ENGINE_PTR
4600 #ifndef PERL_IN_XSUB_RE
4602 Perl_reg_named_buff_sv(pTHX_ SV* namesv)
4604 I32 parno = 0; /* no match */
4606 const REGEXP * const rx = PM_GETRE(PL_curpm);
4607 if (rx && rx->paren_names) {
4608 HE *he_str = hv_fetch_ent( rx->paren_names, namesv, 0, 0 );
4611 SV* sv_dat=HeVAL(he_str);
4612 I32 *nums=(I32*)SvPVX(sv_dat);
4613 for ( i=0; i<SvIVX(sv_dat); i++ ) {
4614 if ((I32)(rx->lastparen) >= nums[i] &&
4615 rx->endp[nums[i]] != -1)
4628 SV *sv= sv_newmortal();
4629 Perl_sv_setpvf(aTHX_ sv, "%"IVdf,(IV)parno);
4630 gv_paren= Perl_gv_fetchsv(aTHX_ sv, GV_ADD, SVt_PVGV);
4631 return GvSVn(gv_paren);
4636 /* Scans the name of a named buffer from the pattern.
4637 * If flags is REG_RSN_RETURN_NULL returns null.
4638 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
4639 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
4640 * to the parsed name as looked up in the RExC_paren_names hash.
4641 * If there is an error throws a vFAIL().. type exception.
4644 #define REG_RSN_RETURN_NULL 0
4645 #define REG_RSN_RETURN_NAME 1
4646 #define REG_RSN_RETURN_DATA 2
4649 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) {
4650 char *name_start = RExC_parse;
4653 while( isIDFIRST_uni(utf8n_to_uvchr((U8*)RExC_parse,
4654 RExC_end - RExC_parse, &numlen, UTF8_ALLOW_DEFAULT)))
4656 RExC_parse += numlen;
4659 while( isIDFIRST(*RExC_parse) )
4663 SV* sv_name = sv_2mortal(Perl_newSVpvn(aTHX_ name_start,
4664 (int)(RExC_parse - name_start)));
4667 if ( flags == REG_RSN_RETURN_NAME)
4669 else if (flags==REG_RSN_RETURN_DATA) {
4672 if ( ! sv_name ) /* should not happen*/
4673 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
4674 if (RExC_paren_names)
4675 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
4677 sv_dat = HeVAL(he_str);
4679 vFAIL("Reference to nonexistent named group");
4683 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
4690 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
4691 int rem=(int)(RExC_end - RExC_parse); \
4700 if (RExC_lastparse!=RExC_parse) \
4701 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
4704 iscut ? "..." : "<" \
4707 PerlIO_printf(Perl_debug_log,"%16s",""); \
4712 num=REG_NODE_NUM(RExC_emit); \
4713 if (RExC_lastnum!=num) \
4714 PerlIO_printf(Perl_debug_log,"|%4d",num); \
4716 PerlIO_printf(Perl_debug_log,"|%4s",""); \
4717 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
4718 (int)((depth*2)), "", \
4722 RExC_lastparse=RExC_parse; \
4727 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
4728 DEBUG_PARSE_MSG((funcname)); \
4729 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
4731 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
4732 DEBUG_PARSE_MSG((funcname)); \
4733 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
4736 - reg - regular expression, i.e. main body or parenthesized thing
4738 * Caller must absorb opening parenthesis.
4740 * Combining parenthesis handling with the base level of regular expression
4741 * is a trifle forced, but the need to tie the tails of the branches to what
4742 * follows makes it hard to avoid.
4744 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
4746 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
4748 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
4751 /* this idea is borrowed from STR_WITH_LEN in handy.h */
4752 #define CHECK_WORD(s,v,l) \
4753 (((sizeof(s)-1)==(l)) && (strnEQ(start_verb, (s ""), (sizeof(s)-1))))
4756 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
4757 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
4760 register regnode *ret; /* Will be the head of the group. */
4761 register regnode *br;
4762 register regnode *lastbr;
4763 register regnode *ender = NULL;
4764 register I32 parno = 0;
4766 const I32 oregflags = RExC_flags;
4767 bool have_branch = 0;
4770 /* for (?g), (?gc), and (?o) warnings; warning
4771 about (?c) will warn about (?g) -- japhy */
4773 #define WASTED_O 0x01
4774 #define WASTED_G 0x02
4775 #define WASTED_C 0x04
4776 #define WASTED_GC (0x02|0x04)
4777 I32 wastedflags = 0x00;
4779 char * parse_start = RExC_parse; /* MJD */
4780 char * const oregcomp_parse = RExC_parse;
4782 GET_RE_DEBUG_FLAGS_DECL;
4783 DEBUG_PARSE("reg ");
4786 *flagp = 0; /* Tentatively. */
4789 /* Make an OPEN node, if parenthesized. */
4791 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
4792 char *start_verb = RExC_parse;
4793 STRLEN verb_len = 0;
4794 char *start_arg = NULL;
4795 unsigned char op = 0;
4797 int internal_argval = 0; /* internal_argval is only useful if !argok */
4798 while ( *RExC_parse && *RExC_parse != ')' ) {
4799 if ( *RExC_parse == ':' ) {
4800 start_arg = RExC_parse + 1;
4806 verb_len = RExC_parse - start_verb;
4809 while ( *RExC_parse && *RExC_parse != ')' )
4811 if ( *RExC_parse != ')' )
4812 vFAIL("Unterminated verb pattern argument");
4813 if ( RExC_parse == start_arg )
4816 if ( *RExC_parse != ')' )
4817 vFAIL("Unterminated verb pattern");
4820 switch ( *start_verb ) {
4821 case 'A': /* (*ACCEPT) */
4822 if ( CHECK_WORD("ACCEPT",start_verb,verb_len) ) {
4824 internal_argval = RExC_nestroot;
4827 case 'C': /* (*COMMIT) */
4828 if ( CHECK_WORD("COMMIT",start_verb,verb_len) )
4831 case 'F': /* (*FAIL) */
4832 if ( verb_len==1 || CHECK_WORD("FAIL",start_verb,verb_len) ) {
4837 case ':': /* (*:NAME) */
4838 case 'M': /* (*MARK:NAME) */
4839 if ( verb_len==0 || CHECK_WORD("MARK",start_verb,verb_len) ) {
4844 case 'P': /* (*PRUNE) */
4845 if ( CHECK_WORD("PRUNE",start_verb,verb_len) )
4848 case 'S': /* (*SKIP) */
4849 if ( CHECK_WORD("SKIP",start_verb,verb_len) )
4852 case 'T': /* (*THEN) */
4853 /* [19:06] <TimToady> :: is then */
4854 if ( CHECK_WORD("THEN",start_verb,verb_len) ) {
4856 RExC_seen |= REG_SEEN_CUTGROUP;
4862 vFAIL3("Unknown verb pattern '%.*s'",
4863 verb_len, start_verb);
4866 if ( start_arg && internal_argval ) {
4867 vFAIL3("Verb pattern '%.*s' may not have an argument",
4868 verb_len, start_verb);
4869 } else if ( argok < 0 && !start_arg ) {
4870 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
4871 verb_len, start_verb);
4873 ret = reganode(pRExC_state, op, internal_argval);
4874 if ( ! internal_argval && ! SIZE_ONLY ) {
4876 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
4877 ARG(ret) = add_data( pRExC_state, 1, "S" );
4878 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
4885 if (!internal_argval)
4886 RExC_seen |= REG_SEEN_VERBARG;
4887 } else if ( start_arg ) {
4888 vFAIL3("Verb pattern '%.*s' may not have an argument",
4889 verb_len, start_verb);
4891 ret = reg_node(pRExC_state, op);
4893 nextchar(pRExC_state);
4896 if (*RExC_parse == '?') { /* (?...) */
4897 U32 posflags = 0, negflags = 0;
4898 U32 *flagsp = &posflags;
4899 bool is_logical = 0;
4900 const char * const seqstart = RExC_parse;
4903 paren = *RExC_parse++;
4904 ret = NULL; /* For look-ahead/behind. */
4907 case '<': /* (?<...) */
4908 if (*RExC_parse == '!')
4910 else if (*RExC_parse != '=')
4915 case '\'': /* (?'...') */
4916 name_start= RExC_parse;
4917 svname = reg_scan_name(pRExC_state,
4918 SIZE_ONLY ? /* reverse test from the others */
4919 REG_RSN_RETURN_NAME :
4920 REG_RSN_RETURN_NULL);
4921 if (RExC_parse == name_start)
4923 if (*RExC_parse != paren)
4924 vFAIL2("Sequence (?%c... not terminated",
4925 paren=='>' ? '<' : paren);
4929 if (!svname) /* shouldnt happen */
4931 "panic: reg_scan_name returned NULL");
4932 if (!RExC_paren_names) {
4933 RExC_paren_names= newHV();
4934 sv_2mortal((SV*)RExC_paren_names);
4936 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
4938 sv_dat = HeVAL(he_str);
4940 /* croak baby croak */
4942 "panic: paren_name hash element allocation failed");
4943 } else if ( SvPOK(sv_dat) ) {
4944 IV count=SvIV(sv_dat);
4945 I32 *pv=(I32*)SvGROW(sv_dat,SvCUR(sv_dat)+sizeof(I32)+1);
4946 SvCUR_set(sv_dat,SvCUR(sv_dat)+sizeof(I32));
4947 pv[count]=RExC_npar;
4950 (void)SvUPGRADE(sv_dat,SVt_PVNV);
4951 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
4956 /*sv_dump(sv_dat);*/
4958 nextchar(pRExC_state);
4960 goto capturing_parens;
4962 RExC_seen |= REG_SEEN_LOOKBEHIND;
4964 case '=': /* (?=...) */
4965 case '!': /* (?!...) */
4966 RExC_seen_zerolen++;
4967 if (*RExC_parse == ')') {
4968 ret=reg_node(pRExC_state, OPFAIL);
4969 nextchar(pRExC_state);
4972 case ':': /* (?:...) */
4973 case '>': /* (?>...) */
4975 case '$': /* (?$...) */
4976 case '@': /* (?@...) */
4977 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
4979 case '#': /* (?#...) */
4980 while (*RExC_parse && *RExC_parse != ')')
4982 if (*RExC_parse != ')')
4983 FAIL("Sequence (?#... not terminated");
4984 nextchar(pRExC_state);
4987 case '0' : /* (?0) */
4988 case 'R' : /* (?R) */
4989 if (*RExC_parse != ')')
4990 FAIL("Sequence (?R) not terminated");
4991 ret = reg_node(pRExC_state, GOSTART);
4992 nextchar(pRExC_state);
4995 { /* named and numeric backreferences */
4998 case '&': /* (?&NAME) */
4999 parse_start = RExC_parse - 1;
5001 SV *sv_dat = reg_scan_name(pRExC_state,
5002 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5003 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5005 goto gen_recurse_regop;
5008 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5010 vFAIL("Illegal pattern");
5012 goto parse_recursion;
5014 case '-': /* (?-1) */
5015 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5016 RExC_parse--; /* rewind to let it be handled later */
5020 case '1': case '2': case '3': case '4': /* (?1) */
5021 case '5': case '6': case '7': case '8': case '9':
5024 num = atoi(RExC_parse);
5025 parse_start = RExC_parse - 1; /* MJD */
5026 if (*RExC_parse == '-')
5028 while (isDIGIT(*RExC_parse))
5030 if (*RExC_parse!=')')
5031 vFAIL("Expecting close bracket");
5034 if ( paren == '-' ) {
5036 Diagram of capture buffer numbering.
5037 Top line is the normal capture buffer numbers
5038 Botton line is the negative indexing as from
5042 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
5046 num = RExC_npar + num;
5049 vFAIL("Reference to nonexistent group");
5051 } else if ( paren == '+' ) {
5052 num = RExC_npar + num - 1;
5055 ret = reganode(pRExC_state, GOSUB, num);
5057 if (num > (I32)RExC_rx->nparens) {
5059 vFAIL("Reference to nonexistent group");
5061 ARG2L_SET( ret, RExC_recurse_count++);
5063 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5064 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
5068 RExC_seen |= REG_SEEN_RECURSE;
5069 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
5070 Set_Node_Offset(ret, parse_start); /* MJD */
5072 nextchar(pRExC_state);
5074 } /* named and numeric backreferences */
5077 case 'p': /* (?p...) */
5078 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
5079 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
5081 case '?': /* (??...) */
5083 if (*RExC_parse != '{')
5085 paren = *RExC_parse++;
5087 case '{': /* (?{...}) */
5092 char *s = RExC_parse;
5094 RExC_seen_zerolen++;
5095 RExC_seen |= REG_SEEN_EVAL;
5096 while (count && (c = *RExC_parse)) {
5107 if (*RExC_parse != ')') {
5109 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
5113 OP_4tree *sop, *rop;
5114 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
5117 Perl_save_re_context(aTHX);
5118 rop = sv_compile_2op(sv, &sop, "re", &pad);
5119 sop->op_private |= OPpREFCOUNTED;
5120 /* re_dup will OpREFCNT_inc */
5121 OpREFCNT_set(sop, 1);
5124 n = add_data(pRExC_state, 3, "nop");
5125 RExC_rxi->data->data[n] = (void*)rop;
5126 RExC_rxi->data->data[n+1] = (void*)sop;
5127 RExC_rxi->data->data[n+2] = (void*)pad;
5130 else { /* First pass */
5131 if (PL_reginterp_cnt < ++RExC_seen_evals
5133 /* No compiled RE interpolated, has runtime
5134 components ===> unsafe. */
5135 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5136 if (PL_tainting && PL_tainted)
5137 FAIL("Eval-group in insecure regular expression");
5138 #if PERL_VERSION > 8
5139 if (IN_PERL_COMPILETIME)
5144 nextchar(pRExC_state);
5146 ret = reg_node(pRExC_state, LOGICAL);
5149 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
5150 /* deal with the length of this later - MJD */
5153 ret = reganode(pRExC_state, EVAL, n);
5154 Set_Node_Length(ret, RExC_parse - parse_start + 1);
5155 Set_Node_Offset(ret, parse_start);
5158 case '(': /* (?(?{...})...) and (?(?=...)...) */
5161 if (RExC_parse[0] == '?') { /* (?(?...)) */
5162 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
5163 || RExC_parse[1] == '<'
5164 || RExC_parse[1] == '{') { /* Lookahead or eval. */
5167 ret = reg_node(pRExC_state, LOGICAL);
5170 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
5174 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
5175 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
5177 char ch = RExC_parse[0] == '<' ? '>' : '\'';
5178 char *name_start= RExC_parse++;
5180 SV *sv_dat=reg_scan_name(pRExC_state,
5181 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5182 if (RExC_parse == name_start || *RExC_parse != ch)
5183 vFAIL2("Sequence (?(%c... not terminated",
5184 (ch == '>' ? '<' : ch));
5187 num = add_data( pRExC_state, 1, "S" );
5188 RExC_rxi->data->data[num]=(void*)sv_dat;
5189 SvREFCNT_inc(sv_dat);
5191 ret = reganode(pRExC_state,NGROUPP,num);
5192 goto insert_if_check_paren;
5194 else if (RExC_parse[0] == 'D' &&
5195 RExC_parse[1] == 'E' &&
5196 RExC_parse[2] == 'F' &&
5197 RExC_parse[3] == 'I' &&
5198 RExC_parse[4] == 'N' &&
5199 RExC_parse[5] == 'E')
5201 ret = reganode(pRExC_state,DEFINEP,0);
5204 goto insert_if_check_paren;
5206 else if (RExC_parse[0] == 'R') {
5209 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5210 parno = atoi(RExC_parse++);
5211 while (isDIGIT(*RExC_parse))
5213 } else if (RExC_parse[0] == '&') {
5216 sv_dat = reg_scan_name(pRExC_state,
5217 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5218 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5220 ret = reganode(pRExC_state,INSUBP,parno);
5221 goto insert_if_check_paren;
5223 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5226 parno = atoi(RExC_parse++);
5228 while (isDIGIT(*RExC_parse))
5230 ret = reganode(pRExC_state, GROUPP, parno);
5232 insert_if_check_paren:
5233 if ((c = *nextchar(pRExC_state)) != ')')
5234 vFAIL("Switch condition not recognized");
5236 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
5237 br = regbranch(pRExC_state, &flags, 1,depth+1);
5239 br = reganode(pRExC_state, LONGJMP, 0);
5241 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
5242 c = *nextchar(pRExC_state);
5247 vFAIL("(?(DEFINE)....) does not allow branches");
5248 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
5249 regbranch(pRExC_state, &flags, 1,depth+1);
5250 REGTAIL(pRExC_state, ret, lastbr);
5253 c = *nextchar(pRExC_state);
5258 vFAIL("Switch (?(condition)... contains too many branches");
5259 ender = reg_node(pRExC_state, TAIL);
5260 REGTAIL(pRExC_state, br, ender);
5262 REGTAIL(pRExC_state, lastbr, ender);
5263 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
5266 REGTAIL(pRExC_state, ret, ender);
5270 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
5274 RExC_parse--; /* for vFAIL to print correctly */
5275 vFAIL("Sequence (? incomplete");
5279 parse_flags: /* (?i) */
5280 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
5281 /* (?g), (?gc) and (?o) are useless here
5282 and must be globally applied -- japhy */
5284 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
5285 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5286 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
5287 if (! (wastedflags & wflagbit) ) {
5288 wastedflags |= wflagbit;
5291 "Useless (%s%c) - %suse /%c modifier",
5292 flagsp == &negflags ? "?-" : "?",
5294 flagsp == &negflags ? "don't " : "",
5300 else if (*RExC_parse == 'c') {
5301 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5302 if (! (wastedflags & WASTED_C) ) {
5303 wastedflags |= WASTED_GC;
5306 "Useless (%sc) - %suse /gc modifier",
5307 flagsp == &negflags ? "?-" : "?",
5308 flagsp == &negflags ? "don't " : ""
5313 else { pmflag(flagsp, *RExC_parse); }
5317 if (*RExC_parse == '-') {
5319 wastedflags = 0; /* reset so (?g-c) warns twice */
5323 RExC_flags |= posflags;
5324 RExC_flags &= ~negflags;
5325 if (*RExC_parse == ':') {
5331 if (*RExC_parse != ')') {
5333 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5335 nextchar(pRExC_state);
5345 ret = reganode(pRExC_state, OPEN, parno);
5348 RExC_nestroot = parno;
5349 if (RExC_seen & REG_SEEN_RECURSE) {
5350 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5351 "Setting open paren #%"IVdf" to %d\n",
5352 (IV)parno, REG_NODE_NUM(ret)));
5353 RExC_open_parens[parno-1]= ret;
5356 Set_Node_Length(ret, 1); /* MJD */
5357 Set_Node_Offset(ret, RExC_parse); /* MJD */
5364 /* Pick up the branches, linking them together. */
5365 parse_start = RExC_parse; /* MJD */
5366 br = regbranch(pRExC_state, &flags, 1,depth+1);
5367 /* branch_len = (paren != 0); */
5371 if (*RExC_parse == '|') {
5372 if (!SIZE_ONLY && RExC_extralen) {
5373 reginsert(pRExC_state, BRANCHJ, br, depth+1);
5376 reginsert(pRExC_state, BRANCH, br, depth+1);
5377 Set_Node_Length(br, paren != 0);
5378 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
5382 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
5384 else if (paren == ':') {
5385 *flagp |= flags&SIMPLE;
5387 if (is_open) { /* Starts with OPEN. */
5388 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
5390 else if (paren != '?') /* Not Conditional */
5392 *flagp |= flags & (SPSTART | HASWIDTH);
5394 while (*RExC_parse == '|') {
5395 if (!SIZE_ONLY && RExC_extralen) {
5396 ender = reganode(pRExC_state, LONGJMP,0);
5397 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
5400 RExC_extralen += 2; /* Account for LONGJMP. */
5401 nextchar(pRExC_state);
5402 br = regbranch(pRExC_state, &flags, 0, depth+1);
5406 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
5410 *flagp |= flags&SPSTART;
5413 if (have_branch || paren != ':') {
5414 /* Make a closing node, and hook it on the end. */
5417 ender = reg_node(pRExC_state, TAIL);
5421 ender = reganode(pRExC_state, CLOSE, parno);
5422 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
5423 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5424 "Setting close paren #%"IVdf" to %d\n",
5425 (IV)parno, REG_NODE_NUM(ender)));
5426 RExC_close_parens[parno-1]= ender;
5427 if (RExC_nestroot == parno)
5430 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
5431 Set_Node_Length(ender,1); /* MJD */
5437 *flagp &= ~HASWIDTH;
5440 ender = reg_node(pRExC_state, SUCCEED);
5443 ender = reg_node(pRExC_state, END);
5445 assert(!RExC_opend); /* there can only be one! */
5450 REGTAIL(pRExC_state, lastbr, ender);
5452 if (have_branch && !SIZE_ONLY) {
5454 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
5456 /* Hook the tails of the branches to the closing node. */
5457 for (br = ret; br; br = regnext(br)) {
5458 const U8 op = PL_regkind[OP(br)];
5460 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
5462 else if (op == BRANCHJ) {
5463 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
5471 static const char parens[] = "=!<,>";
5473 if (paren && (p = strchr(parens, paren))) {
5474 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
5475 int flag = (p - parens) > 1;
5478 node = SUSPEND, flag = 0;
5479 reginsert(pRExC_state, node,ret, depth+1);
5480 Set_Node_Cur_Length(ret);
5481 Set_Node_Offset(ret, parse_start + 1);
5483 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
5487 /* Check for proper termination. */
5489 RExC_flags = oregflags;
5490 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
5491 RExC_parse = oregcomp_parse;
5492 vFAIL("Unmatched (");
5495 else if (!paren && RExC_parse < RExC_end) {
5496 if (*RExC_parse == ')') {
5498 vFAIL("Unmatched )");
5501 FAIL("Junk on end of regexp"); /* "Can't happen". */
5509 - regbranch - one alternative of an | operator
5511 * Implements the concatenation operator.
5514 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
5517 register regnode *ret;
5518 register regnode *chain = NULL;
5519 register regnode *latest;
5520 I32 flags = 0, c = 0;
5521 GET_RE_DEBUG_FLAGS_DECL;
5522 DEBUG_PARSE("brnc");
5526 if (!SIZE_ONLY && RExC_extralen)
5527 ret = reganode(pRExC_state, BRANCHJ,0);
5529 ret = reg_node(pRExC_state, BRANCH);
5530 Set_Node_Length(ret, 1);
5534 if (!first && SIZE_ONLY)
5535 RExC_extralen += 1; /* BRANCHJ */
5537 *flagp = WORST; /* Tentatively. */
5540 nextchar(pRExC_state);
5541 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
5543 latest = regpiece(pRExC_state, &flags,depth+1);
5544 if (latest == NULL) {
5545 if (flags & TRYAGAIN)
5549 else if (ret == NULL)
5551 *flagp |= flags&HASWIDTH;
5552 if (chain == NULL) /* First piece. */
5553 *flagp |= flags&SPSTART;
5556 REGTAIL(pRExC_state, chain, latest);
5561 if (chain == NULL) { /* Loop ran zero times. */
5562 chain = reg_node(pRExC_state, NOTHING);
5567 *flagp |= flags&SIMPLE;
5574 - regpiece - something followed by possible [*+?]
5576 * Note that the branching code sequences used for ? and the general cases
5577 * of * and + are somewhat optimized: they use the same NOTHING node as
5578 * both the endmarker for their branch list and the body of the last branch.
5579 * It might seem that this node could be dispensed with entirely, but the
5580 * endmarker role is not redundant.
5583 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
5586 register regnode *ret;
5588 register char *next;
5590 const char * const origparse = RExC_parse;
5592 I32 max = REG_INFTY;
5594 const char *maxpos = NULL;
5595 GET_RE_DEBUG_FLAGS_DECL;
5596 DEBUG_PARSE("piec");
5598 ret = regatom(pRExC_state, &flags,depth+1);
5600 if (flags & TRYAGAIN)
5607 if (op == '{' && regcurly(RExC_parse)) {
5609 parse_start = RExC_parse; /* MJD */
5610 next = RExC_parse + 1;
5611 while (isDIGIT(*next) || *next == ',') {
5620 if (*next == '}') { /* got one */
5624 min = atoi(RExC_parse);
5628 maxpos = RExC_parse;
5630 if (!max && *maxpos != '0')
5631 max = REG_INFTY; /* meaning "infinity" */
5632 else if (max >= REG_INFTY)
5633 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
5635 nextchar(pRExC_state);
5638 if ((flags&SIMPLE)) {
5639 RExC_naughty += 2 + RExC_naughty / 2;
5640 reginsert(pRExC_state, CURLY, ret, depth+1);
5641 Set_Node_Offset(ret, parse_start+1); /* MJD */
5642 Set_Node_Cur_Length(ret);
5645 regnode * const w = reg_node(pRExC_state, WHILEM);
5648 REGTAIL(pRExC_state, ret, w);
5649 if (!SIZE_ONLY && RExC_extralen) {
5650 reginsert(pRExC_state, LONGJMP,ret, depth+1);
5651 reginsert(pRExC_state, NOTHING,ret, depth+1);
5652 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
5654 reginsert(pRExC_state, CURLYX,ret, depth+1);
5656 Set_Node_Offset(ret, parse_start+1);
5657 Set_Node_Length(ret,
5658 op == '{' ? (RExC_parse - parse_start) : 1);
5660 if (!SIZE_ONLY && RExC_extralen)
5661 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
5662 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
5664 RExC_whilem_seen++, RExC_extralen += 3;
5665 RExC_naughty += 4 + RExC_naughty; /* compound interest */
5673 if (max && max < min)
5674 vFAIL("Can't do {n,m} with n > m");
5676 ARG1_SET(ret, (U16)min);
5677 ARG2_SET(ret, (U16)max);
5689 #if 0 /* Now runtime fix should be reliable. */
5691 /* if this is reinstated, don't forget to put this back into perldiag:
5693 =item Regexp *+ operand could be empty at {#} in regex m/%s/
5695 (F) The part of the regexp subject to either the * or + quantifier
5696 could match an empty string. The {#} shows in the regular
5697 expression about where the problem was discovered.
5701 if (!(flags&HASWIDTH) && op != '?')
5702 vFAIL("Regexp *+ operand could be empty");
5705 parse_start = RExC_parse;
5706 nextchar(pRExC_state);
5708 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
5710 if (op == '*' && (flags&SIMPLE)) {
5711 reginsert(pRExC_state, STAR, ret, depth+1);
5715 else if (op == '*') {
5719 else if (op == '+' && (flags&SIMPLE)) {
5720 reginsert(pRExC_state, PLUS, ret, depth+1);
5724 else if (op == '+') {
5728 else if (op == '?') {
5733 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
5735 "%.*s matches null string many times",
5736 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
5740 if (RExC_parse < RExC_end && *RExC_parse == '?') {
5741 nextchar(pRExC_state);
5742 reginsert(pRExC_state, MINMOD, ret, depth+1);
5743 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
5745 #ifndef REG_ALLOW_MINMOD_SUSPEND
5748 if (RExC_parse < RExC_end && *RExC_parse == '+') {
5750 nextchar(pRExC_state);
5751 ender = reg_node(pRExC_state, SUCCEED);
5752 REGTAIL(pRExC_state, ret, ender);
5753 reginsert(pRExC_state, SUSPEND, ret, depth+1);
5755 ender = reg_node(pRExC_state, TAIL);
5756 REGTAIL(pRExC_state, ret, ender);
5760 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
5762 vFAIL("Nested quantifiers");
5769 /* reg_namedseq(pRExC_state,UVp)
5771 This is expected to be called by a parser routine that has
5772 recognized'\N' and needs to handle the rest. RExC_parse is
5773 expected to point at the first char following the N at the time
5776 If valuep is non-null then it is assumed that we are parsing inside
5777 of a charclass definition and the first codepoint in the resolved
5778 string is returned via *valuep and the routine will return NULL.
5779 In this mode if a multichar string is returned from the charnames
5780 handler a warning will be issued, and only the first char in the
5781 sequence will be examined. If the string returned is zero length
5782 then the value of *valuep is undefined and NON-NULL will
5783 be returned to indicate failure. (This will NOT be a valid pointer
5786 If value is null then it is assumed that we are parsing normal text
5787 and inserts a new EXACT node into the program containing the resolved
5788 string and returns a pointer to the new node. If the string is
5789 zerolength a NOTHING node is emitted.
5791 On success RExC_parse is set to the char following the endbrace.
5792 Parsing failures will generate a fatal errorvia vFAIL(...)
5794 NOTE: We cache all results from the charnames handler locally in
5795 the RExC_charnames hash (created on first use) to prevent a charnames
5796 handler from playing silly-buggers and returning a short string and
5797 then a long string for a given pattern. Since the regexp program
5798 size is calculated during an initial parse this would result
5799 in a buffer overrun so we cache to prevent the charname result from
5800 changing during the course of the parse.
5804 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep)
5806 char * name; /* start of the content of the name */
5807 char * endbrace; /* endbrace following the name */
5810 STRLEN len; /* this has various purposes throughout the code */
5811 bool cached = 0; /* if this is true then we shouldn't refcount dev sv_str */
5812 regnode *ret = NULL;
5814 if (*RExC_parse != '{') {
5815 vFAIL("Missing braces on \\N{}");
5817 name = RExC_parse+1;
5818 endbrace = strchr(RExC_parse, '}');
5821 vFAIL("Missing right brace on \\N{}");
5823 RExC_parse = endbrace + 1;
5826 /* RExC_parse points at the beginning brace,
5827 endbrace points at the last */
5828 if ( name[0]=='U' && name[1]=='+' ) {
5829 /* its a "unicode hex" notation {U+89AB} */
5830 I32 fl = PERL_SCAN_ALLOW_UNDERSCORES
5831 | PERL_SCAN_DISALLOW_PREFIX
5832 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
5834 len = (STRLEN)(endbrace - name - 2);
5835 cp = grok_hex(name + 2, &len, &fl, NULL);
5836 if ( len != (STRLEN)(endbrace - name - 2) ) {
5845 sv_str= Perl_newSVpvf_nocontext("%c",(int)cp);
5847 /* fetch the charnames handler for this scope */
5848 HV * const table = GvHV(PL_hintgv);
5850 hv_fetchs(table, "charnames", FALSE) :
5852 SV *cv= cvp ? *cvp : NULL;
5855 /* create an SV with the name as argument */
5856 sv_name = newSVpvn(name, endbrace - name);
5858 if (!table || !(PL_hints & HINT_LOCALIZE_HH)) {
5859 vFAIL2("Constant(\\N{%s}) unknown: "
5860 "(possibly a missing \"use charnames ...\")",
5863 if (!cvp || !SvOK(*cvp)) { /* when $^H{charnames} = undef; */
5864 vFAIL2("Constant(\\N{%s}): "
5865 "$^H{charnames} is not defined",SvPVX(sv_name));
5870 if (!RExC_charnames) {
5871 /* make sure our cache is allocated */
5872 RExC_charnames = newHV();
5873 sv_2mortal((SV*)RExC_charnames);
5875 /* see if we have looked this one up before */
5876 he_str = hv_fetch_ent( RExC_charnames, sv_name, 0, 0 );
5878 sv_str = HeVAL(he_str);
5891 count= call_sv(cv, G_SCALAR);
5893 if (count == 1) { /* XXXX is this right? dmq */
5895 SvREFCNT_inc_simple_void(sv_str);
5903 if ( !sv_str || !SvOK(sv_str) ) {
5904 vFAIL2("Constant(\\N{%s}): Call to &{$^H{charnames}} "
5905 "did not return a defined value",SvPVX(sv_name));
5907 if (hv_store_ent( RExC_charnames, sv_name, sv_str, 0))
5912 char *p = SvPV(sv_str, len);
5915 if ( SvUTF8(sv_str) ) {
5916 *valuep = utf8_to_uvchr((U8*)p, &numlen);
5920 We have to turn on utf8 for high bit chars otherwise
5921 we get failures with
5923 "ss" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
5924 "SS" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
5926 This is different from what \x{} would do with the same
5927 codepoint, where the condition is > 0xFF.
5934 /* warn if we havent used the whole string? */
5936 if (numlen<len && SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5938 "Ignoring excess chars from \\N{%s} in character class",
5942 } else if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5944 "Ignoring zero length \\N{%s} in character class",
5949 SvREFCNT_dec(sv_name);
5951 SvREFCNT_dec(sv_str);
5952 return len ? NULL : (regnode *)&len;
5953 } else if(SvCUR(sv_str)) {
5958 char * parse_start = name-3; /* needed for the offsets */
5959 GET_RE_DEBUG_FLAGS_DECL; /* needed for the offsets */
5961 ret = reg_node(pRExC_state,
5962 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
5965 if ( RExC_utf8 && !SvUTF8(sv_str) ) {
5966 sv_utf8_upgrade(sv_str);
5967 } else if ( !RExC_utf8 && SvUTF8(sv_str) ) {
5971 p = SvPV(sv_str, len);
5973 /* len is the length written, charlen is the size the char read */
5974 for ( len = 0; p < pend; p += charlen ) {
5976 UV uvc = utf8_to_uvchr((U8*)p, &charlen);
5978 STRLEN foldlen,numlen;
5979 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
5980 uvc = toFOLD_uni(uvc, tmpbuf, &foldlen);
5981 /* Emit all the Unicode characters. */
5983 for (foldbuf = tmpbuf;
5987 uvc = utf8_to_uvchr(foldbuf, &numlen);
5989 const STRLEN unilen = reguni(pRExC_state, uvc, s);
5992 /* In EBCDIC the numlen
5993 * and unilen can differ. */
5995 if (numlen >= foldlen)
5999 break; /* "Can't happen." */
6002 const STRLEN unilen = reguni(pRExC_state, uvc, s);
6014 RExC_size += STR_SZ(len);
6017 RExC_emit += STR_SZ(len);
6019 Set_Node_Cur_Length(ret); /* MJD */
6021 nextchar(pRExC_state);
6023 ret = reg_node(pRExC_state,NOTHING);
6026 SvREFCNT_dec(sv_str);
6029 SvREFCNT_dec(sv_name);
6039 * It returns the code point in utf8 for the value in *encp.
6040 * value: a code value in the source encoding
6041 * encp: a pointer to an Encode object
6043 * If the result from Encode is not a single character,
6044 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
6047 S_reg_recode(pTHX_ const char value, SV **encp)
6050 SV * const sv = sv_2mortal(newSVpvn(&value, numlen));
6051 const char * const s = encp && *encp ? sv_recode_to_utf8(sv, *encp)
6053 const STRLEN newlen = SvCUR(sv);
6054 UV uv = UNICODE_REPLACEMENT;
6058 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
6061 if (!newlen || numlen != newlen) {
6062 uv = UNICODE_REPLACEMENT;
6071 - regatom - the lowest level
6073 * Optimization: gobbles an entire sequence of ordinary characters so that
6074 * it can turn them into a single node, which is smaller to store and
6075 * faster to run. Backslashed characters are exceptions, each becoming a
6076 * separate node; the code is simpler that way and it's not worth fixing.
6078 * [Yes, it is worth fixing, some scripts can run twice the speed.]
6079 * [It looks like its ok, as in S_study_chunk we merge adjacent EXACT nodes]
6082 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6085 register regnode *ret = NULL;
6087 char *parse_start = RExC_parse;
6088 GET_RE_DEBUG_FLAGS_DECL;
6089 DEBUG_PARSE("atom");
6090 *flagp = WORST; /* Tentatively. */
6093 switch (*RExC_parse) {
6095 RExC_seen_zerolen++;
6096 nextchar(pRExC_state);
6097 if (RExC_flags & RXf_PMf_MULTILINE)
6098 ret = reg_node(pRExC_state, MBOL);
6099 else if (RExC_flags & RXf_PMf_SINGLELINE)
6100 ret = reg_node(pRExC_state, SBOL);
6102 ret = reg_node(pRExC_state, BOL);
6103 Set_Node_Length(ret, 1); /* MJD */
6106 nextchar(pRExC_state);
6108 RExC_seen_zerolen++;
6109 if (RExC_flags & RXf_PMf_MULTILINE)
6110 ret = reg_node(pRExC_state, MEOL);
6111 else if (RExC_flags & RXf_PMf_SINGLELINE)
6112 ret = reg_node(pRExC_state, SEOL);
6114 ret = reg_node(pRExC_state, EOL);
6115 Set_Node_Length(ret, 1); /* MJD */
6118 nextchar(pRExC_state);
6119 if (RExC_flags & RXf_PMf_SINGLELINE)
6120 ret = reg_node(pRExC_state, SANY);
6122 ret = reg_node(pRExC_state, REG_ANY);
6123 *flagp |= HASWIDTH|SIMPLE;
6125 Set_Node_Length(ret, 1); /* MJD */
6129 char * const oregcomp_parse = ++RExC_parse;
6130 ret = regclass(pRExC_state,depth+1);
6131 if (*RExC_parse != ']') {
6132 RExC_parse = oregcomp_parse;
6133 vFAIL("Unmatched [");
6135 nextchar(pRExC_state);
6136 *flagp |= HASWIDTH|SIMPLE;
6137 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
6141 nextchar(pRExC_state);
6142 ret = reg(pRExC_state, 1, &flags,depth+1);
6144 if (flags & TRYAGAIN) {
6145 if (RExC_parse == RExC_end) {
6146 /* Make parent create an empty node if needed. */
6154 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
6158 if (flags & TRYAGAIN) {
6162 vFAIL("Internal urp");
6163 /* Supposed to be caught earlier. */
6166 if (!regcurly(RExC_parse)) {
6175 vFAIL("Quantifier follows nothing");
6178 switch (*++RExC_parse) {
6180 RExC_seen_zerolen++;
6181 ret = reg_node(pRExC_state, SBOL);
6183 nextchar(pRExC_state);
6184 Set_Node_Length(ret, 2); /* MJD */
6187 ret = reg_node(pRExC_state, GPOS);
6188 RExC_seen |= REG_SEEN_GPOS;
6190 nextchar(pRExC_state);
6191 Set_Node_Length(ret, 2); /* MJD */
6194 ret = reg_node(pRExC_state, SEOL);
6196 RExC_seen_zerolen++; /* Do not optimize RE away */
6197 nextchar(pRExC_state);
6200 ret = reg_node(pRExC_state, EOS);
6202 RExC_seen_zerolen++; /* Do not optimize RE away */
6203 nextchar(pRExC_state);
6204 Set_Node_Length(ret, 2); /* MJD */
6207 ret = reg_node(pRExC_state, CANY);
6208 RExC_seen |= REG_SEEN_CANY;
6209 *flagp |= HASWIDTH|SIMPLE;
6210 nextchar(pRExC_state);
6211 Set_Node_Length(ret, 2); /* MJD */
6214 ret = reg_node(pRExC_state, CLUMP);
6216 nextchar(pRExC_state);
6217 Set_Node_Length(ret, 2); /* MJD */
6220 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
6221 *flagp |= HASWIDTH|SIMPLE;
6222 nextchar(pRExC_state);
6223 Set_Node_Length(ret, 2); /* MJD */
6226 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
6227 *flagp |= HASWIDTH|SIMPLE;
6228 nextchar(pRExC_state);
6229 Set_Node_Length(ret, 2); /* MJD */
6232 RExC_seen_zerolen++;
6233 RExC_seen |= REG_SEEN_LOOKBEHIND;
6234 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
6236 nextchar(pRExC_state);
6237 Set_Node_Length(ret, 2); /* MJD */
6240 RExC_seen_zerolen++;
6241 RExC_seen |= REG_SEEN_LOOKBEHIND;
6242 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
6244 nextchar(pRExC_state);
6245 Set_Node_Length(ret, 2); /* MJD */
6248 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
6249 *flagp |= HASWIDTH|SIMPLE;
6250 nextchar(pRExC_state);
6251 Set_Node_Length(ret, 2); /* MJD */
6254 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
6255 *flagp |= HASWIDTH|SIMPLE;
6256 nextchar(pRExC_state);
6257 Set_Node_Length(ret, 2); /* MJD */
6260 ret = reg_node(pRExC_state, DIGIT);
6261 *flagp |= HASWIDTH|SIMPLE;
6262 nextchar(pRExC_state);
6263 Set_Node_Length(ret, 2); /* MJD */
6266 ret = reg_node(pRExC_state, NDIGIT);
6267 *flagp |= HASWIDTH|SIMPLE;
6268 nextchar(pRExC_state);
6269 Set_Node_Length(ret, 2); /* MJD */
6274 char* const oldregxend = RExC_end;
6275 char* parse_start = RExC_parse - 2;
6277 if (RExC_parse[1] == '{') {
6278 /* a lovely hack--pretend we saw [\pX] instead */
6279 RExC_end = strchr(RExC_parse, '}');
6281 const U8 c = (U8)*RExC_parse;
6283 RExC_end = oldregxend;
6284 vFAIL2("Missing right brace on \\%c{}", c);
6289 RExC_end = RExC_parse + 2;
6290 if (RExC_end > oldregxend)
6291 RExC_end = oldregxend;
6295 ret = regclass(pRExC_state,depth+1);
6297 RExC_end = oldregxend;
6300 Set_Node_Offset(ret, parse_start + 2);
6301 Set_Node_Cur_Length(ret);
6302 nextchar(pRExC_state);
6303 *flagp |= HASWIDTH|SIMPLE;
6307 /* Handle \N{NAME} here and not below because it can be
6308 multicharacter. join_exact() will join them up later on.
6309 Also this makes sure that things like /\N{BLAH}+/ and
6310 \N{BLAH} being multi char Just Happen. dmq*/
6312 ret= reg_namedseq(pRExC_state, NULL);
6314 case 'k': /* Handle \k<NAME> and \k'NAME' */
6316 char ch= RExC_parse[1];
6317 if (ch != '<' && ch != '\'') {
6319 vWARN( RExC_parse + 1,
6320 "Possible broken named back reference treated as literal k");
6324 char* name_start = (RExC_parse += 2);
6326 SV *sv_dat = reg_scan_name(pRExC_state,
6327 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6328 ch= (ch == '<') ? '>' : '\'';
6330 if (RExC_parse == name_start || *RExC_parse != ch)
6331 vFAIL2("Sequence \\k%c... not terminated",
6332 (ch == '>' ? '<' : ch));
6335 ret = reganode(pRExC_state,
6336 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
6342 num = add_data( pRExC_state, 1, "S" );
6344 RExC_rxi->data->data[num]=(void*)sv_dat;
6345 SvREFCNT_inc(sv_dat);
6347 /* override incorrect value set in reganode MJD */
6348 Set_Node_Offset(ret, parse_start+1);
6349 Set_Node_Cur_Length(ret); /* MJD */
6350 nextchar(pRExC_state);
6366 case '1': case '2': case '3': case '4':
6367 case '5': case '6': case '7': case '8': case '9':
6370 bool isrel=(*RExC_parse=='R');
6373 num = atoi(RExC_parse);
6375 num = RExC_cpar - num;
6377 vFAIL("Reference to nonexistent or unclosed group");
6379 if (num > 9 && num >= RExC_npar)
6382 char * const parse_start = RExC_parse - 1; /* MJD */
6383 while (isDIGIT(*RExC_parse))
6387 if (num > (I32)RExC_rx->nparens)
6388 vFAIL("Reference to nonexistent group");
6389 /* People make this error all the time apparently.
6390 So we cant fail on it, even though we should
6392 else if (num >= RExC_cpar)
6393 vFAIL("Reference to unclosed group will always match");
6397 ret = reganode(pRExC_state,
6398 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
6402 /* override incorrect value set in reganode MJD */
6403 Set_Node_Offset(ret, parse_start+1);
6404 Set_Node_Cur_Length(ret); /* MJD */
6406 nextchar(pRExC_state);
6411 if (RExC_parse >= RExC_end)
6412 FAIL("Trailing \\");
6415 /* Do not generate "unrecognized" warnings here, we fall
6416 back into the quick-grab loop below */
6423 if (RExC_flags & RXf_PMf_EXTENDED) {
6424 while (RExC_parse < RExC_end && *RExC_parse != '\n')
6426 if (RExC_parse < RExC_end)
6432 register STRLEN len;
6437 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
6439 parse_start = RExC_parse - 1;
6445 ret = reg_node(pRExC_state,
6446 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
6448 for (len = 0, p = RExC_parse - 1;
6449 len < 127 && p < RExC_end;
6452 char * const oldp = p;
6454 if (RExC_flags & RXf_PMf_EXTENDED)
6455 p = regwhite(p, RExC_end);
6504 ender = ASCII_TO_NATIVE('\033');
6508 ender = ASCII_TO_NATIVE('\007');
6513 char* const e = strchr(p, '}');
6517 vFAIL("Missing right brace on \\x{}");
6520 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6521 | PERL_SCAN_DISALLOW_PREFIX;
6522 STRLEN numlen = e - p - 1;
6523 ender = grok_hex(p + 1, &numlen, &flags, NULL);
6530 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
6532 ender = grok_hex(p, &numlen, &flags, NULL);
6535 if (PL_encoding && ender < 0x100)
6536 goto recode_encoding;
6540 ender = UCHARAT(p++);
6541 ender = toCTRL(ender);
6543 case '0': case '1': case '2': case '3':case '4':
6544 case '5': case '6': case '7': case '8':case '9':
6546 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
6549 ender = grok_oct(p, &numlen, &flags, NULL);
6556 if (PL_encoding && ender < 0x100)
6557 goto recode_encoding;
6561 SV* enc = PL_encoding;
6562 ender = reg_recode((const char)(U8)ender, &enc);
6563 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
6564 vWARN(p, "Invalid escape in the specified encoding");
6570 FAIL("Trailing \\");
6573 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
6574 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
6575 goto normal_default;
6580 if (UTF8_IS_START(*p) && UTF) {
6582 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
6583 &numlen, UTF8_ALLOW_DEFAULT);
6590 if (RExC_flags & RXf_PMf_EXTENDED)
6591 p = regwhite(p, RExC_end);
6593 /* Prime the casefolded buffer. */
6594 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
6596 if (ISMULT2(p)) { /* Back off on ?+*. */
6601 /* Emit all the Unicode characters. */
6603 for (foldbuf = tmpbuf;
6605 foldlen -= numlen) {
6606 ender = utf8_to_uvchr(foldbuf, &numlen);
6608 const STRLEN unilen = reguni(pRExC_state, ender, s);
6611 /* In EBCDIC the numlen
6612 * and unilen can differ. */
6614 if (numlen >= foldlen)
6618 break; /* "Can't happen." */
6622 const STRLEN unilen = reguni(pRExC_state, ender, s);
6631 REGC((char)ender, s++);
6637 /* Emit all the Unicode characters. */
6639 for (foldbuf = tmpbuf;
6641 foldlen -= numlen) {
6642 ender = utf8_to_uvchr(foldbuf, &numlen);
6644 const STRLEN unilen = reguni(pRExC_state, ender, s);
6647 /* In EBCDIC the numlen
6648 * and unilen can differ. */
6650 if (numlen >= foldlen)
6658 const STRLEN unilen = reguni(pRExC_state, ender, s);
6667 REGC((char)ender, s++);
6671 Set_Node_Cur_Length(ret); /* MJD */
6672 nextchar(pRExC_state);
6674 /* len is STRLEN which is unsigned, need to copy to signed */
6677 vFAIL("Internal disaster");
6681 if (len == 1 && UNI_IS_INVARIANT(ender))
6685 RExC_size += STR_SZ(len);
6688 RExC_emit += STR_SZ(len);
6698 S_regwhite(char *p, const char *e)
6703 else if (*p == '#') {
6706 } while (p < e && *p != '\n');
6714 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
6715 Character classes ([:foo:]) can also be negated ([:^foo:]).
6716 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
6717 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
6718 but trigger failures because they are currently unimplemented. */
6720 #define POSIXCC_DONE(c) ((c) == ':')
6721 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
6722 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
6725 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
6728 I32 namedclass = OOB_NAMEDCLASS;
6730 if (value == '[' && RExC_parse + 1 < RExC_end &&
6731 /* I smell either [: or [= or [. -- POSIX has been here, right? */
6732 POSIXCC(UCHARAT(RExC_parse))) {
6733 const char c = UCHARAT(RExC_parse);
6734 char* const s = RExC_parse++;
6736 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
6738 if (RExC_parse == RExC_end)
6739 /* Grandfather lone [:, [=, [. */
6742 const char* const t = RExC_parse++; /* skip over the c */
6745 if (UCHARAT(RExC_parse) == ']') {
6746 const char *posixcc = s + 1;
6747 RExC_parse++; /* skip over the ending ] */
6750 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
6751 const I32 skip = t - posixcc;
6753 /* Initially switch on the length of the name. */
6756 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
6757 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
6760 /* Names all of length 5. */
6761 /* alnum alpha ascii blank cntrl digit graph lower
6762 print punct space upper */
6763 /* Offset 4 gives the best switch position. */
6764 switch (posixcc[4]) {
6766 if (memEQ(posixcc, "alph", 4)) /* alpha */
6767 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
6770 if (memEQ(posixcc, "spac", 4)) /* space */
6771 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
6774 if (memEQ(posixcc, "grap", 4)) /* graph */
6775 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
6778 if (memEQ(posixcc, "asci", 4)) /* ascii */
6779 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
6782 if (memEQ(posixcc, "blan", 4)) /* blank */
6783 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
6786 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
6787 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
6790 if (memEQ(posixcc, "alnu", 4)) /* alnum */
6791 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
6794 if (memEQ(posixcc, "lowe", 4)) /* lower */
6795 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
6796 else if (memEQ(posixcc, "uppe", 4)) /* upper */
6797 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
6800 if (memEQ(posixcc, "digi", 4)) /* digit */
6801 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
6802 else if (memEQ(posixcc, "prin", 4)) /* print */
6803 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
6804 else if (memEQ(posixcc, "punc", 4)) /* punct */
6805 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
6810 if (memEQ(posixcc, "xdigit", 6))
6811 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
6815 if (namedclass == OOB_NAMEDCLASS)
6816 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
6818 assert (posixcc[skip] == ':');
6819 assert (posixcc[skip+1] == ']');
6820 } else if (!SIZE_ONLY) {
6821 /* [[=foo=]] and [[.foo.]] are still future. */
6823 /* adjust RExC_parse so the warning shows after
6825 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
6827 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
6830 /* Maternal grandfather:
6831 * "[:" ending in ":" but not in ":]" */
6841 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
6844 if (POSIXCC(UCHARAT(RExC_parse))) {
6845 const char *s = RExC_parse;
6846 const char c = *s++;
6850 if (*s && c == *s && s[1] == ']') {
6851 if (ckWARN(WARN_REGEXP))
6853 "POSIX syntax [%c %c] belongs inside character classes",
6856 /* [[=foo=]] and [[.foo.]] are still future. */
6857 if (POSIXCC_NOTYET(c)) {
6858 /* adjust RExC_parse so the error shows after
6860 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
6862 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
6870 parse a class specification and produce either an ANYOF node that
6871 matches the pattern. If the pattern matches a single char only and
6872 that char is < 256 then we produce an EXACT node instead.
6875 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
6878 register UV value = 0;
6879 register UV nextvalue;
6880 register IV prevvalue = OOB_UNICODE;
6881 register IV range = 0;
6882 register regnode *ret;
6885 char *rangebegin = NULL;
6886 bool need_class = 0;
6889 bool optimize_invert = TRUE;
6890 AV* unicode_alternate = NULL;
6892 UV literal_endpoint = 0;
6894 UV stored = 0; /* number of chars stored in the class */
6896 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
6897 case we need to change the emitted regop to an EXACT. */
6898 const char * orig_parse = RExC_parse;
6899 GET_RE_DEBUG_FLAGS_DECL;
6901 PERL_UNUSED_ARG(depth);
6904 DEBUG_PARSE("clas");
6906 /* Assume we are going to generate an ANYOF node. */
6907 ret = reganode(pRExC_state, ANYOF, 0);
6910 ANYOF_FLAGS(ret) = 0;
6912 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
6916 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
6920 RExC_size += ANYOF_SKIP;
6921 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
6924 RExC_emit += ANYOF_SKIP;
6926 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
6928 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
6929 ANYOF_BITMAP_ZERO(ret);
6930 listsv = newSVpvs("# comment\n");
6933 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
6935 if (!SIZE_ONLY && POSIXCC(nextvalue))
6936 checkposixcc(pRExC_state);
6938 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
6939 if (UCHARAT(RExC_parse) == ']')
6943 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
6947 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
6950 rangebegin = RExC_parse;
6952 value = utf8n_to_uvchr((U8*)RExC_parse,
6953 RExC_end - RExC_parse,
6954 &numlen, UTF8_ALLOW_DEFAULT);
6955 RExC_parse += numlen;
6958 value = UCHARAT(RExC_parse++);
6960 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
6961 if (value == '[' && POSIXCC(nextvalue))
6962 namedclass = regpposixcc(pRExC_state, value);
6963 else if (value == '\\') {
6965 value = utf8n_to_uvchr((U8*)RExC_parse,
6966 RExC_end - RExC_parse,
6967 &numlen, UTF8_ALLOW_DEFAULT);
6968 RExC_parse += numlen;
6971 value = UCHARAT(RExC_parse++);
6972 /* Some compilers cannot handle switching on 64-bit integer
6973 * values, therefore value cannot be an UV. Yes, this will
6974 * be a problem later if we want switch on Unicode.
6975 * A similar issue a little bit later when switching on
6976 * namedclass. --jhi */
6977 switch ((I32)value) {
6978 case 'w': namedclass = ANYOF_ALNUM; break;
6979 case 'W': namedclass = ANYOF_NALNUM; break;
6980 case 's': namedclass = ANYOF_SPACE; break;
6981 case 'S': namedclass = ANYOF_NSPACE; break;
6982 case 'd': namedclass = ANYOF_DIGIT; break;
6983 case 'D': namedclass = ANYOF_NDIGIT; break;
6984 case 'N': /* Handle \N{NAME} in class */
6986 /* We only pay attention to the first char of
6987 multichar strings being returned. I kinda wonder
6988 if this makes sense as it does change the behaviour
6989 from earlier versions, OTOH that behaviour was broken
6991 UV v; /* value is register so we cant & it /grrr */
6992 if (reg_namedseq(pRExC_state, &v)) {
7002 if (RExC_parse >= RExC_end)
7003 vFAIL2("Empty \\%c{}", (U8)value);
7004 if (*RExC_parse == '{') {
7005 const U8 c = (U8)value;
7006 e = strchr(RExC_parse++, '}');
7008 vFAIL2("Missing right brace on \\%c{}", c);
7009 while (isSPACE(UCHARAT(RExC_parse)))
7011 if (e == RExC_parse)
7012 vFAIL2("Empty \\%c{}", c);
7014 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
7022 if (UCHARAT(RExC_parse) == '^') {
7025 value = value == 'p' ? 'P' : 'p'; /* toggle */
7026 while (isSPACE(UCHARAT(RExC_parse))) {
7031 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
7032 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
7035 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
7036 namedclass = ANYOF_MAX; /* no official name, but it's named */
7039 case 'n': value = '\n'; break;
7040 case 'r': value = '\r'; break;
7041 case 't': value = '\t'; break;
7042 case 'f': value = '\f'; break;
7043 case 'b': value = '\b'; break;
7044 case 'e': value = ASCII_TO_NATIVE('\033');break;
7045 case 'a': value = ASCII_TO_NATIVE('\007');break;
7047 if (*RExC_parse == '{') {
7048 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7049 | PERL_SCAN_DISALLOW_PREFIX;
7050 char * const e = strchr(RExC_parse++, '}');
7052 vFAIL("Missing right brace on \\x{}");
7054 numlen = e - RExC_parse;
7055 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
7059 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
7061 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
7062 RExC_parse += numlen;
7064 if (PL_encoding && value < 0x100)
7065 goto recode_encoding;
7068 value = UCHARAT(RExC_parse++);
7069 value = toCTRL(value);
7071 case '0': case '1': case '2': case '3': case '4':
7072 case '5': case '6': case '7': case '8': case '9':
7076 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
7077 RExC_parse += numlen;
7078 if (PL_encoding && value < 0x100)
7079 goto recode_encoding;
7084 SV* enc = PL_encoding;
7085 value = reg_recode((const char)(U8)value, &enc);
7086 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
7088 "Invalid escape in the specified encoding");
7092 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
7094 "Unrecognized escape \\%c in character class passed through",
7098 } /* end of \blah */
7104 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
7106 if (!SIZE_ONLY && !need_class)
7107 ANYOF_CLASS_ZERO(ret);
7111 /* a bad range like a-\d, a-[:digit:] ? */
7114 if (ckWARN(WARN_REGEXP)) {
7116 RExC_parse >= rangebegin ?
7117 RExC_parse - rangebegin : 0;
7119 "False [] range \"%*.*s\"",
7122 if (prevvalue < 256) {
7123 ANYOF_BITMAP_SET(ret, prevvalue);
7124 ANYOF_BITMAP_SET(ret, '-');
7127 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
7128 Perl_sv_catpvf(aTHX_ listsv,
7129 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
7133 range = 0; /* this was not a true range */
7137 const char *what = NULL;
7140 if (namedclass > OOB_NAMEDCLASS)
7141 optimize_invert = FALSE;
7142 /* Possible truncation here but in some 64-bit environments
7143 * the compiler gets heartburn about switch on 64-bit values.
7144 * A similar issue a little earlier when switching on value.
7146 switch ((I32)namedclass) {
7149 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
7151 for (value = 0; value < 256; value++)
7153 ANYOF_BITMAP_SET(ret, value);
7160 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
7162 for (value = 0; value < 256; value++)
7163 if (!isALNUM(value))
7164 ANYOF_BITMAP_SET(ret, value);
7171 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
7173 for (value = 0; value < 256; value++)
7174 if (isALNUMC(value))
7175 ANYOF_BITMAP_SET(ret, value);
7182 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
7184 for (value = 0; value < 256; value++)
7185 if (!isALNUMC(value))
7186 ANYOF_BITMAP_SET(ret, value);
7193 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
7195 for (value = 0; value < 256; value++)
7197 ANYOF_BITMAP_SET(ret, value);
7204 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
7206 for (value = 0; value < 256; value++)
7207 if (!isALPHA(value))
7208 ANYOF_BITMAP_SET(ret, value);
7215 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
7218 for (value = 0; value < 128; value++)
7219 ANYOF_BITMAP_SET(ret, value);
7221 for (value = 0; value < 256; value++) {
7223 ANYOF_BITMAP_SET(ret, value);
7232 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
7235 for (value = 128; value < 256; value++)
7236 ANYOF_BITMAP_SET(ret, value);
7238 for (value = 0; value < 256; value++) {
7239 if (!isASCII(value))
7240 ANYOF_BITMAP_SET(ret, value);
7249 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
7251 for (value = 0; value < 256; value++)
7253 ANYOF_BITMAP_SET(ret, value);
7260 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
7262 for (value = 0; value < 256; value++)
7263 if (!isBLANK(value))
7264 ANYOF_BITMAP_SET(ret, value);
7271 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
7273 for (value = 0; value < 256; value++)
7275 ANYOF_BITMAP_SET(ret, value);
7282 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
7284 for (value = 0; value < 256; value++)
7285 if (!isCNTRL(value))
7286 ANYOF_BITMAP_SET(ret, value);
7293 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
7295 /* consecutive digits assumed */
7296 for (value = '0'; value <= '9'; value++)
7297 ANYOF_BITMAP_SET(ret, value);
7304 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
7306 /* consecutive digits assumed */
7307 for (value = 0; value < '0'; value++)
7308 ANYOF_BITMAP_SET(ret, value);
7309 for (value = '9' + 1; value < 256; value++)
7310 ANYOF_BITMAP_SET(ret, value);
7317 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
7319 for (value = 0; value < 256; value++)
7321 ANYOF_BITMAP_SET(ret, value);
7328 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
7330 for (value = 0; value < 256; value++)
7331 if (!isGRAPH(value))
7332 ANYOF_BITMAP_SET(ret, value);
7339 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
7341 for (value = 0; value < 256; value++)
7343 ANYOF_BITMAP_SET(ret, value);
7350 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
7352 for (value = 0; value < 256; value++)
7353 if (!isLOWER(value))
7354 ANYOF_BITMAP_SET(ret, value);
7361 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
7363 for (value = 0; value < 256; value++)
7365 ANYOF_BITMAP_SET(ret, value);
7372 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
7374 for (value = 0; value < 256; value++)
7375 if (!isPRINT(value))
7376 ANYOF_BITMAP_SET(ret, value);
7383 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
7385 for (value = 0; value < 256; value++)
7386 if (isPSXSPC(value))
7387 ANYOF_BITMAP_SET(ret, value);
7394 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
7396 for (value = 0; value < 256; value++)
7397 if (!isPSXSPC(value))
7398 ANYOF_BITMAP_SET(ret, value);
7405 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
7407 for (value = 0; value < 256; value++)
7409 ANYOF_BITMAP_SET(ret, value);
7416 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
7418 for (value = 0; value < 256; value++)
7419 if (!isPUNCT(value))
7420 ANYOF_BITMAP_SET(ret, value);
7427 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
7429 for (value = 0; value < 256; value++)
7431 ANYOF_BITMAP_SET(ret, value);
7438 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
7440 for (value = 0; value < 256; value++)
7441 if (!isSPACE(value))
7442 ANYOF_BITMAP_SET(ret, value);
7449 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
7451 for (value = 0; value < 256; value++)
7453 ANYOF_BITMAP_SET(ret, value);
7460 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
7462 for (value = 0; value < 256; value++)
7463 if (!isUPPER(value))
7464 ANYOF_BITMAP_SET(ret, value);
7471 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
7473 for (value = 0; value < 256; value++)
7474 if (isXDIGIT(value))
7475 ANYOF_BITMAP_SET(ret, value);
7482 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
7484 for (value = 0; value < 256; value++)
7485 if (!isXDIGIT(value))
7486 ANYOF_BITMAP_SET(ret, value);
7492 /* this is to handle \p and \P */
7495 vFAIL("Invalid [::] class");
7499 /* Strings such as "+utf8::isWord\n" */
7500 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
7503 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
7506 } /* end of namedclass \blah */
7509 if (prevvalue > (IV)value) /* b-a */ {
7510 const int w = RExC_parse - rangebegin;
7511 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
7512 range = 0; /* not a valid range */
7516 prevvalue = value; /* save the beginning of the range */
7517 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
7518 RExC_parse[1] != ']') {
7521 /* a bad range like \w-, [:word:]- ? */
7522 if (namedclass > OOB_NAMEDCLASS) {
7523 if (ckWARN(WARN_REGEXP)) {
7525 RExC_parse >= rangebegin ?
7526 RExC_parse - rangebegin : 0;
7528 "False [] range \"%*.*s\"",
7532 ANYOF_BITMAP_SET(ret, '-');
7534 range = 1; /* yeah, it's a range! */
7535 continue; /* but do it the next time */
7539 /* now is the next time */
7540 /*stored += (value - prevvalue + 1);*/
7542 if (prevvalue < 256) {
7543 const IV ceilvalue = value < 256 ? value : 255;
7546 /* In EBCDIC [\x89-\x91] should include
7547 * the \x8e but [i-j] should not. */
7548 if (literal_endpoint == 2 &&
7549 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
7550 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
7552 if (isLOWER(prevvalue)) {
7553 for (i = prevvalue; i <= ceilvalue; i++)
7555 ANYOF_BITMAP_SET(ret, i);
7557 for (i = prevvalue; i <= ceilvalue; i++)
7559 ANYOF_BITMAP_SET(ret, i);
7564 for (i = prevvalue; i <= ceilvalue; i++) {
7565 if (!ANYOF_BITMAP_TEST(ret,i)) {
7567 ANYOF_BITMAP_SET(ret, i);
7571 if (value > 255 || UTF) {
7572 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
7573 const UV natvalue = NATIVE_TO_UNI(value);
7574 stored+=2; /* can't optimize this class */
7575 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
7576 if (prevnatvalue < natvalue) { /* what about > ? */
7577 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
7578 prevnatvalue, natvalue);
7580 else if (prevnatvalue == natvalue) {
7581 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
7583 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
7585 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
7587 #ifdef EBCDIC /* RD t/uni/fold ff and 6b */
7588 if (RExC_precomp[0] == ':' &&
7589 RExC_precomp[1] == '[' &&
7590 (f == 0xDF || f == 0x92)) {
7591 f = NATIVE_TO_UNI(f);
7594 /* If folding and foldable and a single
7595 * character, insert also the folded version
7596 * to the charclass. */
7598 #ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
7599 if ((RExC_precomp[0] == ':' &&
7600 RExC_precomp[1] == '[' &&
7602 (value == 0xFB05 || value == 0xFB06))) ?
7603 foldlen == ((STRLEN)UNISKIP(f) - 1) :
7604 foldlen == (STRLEN)UNISKIP(f) )
7606 if (foldlen == (STRLEN)UNISKIP(f))
7608 Perl_sv_catpvf(aTHX_ listsv,
7611 /* Any multicharacter foldings
7612 * require the following transform:
7613 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
7614 * where E folds into "pq" and F folds
7615 * into "rst", all other characters
7616 * fold to single characters. We save
7617 * away these multicharacter foldings,
7618 * to be later saved as part of the
7619 * additional "s" data. */
7622 if (!unicode_alternate)
7623 unicode_alternate = newAV();
7624 sv = newSVpvn((char*)foldbuf, foldlen);
7626 av_push(unicode_alternate, sv);
7630 /* If folding and the value is one of the Greek
7631 * sigmas insert a few more sigmas to make the
7632 * folding rules of the sigmas to work right.
7633 * Note that not all the possible combinations
7634 * are handled here: some of them are handled
7635 * by the standard folding rules, and some of
7636 * them (literal or EXACTF cases) are handled
7637 * during runtime in regexec.c:S_find_byclass(). */
7638 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
7639 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
7640 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
7641 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
7642 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
7644 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
7645 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
7646 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
7651 literal_endpoint = 0;
7655 range = 0; /* this range (if it was one) is done now */
7659 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
7661 RExC_size += ANYOF_CLASS_ADD_SKIP;
7663 RExC_emit += ANYOF_CLASS_ADD_SKIP;
7669 /****** !SIZE_ONLY AFTER HERE *********/
7671 if( stored == 1 && value < 256
7672 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
7674 /* optimize single char class to an EXACT node
7675 but *only* when its not a UTF/high char */
7676 const char * cur_parse= RExC_parse;
7677 RExC_emit = (regnode *)orig_emit;
7678 RExC_parse = (char *)orig_parse;
7679 ret = reg_node(pRExC_state,
7680 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
7681 RExC_parse = (char *)cur_parse;
7682 *STRING(ret)= (char)value;
7684 RExC_emit += STR_SZ(1);
7687 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
7688 if ( /* If the only flag is folding (plus possibly inversion). */
7689 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
7691 for (value = 0; value < 256; ++value) {
7692 if (ANYOF_BITMAP_TEST(ret, value)) {
7693 UV fold = PL_fold[value];
7696 ANYOF_BITMAP_SET(ret, fold);
7699 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
7702 /* optimize inverted simple patterns (e.g. [^a-z]) */
7703 if (optimize_invert &&
7704 /* If the only flag is inversion. */
7705 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
7706 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
7707 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
7708 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
7711 AV * const av = newAV();
7713 /* The 0th element stores the character class description
7714 * in its textual form: used later (regexec.c:Perl_regclass_swash())
7715 * to initialize the appropriate swash (which gets stored in
7716 * the 1st element), and also useful for dumping the regnode.
7717 * The 2nd element stores the multicharacter foldings,
7718 * used later (regexec.c:S_reginclass()). */
7719 av_store(av, 0, listsv);
7720 av_store(av, 1, NULL);
7721 av_store(av, 2, (SV*)unicode_alternate);
7722 rv = newRV_noinc((SV*)av);
7723 n = add_data(pRExC_state, 1, "s");
7724 RExC_rxi->data->data[n] = (void*)rv;
7731 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
7733 char* const retval = RExC_parse++;
7736 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
7737 RExC_parse[2] == '#') {
7738 while (*RExC_parse != ')') {
7739 if (RExC_parse == RExC_end)
7740 FAIL("Sequence (?#... not terminated");
7746 if (RExC_flags & RXf_PMf_EXTENDED) {
7747 if (isSPACE(*RExC_parse)) {
7751 else if (*RExC_parse == '#') {
7752 while (RExC_parse < RExC_end)
7753 if (*RExC_parse++ == '\n') break;
7762 - reg_node - emit a node
7764 STATIC regnode * /* Location. */
7765 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
7768 register regnode *ptr;
7769 regnode * const ret = RExC_emit;
7770 GET_RE_DEBUG_FLAGS_DECL;
7773 SIZE_ALIGN(RExC_size);
7778 if (OP(RExC_emit) == 255)
7779 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %s: %d ",
7780 reg_name[op], OP(RExC_emit));
7782 NODE_ALIGN_FILL(ret);
7784 FILL_ADVANCE_NODE(ptr, op);
7785 if (RExC_offsets) { /* MJD */
7786 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
7787 "reg_node", __LINE__,
7789 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
7790 ? "Overwriting end of array!\n" : "OK",
7791 (UV)(RExC_emit - RExC_emit_start),
7792 (UV)(RExC_parse - RExC_start),
7793 (UV)RExC_offsets[0]));
7794 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
7802 - reganode - emit a node with an argument
7804 STATIC regnode * /* Location. */
7805 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
7808 register regnode *ptr;
7809 regnode * const ret = RExC_emit;
7810 GET_RE_DEBUG_FLAGS_DECL;
7813 SIZE_ALIGN(RExC_size);
7818 assert(2==regarglen[op]+1);
7820 Anything larger than this has to allocate the extra amount.
7821 If we changed this to be:
7823 RExC_size += (1 + regarglen[op]);
7825 then it wouldn't matter. Its not clear what side effect
7826 might come from that so its not done so far.
7832 if (OP(RExC_emit) == 255)
7833 Perl_croak(aTHX_ "panic: reganode overwriting end of allocated program space");
7835 NODE_ALIGN_FILL(ret);
7837 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
7838 if (RExC_offsets) { /* MJD */
7839 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
7843 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
7844 "Overwriting end of array!\n" : "OK",
7845 (UV)(RExC_emit - RExC_emit_start),
7846 (UV)(RExC_parse - RExC_start),
7847 (UV)RExC_offsets[0]));
7848 Set_Cur_Node_Offset;
7856 - reguni - emit (if appropriate) a Unicode character
7859 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
7862 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
7866 - reginsert - insert an operator in front of already-emitted operand
7868 * Means relocating the operand.
7871 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
7874 register regnode *src;
7875 register regnode *dst;
7876 register regnode *place;
7877 const int offset = regarglen[(U8)op];
7878 const int size = NODE_STEP_REGNODE + offset;
7879 GET_RE_DEBUG_FLAGS_DECL;
7880 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
7881 DEBUG_PARSE_FMT("inst"," - %s",reg_name[op]);
7890 if (RExC_open_parens) {
7892 DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);
7893 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
7894 if ( RExC_open_parens[paren] >= opnd ) {
7895 DEBUG_PARSE_FMT("open"," - %d",size);
7896 RExC_open_parens[paren] += size;
7898 DEBUG_PARSE_FMT("open"," - %s","ok");
7900 if ( RExC_close_parens[paren] >= opnd ) {
7901 DEBUG_PARSE_FMT("close"," - %d",size);
7902 RExC_close_parens[paren] += size;
7904 DEBUG_PARSE_FMT("close"," - %s","ok");
7909 while (src > opnd) {
7910 StructCopy(--src, --dst, regnode);
7911 if (RExC_offsets) { /* MJD 20010112 */
7912 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
7916 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
7917 ? "Overwriting end of array!\n" : "OK",
7918 (UV)(src - RExC_emit_start),
7919 (UV)(dst - RExC_emit_start),
7920 (UV)RExC_offsets[0]));
7921 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
7922 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
7927 place = opnd; /* Op node, where operand used to be. */
7928 if (RExC_offsets) { /* MJD */
7929 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
7933 (UV)(place - RExC_emit_start) > RExC_offsets[0]
7934 ? "Overwriting end of array!\n" : "OK",
7935 (UV)(place - RExC_emit_start),
7936 (UV)(RExC_parse - RExC_start),
7937 (UV)RExC_offsets[0]));
7938 Set_Node_Offset(place, RExC_parse);
7939 Set_Node_Length(place, 1);
7941 src = NEXTOPER(place);
7942 FILL_ADVANCE_NODE(place, op);
7943 Zero(src, offset, regnode);
7947 - regtail - set the next-pointer at the end of a node chain of p to val.
7948 - SEE ALSO: regtail_study
7950 /* TODO: All three parms should be const */
7952 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
7955 register regnode *scan;
7956 GET_RE_DEBUG_FLAGS_DECL;
7958 PERL_UNUSED_ARG(depth);
7964 /* Find last node. */
7967 regnode * const temp = regnext(scan);
7969 SV * const mysv=sv_newmortal();
7970 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
7971 regprop(RExC_rx, mysv, scan);
7972 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
7973 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
7974 (temp == NULL ? "->" : ""),
7975 (temp == NULL ? reg_name[OP(val)] : "")
7983 if (reg_off_by_arg[OP(scan)]) {
7984 ARG_SET(scan, val - scan);
7987 NEXT_OFF(scan) = val - scan;
7993 - regtail_study - set the next-pointer at the end of a node chain of p to val.
7994 - Look for optimizable sequences at the same time.
7995 - currently only looks for EXACT chains.
7997 This is expermental code. The idea is to use this routine to perform
7998 in place optimizations on branches and groups as they are constructed,
7999 with the long term intention of removing optimization from study_chunk so
8000 that it is purely analytical.
8002 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
8003 to control which is which.
8006 /* TODO: All four parms should be const */
8009 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
8012 register regnode *scan;
8014 #ifdef EXPERIMENTAL_INPLACESCAN
8018 GET_RE_DEBUG_FLAGS_DECL;
8024 /* Find last node. */
8028 regnode * const temp = regnext(scan);
8029 #ifdef EXPERIMENTAL_INPLACESCAN
8030 if (PL_regkind[OP(scan)] == EXACT)
8031 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
8039 if( exact == PSEUDO )
8041 else if ( exact != OP(scan) )
8050 SV * const mysv=sv_newmortal();
8051 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
8052 regprop(RExC_rx, mysv, scan);
8053 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
8054 SvPV_nolen_const(mysv),
8063 SV * const mysv_val=sv_newmortal();
8064 DEBUG_PARSE_MSG("");
8065 regprop(RExC_rx, mysv_val, val);
8066 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
8067 SvPV_nolen_const(mysv_val),
8068 (IV)REG_NODE_NUM(val),
8072 if (reg_off_by_arg[OP(scan)]) {
8073 ARG_SET(scan, val - scan);
8076 NEXT_OFF(scan) = val - scan;
8084 - regcurly - a little FSA that accepts {\d+,?\d*}
8087 S_regcurly(register const char *s)
8106 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
8109 Perl_regdump(pTHX_ const regexp *r)
8113 SV * const sv = sv_newmortal();
8114 SV *dsv= sv_newmortal();
8117 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
8119 /* Header fields of interest. */
8120 if (r->anchored_substr) {
8121 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
8122 RE_SV_DUMPLEN(r->anchored_substr), 30);
8123 PerlIO_printf(Perl_debug_log,
8124 "anchored %s%s at %"IVdf" ",
8125 s, RE_SV_TAIL(r->anchored_substr),
8126 (IV)r->anchored_offset);
8127 } else if (r->anchored_utf8) {
8128 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
8129 RE_SV_DUMPLEN(r->anchored_utf8), 30);
8130 PerlIO_printf(Perl_debug_log,
8131 "anchored utf8 %s%s at %"IVdf" ",
8132 s, RE_SV_TAIL(r->anchored_utf8),
8133 (IV)r->anchored_offset);
8135 if (r->float_substr) {
8136 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
8137 RE_SV_DUMPLEN(r->float_substr), 30);
8138 PerlIO_printf(Perl_debug_log,
8139 "floating %s%s at %"IVdf"..%"UVuf" ",
8140 s, RE_SV_TAIL(r->float_substr),
8141 (IV)r->float_min_offset, (UV)r->float_max_offset);
8142 } else if (r->float_utf8) {
8143 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
8144 RE_SV_DUMPLEN(r->float_utf8), 30);
8145 PerlIO_printf(Perl_debug_log,
8146 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
8147 s, RE_SV_TAIL(r->float_utf8),
8148 (IV)r->float_min_offset, (UV)r->float_max_offset);
8150 if (r->check_substr || r->check_utf8)
8151 PerlIO_printf(Perl_debug_log,
8153 (r->check_substr == r->float_substr
8154 && r->check_utf8 == r->float_utf8
8155 ? "(checking floating" : "(checking anchored"));
8156 if (r->extflags & RXf_NOSCAN)
8157 PerlIO_printf(Perl_debug_log, " noscan");
8158 if (r->extflags & RXf_CHECK_ALL)
8159 PerlIO_printf(Perl_debug_log, " isall");
8160 if (r->check_substr || r->check_utf8)
8161 PerlIO_printf(Perl_debug_log, ") ");
8163 if (ri->regstclass) {
8164 regprop(r, sv, ri->regstclass);
8165 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
8167 if (r->extflags & RXf_ANCH) {
8168 PerlIO_printf(Perl_debug_log, "anchored");
8169 if (r->extflags & RXf_ANCH_BOL)
8170 PerlIO_printf(Perl_debug_log, "(BOL)");
8171 if (r->extflags & RXf_ANCH_MBOL)
8172 PerlIO_printf(Perl_debug_log, "(MBOL)");
8173 if (r->extflags & RXf_ANCH_SBOL)
8174 PerlIO_printf(Perl_debug_log, "(SBOL)");
8175 if (r->extflags & RXf_ANCH_GPOS)
8176 PerlIO_printf(Perl_debug_log, "(GPOS)");
8177 PerlIO_putc(Perl_debug_log, ' ');
8179 if (r->extflags & RXf_GPOS_SEEN)
8180 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
8181 if (r->intflags & PREGf_SKIP)
8182 PerlIO_printf(Perl_debug_log, "plus ");
8183 if (r->intflags & PREGf_IMPLICIT)
8184 PerlIO_printf(Perl_debug_log, "implicit ");
8185 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
8186 if (r->extflags & RXf_EVAL_SEEN)
8187 PerlIO_printf(Perl_debug_log, "with eval ");
8188 PerlIO_printf(Perl_debug_log, "\n");
8190 PERL_UNUSED_CONTEXT;
8192 #endif /* DEBUGGING */
8196 - regprop - printable representation of opcode
8199 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
8204 RXi_GET_DECL(prog,progi);
8205 GET_RE_DEBUG_FLAGS_DECL;
8208 sv_setpvn(sv, "", 0);
8210 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
8211 /* It would be nice to FAIL() here, but this may be called from
8212 regexec.c, and it would be hard to supply pRExC_state. */
8213 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
8214 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
8216 k = PL_regkind[OP(o)];
8219 SV * const dsv = sv_2mortal(newSVpvs(""));
8220 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
8221 * is a crude hack but it may be the best for now since
8222 * we have no flag "this EXACTish node was UTF-8"
8224 const char * const s =
8225 pv_pretty(dsv, STRING(o), STR_LEN(o), 60,
8226 PL_colors[0], PL_colors[1],
8227 PERL_PV_ESCAPE_UNI_DETECT |
8228 PERL_PV_PRETTY_ELIPSES |
8231 Perl_sv_catpvf(aTHX_ sv, " %s", s );
8232 } else if (k == TRIE) {
8233 /* print the details of the trie in dumpuntil instead, as
8234 * progi->data isn't available here */
8235 const char op = OP(o);
8236 const I32 n = ARG(o);
8237 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
8238 (reg_ac_data *)progi->data->data[n] :
8240 const reg_trie_data * const trie
8241 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
8243 Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]);
8244 DEBUG_TRIE_COMPILE_r(
8245 Perl_sv_catpvf(aTHX_ sv,
8246 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
8247 (UV)trie->startstate,
8248 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
8249 (UV)trie->wordcount,
8252 (UV)TRIE_CHARCOUNT(trie),
8253 (UV)trie->uniquecharcount
8256 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
8258 int rangestart = -1;
8259 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
8260 Perl_sv_catpvf(aTHX_ sv, "[");
8261 for (i = 0; i <= 256; i++) {
8262 if (i < 256 && BITMAP_TEST(bitmap,i)) {
8263 if (rangestart == -1)
8265 } else if (rangestart != -1) {
8266 if (i <= rangestart + 3)
8267 for (; rangestart < i; rangestart++)
8268 put_byte(sv, rangestart);
8270 put_byte(sv, rangestart);
8272 put_byte(sv, i - 1);
8277 Perl_sv_catpvf(aTHX_ sv, "]");
8280 } else if (k == CURLY) {
8281 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
8282 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
8283 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
8285 else if (k == WHILEM && o->flags) /* Ordinal/of */
8286 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
8287 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT)
8288 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
8289 else if (k == GOSUB)
8290 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
8291 else if (k == VERB) {
8293 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
8294 (SV*)progi->data->data[ ARG( o ) ]);
8295 } else if (k == LOGICAL)
8296 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
8297 else if (k == ANYOF) {
8298 int i, rangestart = -1;
8299 const U8 flags = ANYOF_FLAGS(o);
8301 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
8302 static const char * const anyofs[] = {
8335 if (flags & ANYOF_LOCALE)
8336 sv_catpvs(sv, "{loc}");
8337 if (flags & ANYOF_FOLD)
8338 sv_catpvs(sv, "{i}");
8339 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
8340 if (flags & ANYOF_INVERT)
8342 for (i = 0; i <= 256; i++) {
8343 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
8344 if (rangestart == -1)
8346 } else if (rangestart != -1) {
8347 if (i <= rangestart + 3)
8348 for (; rangestart < i; rangestart++)
8349 put_byte(sv, rangestart);
8351 put_byte(sv, rangestart);
8353 put_byte(sv, i - 1);
8359 if (o->flags & ANYOF_CLASS)
8360 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
8361 if (ANYOF_CLASS_TEST(o,i))
8362 sv_catpv(sv, anyofs[i]);
8364 if (flags & ANYOF_UNICODE)
8365 sv_catpvs(sv, "{unicode}");
8366 else if (flags & ANYOF_UNICODE_ALL)
8367 sv_catpvs(sv, "{unicode_all}");
8371 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
8375 U8 s[UTF8_MAXBYTES_CASE+1];
8377 for (i = 0; i <= 256; i++) { /* just the first 256 */
8378 uvchr_to_utf8(s, i);
8380 if (i < 256 && swash_fetch(sw, s, TRUE)) {
8381 if (rangestart == -1)
8383 } else if (rangestart != -1) {
8384 if (i <= rangestart + 3)
8385 for (; rangestart < i; rangestart++) {
8386 const U8 * const e = uvchr_to_utf8(s,rangestart);
8388 for(p = s; p < e; p++)
8392 const U8 *e = uvchr_to_utf8(s,rangestart);
8394 for (p = s; p < e; p++)
8397 e = uvchr_to_utf8(s, i-1);
8398 for (p = s; p < e; p++)
8405 sv_catpvs(sv, "..."); /* et cetera */
8409 char *s = savesvpv(lv);
8410 char * const origs = s;
8412 while (*s && *s != '\n')
8416 const char * const t = ++s;
8434 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
8436 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
8437 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
8439 PERL_UNUSED_CONTEXT;
8440 PERL_UNUSED_ARG(sv);
8442 PERL_UNUSED_ARG(prog);
8443 #endif /* DEBUGGING */
8447 Perl_re_intuit_string(pTHX_ regexp *prog)
8448 { /* Assume that RE_INTUIT is set */
8450 GET_RE_DEBUG_FLAGS_DECL;
8451 PERL_UNUSED_CONTEXT;
8455 const char * const s = SvPV_nolen_const(prog->check_substr
8456 ? prog->check_substr : prog->check_utf8);
8458 if (!PL_colorset) reginitcolors();
8459 PerlIO_printf(Perl_debug_log,
8460 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
8462 prog->check_substr ? "" : "utf8 ",
8463 PL_colors[5],PL_colors[0],
8466 (strlen(s) > 60 ? "..." : ""));
8469 return prog->check_substr ? prog->check_substr : prog->check_utf8;
8473 pregfree - free a regexp
8475 See regdupe below if you change anything here.
8479 Perl_pregfree(pTHX_ struct regexp *r)
8483 GET_RE_DEBUG_FLAGS_DECL;
8485 if (!r || (--r->refcnt > 0))
8491 SV *dsv= sv_newmortal();
8492 RE_PV_QUOTED_DECL(s, (r->extflags & RXf_UTF8),
8493 dsv, r->precomp, r->prelen, 60);
8494 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
8495 PL_colors[4],PL_colors[5],s);
8499 /* gcov results gave these as non-null 100% of the time, so there's no
8500 optimisation in checking them before calling Safefree */
8501 Safefree(r->precomp);
8502 Safefree(ri->offsets); /* 20010421 MJD */
8503 RX_MATCH_COPY_FREE(r);
8504 #ifdef PERL_OLD_COPY_ON_WRITE
8506 SvREFCNT_dec(r->saved_copy);
8509 if (r->anchored_substr)
8510 SvREFCNT_dec(r->anchored_substr);
8511 if (r->anchored_utf8)
8512 SvREFCNT_dec(r->anchored_utf8);
8513 if (r->float_substr)
8514 SvREFCNT_dec(r->float_substr);
8516 SvREFCNT_dec(r->float_utf8);
8517 Safefree(r->substrs);
8520 SvREFCNT_dec(r->paren_names);
8522 int n = ri->data->count;
8523 PAD* new_comppad = NULL;
8528 /* If you add a ->what type here, update the comment in regcomp.h */
8529 switch (ri->data->what[n]) {
8533 SvREFCNT_dec((SV*)ri->data->data[n]);
8536 Safefree(ri->data->data[n]);
8539 new_comppad = (AV*)ri->data->data[n];
8542 if (new_comppad == NULL)
8543 Perl_croak(aTHX_ "panic: pregfree comppad");
8544 PAD_SAVE_LOCAL(old_comppad,
8545 /* Watch out for global destruction's random ordering. */
8546 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
8549 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
8552 op_free((OP_4tree*)ri->data->data[n]);
8554 PAD_RESTORE_LOCAL(old_comppad);
8555 SvREFCNT_dec((SV*)new_comppad);
8561 { /* Aho Corasick add-on structure for a trie node.
8562 Used in stclass optimization only */
8564 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
8566 refcount = --aho->refcount;
8569 PerlMemShared_free(aho->states);
8570 PerlMemShared_free(aho->fail);
8571 /* do this last!!!! */
8572 PerlMemShared_free(ri->data->data[n]);
8573 PerlMemShared_free(ri->regstclass);
8579 /* trie structure. */
8581 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
8583 refcount = --trie->refcount;
8586 PerlMemShared_free(trie->charmap);
8587 PerlMemShared_free(trie->states);
8588 PerlMemShared_free(trie->trans);
8590 PerlMemShared_free(trie->bitmap);
8592 PerlMemShared_free(trie->wordlen);
8594 PerlMemShared_free(trie->jump);
8596 PerlMemShared_free(trie->nextword);
8597 /* do this last!!!! */
8598 PerlMemShared_free(ri->data->data[n]);
8603 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
8606 Safefree(ri->data->what);
8609 Safefree(r->startp);
8612 Safefree(ri->swap->startp);
8613 Safefree(ri->swap->endp);
8620 #define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t))
8621 #define av_dup_inc(s,t) (AV*)SvREFCNT_inc(sv_dup((SV*)s,t))
8622 #define hv_dup_inc(s,t) (HV*)SvREFCNT_inc(sv_dup((SV*)s,t))
8623 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
8626 regdupe - duplicate a regexp.
8628 This routine is called by sv.c's re_dup and is expected to clone a
8629 given regexp structure. It is a no-op when not under USE_ITHREADS.
8630 (Originally this *was* re_dup() for change history see sv.c)
8632 See pregfree() above if you change anything here.
8634 #if defined(USE_ITHREADS)
8636 Perl_regdupe(pTHX_ const regexp *r, CLONE_PARAMS *param)
8640 regexp_internal *reti;
8642 struct reg_substr_datum *s;
8646 return (REGEXP *)NULL;
8648 if ((ret = (REGEXP *)ptr_table_fetch(PL_ptr_table, r)))
8651 len = ri->offsets[0];
8652 npar = r->nparens+1;
8654 Newxz(ret, 1, regexp);
8655 Newxc(reti, sizeof(regexp_internal) + (len+1)*sizeof(regnode), char, regexp_internal);
8657 Copy(ri->program, reti->program, len+1, regnode);
8659 Newx(ret->startp, npar, I32);
8660 Copy(r->startp, ret->startp, npar, I32);
8661 Newx(ret->endp, npar, I32);
8662 Copy(r->startp, ret->startp, npar, I32);
8664 Newx(reti->swap, 1, regexp_paren_ofs);
8665 /* no need to copy these */
8666 Newx(reti->swap->startp, npar, I32);
8667 Newx(reti->swap->endp, npar, I32);
8672 Newx(ret->substrs, 1, struct reg_substr_data);
8673 for (s = ret->substrs->data, i = 0; i < 3; i++, s++) {
8674 s->min_offset = r->substrs->data[i].min_offset;
8675 s->max_offset = r->substrs->data[i].max_offset;
8676 s->end_shift = r->substrs->data[i].end_shift;
8677 s->substr = sv_dup_inc(r->substrs->data[i].substr, param);
8678 s->utf8_substr = sv_dup_inc(r->substrs->data[i].utf8_substr, param);
8681 reti->regstclass = NULL;
8684 const int count = ri->data->count;
8687 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
8688 char, struct reg_data);
8689 Newx(d->what, count, U8);
8692 for (i = 0; i < count; i++) {
8693 d->what[i] = ri->data->what[i];
8694 switch (d->what[i]) {
8695 /* legal options are one of: sSfpontTu
8696 see also regcomp.h and pregfree() */
8699 case 'p': /* actually an AV, but the dup function is identical. */
8700 case 'u': /* actually an HV, but the dup function is identical. */
8701 d->data[i] = sv_dup_inc((SV *)ri->data->data[i], param);
8704 /* This is cheating. */
8705 Newx(d->data[i], 1, struct regnode_charclass_class);
8706 StructCopy(ri->data->data[i], d->data[i],
8707 struct regnode_charclass_class);
8708 reti->regstclass = (regnode*)d->data[i];
8711 /* Compiled op trees are readonly and in shared memory,
8712 and can thus be shared without duplication. */
8714 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
8718 /* Trie stclasses are readonly and can thus be shared
8719 * without duplication. We free the stclass in pregfree
8720 * when the corresponding reg_ac_data struct is freed.
8722 reti->regstclass= ri->regstclass;
8726 ((reg_trie_data*)ri->data->data[i])->refcount++;
8730 d->data[i] = ri->data->data[i];
8733 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
8742 Newx(reti->offsets, 2*len+1, U32);
8743 Copy(ri->offsets, reti->offsets, 2*len+1, U32);
8745 ret->precomp = SAVEPVN(r->precomp, r->prelen);
8746 ret->refcnt = r->refcnt;
8747 ret->minlen = r->minlen;
8748 ret->minlenret = r->minlenret;
8749 ret->prelen = r->prelen;
8750 ret->nparens = r->nparens;
8751 ret->lastparen = r->lastparen;
8752 ret->lastcloseparen = r->lastcloseparen;
8753 ret->intflags = r->intflags;
8754 ret->extflags = r->extflags;
8756 ret->sublen = r->sublen;
8758 ret->engine = r->engine;
8760 ret->paren_names = hv_dup_inc(r->paren_names, param);
8762 if (RX_MATCH_COPIED(ret))
8763 ret->subbeg = SAVEPVN(r->subbeg, r->sublen);
8766 #ifdef PERL_OLD_COPY_ON_WRITE
8767 ret->saved_copy = NULL;
8770 ptr_table_store(PL_ptr_table, r, ret);
8778 converts a regexp embedded in a MAGIC struct to its stringified form,
8779 caching the converted form in the struct and returns the cached
8782 If lp is nonnull then it is used to return the length of the
8785 If flags is nonnull and the returned string contains UTF8 then
8786 (flags & 1) will be true.
8788 If haseval is nonnull then it is used to return whether the pattern
8791 Normally called via macro:
8793 CALLREG_STRINGIFY(mg,0,0);
8797 CALLREG_AS_STR(mg,lp,flags,haseval)
8799 See sv_2pv_flags() in sv.c for an example of internal usage.
8804 Perl_reg_stringify(pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval ) {
8806 const regexp * const re = (regexp *)mg->mg_obj;
8807 RXi_GET_DECL(re,ri);
8810 const char *fptr = "msix";
8815 bool need_newline = 0;
8816 U16 reganch = (U16)((re->extflags & RXf_PMf_COMPILETIME) >> 12);
8818 while((ch = *fptr++)) {
8820 reflags[left++] = ch;
8823 reflags[right--] = ch;
8828 reflags[left] = '-';
8832 mg->mg_len = re->prelen + 4 + left;
8834 * If /x was used, we have to worry about a regex ending with a
8835 * comment later being embedded within another regex. If so, we don't
8836 * want this regex's "commentization" to leak out to the right part of
8837 * the enclosing regex, we must cap it with a newline.
8839 * So, if /x was used, we scan backwards from the end of the regex. If
8840 * we find a '#' before we find a newline, we need to add a newline
8841 * ourself. If we find a '\n' first (or if we don't find '#' or '\n'),
8842 * we don't need to add anything. -jfriedl
8844 if (PMf_EXTENDED & re->extflags) {
8845 const char *endptr = re->precomp + re->prelen;
8846 while (endptr >= re->precomp) {
8847 const char c = *(endptr--);
8849 break; /* don't need another */
8851 /* we end while in a comment, so we need a newline */
8852 mg->mg_len++; /* save space for it */
8853 need_newline = 1; /* note to add it */
8859 Newx(mg->mg_ptr, mg->mg_len + 1 + left, char);
8860 mg->mg_ptr[0] = '(';
8861 mg->mg_ptr[1] = '?';
8862 Copy(reflags, mg->mg_ptr+2, left, char);
8863 *(mg->mg_ptr+left+2) = ':';
8864 Copy(re->precomp, mg->mg_ptr+3+left, re->prelen, char);
8866 mg->mg_ptr[mg->mg_len - 2] = '\n';
8867 mg->mg_ptr[mg->mg_len - 1] = ')';
8868 mg->mg_ptr[mg->mg_len] = 0;
8871 *haseval = ri->program[0].next_off;
8873 *flags = ((re->extflags & RXf_UTF8) ? 1 : 0);
8881 #ifndef PERL_IN_XSUB_RE
8883 - regnext - dig the "next" pointer out of a node
8886 Perl_regnext(pTHX_ register regnode *p)
8889 register I32 offset;
8894 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
8903 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
8906 STRLEN l1 = strlen(pat1);
8907 STRLEN l2 = strlen(pat2);
8910 const char *message;
8916 Copy(pat1, buf, l1 , char);
8917 Copy(pat2, buf + l1, l2 , char);
8918 buf[l1 + l2] = '\n';
8919 buf[l1 + l2 + 1] = '\0';
8921 /* ANSI variant takes additional second argument */
8922 va_start(args, pat2);
8926 msv = vmess(buf, &args);
8928 message = SvPV_const(msv,l1);
8931 Copy(message, buf, l1 , char);
8932 buf[l1-1] = '\0'; /* Overwrite \n */
8933 Perl_croak(aTHX_ "%s", buf);
8936 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
8938 #ifndef PERL_IN_XSUB_RE
8940 Perl_save_re_context(pTHX)
8944 struct re_save_state *state;
8946 SAVEVPTR(PL_curcop);
8947 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
8949 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
8950 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
8951 SSPUSHINT(SAVEt_RE_STATE);
8953 Copy(&PL_reg_state, state, 1, struct re_save_state);
8955 PL_reg_start_tmp = 0;
8956 PL_reg_start_tmpl = 0;
8957 PL_reg_oldsaved = NULL;
8958 PL_reg_oldsavedlen = 0;
8960 PL_reg_leftiter = 0;
8961 PL_reg_poscache = NULL;
8962 PL_reg_poscache_size = 0;
8963 #ifdef PERL_OLD_COPY_ON_WRITE
8967 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
8969 const REGEXP * const rx = PM_GETRE(PL_curpm);
8972 for (i = 1; i <= rx->nparens; i++) {
8973 char digits[TYPE_CHARS(long)];
8974 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
8975 GV *const *const gvp
8976 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
8979 GV * const gv = *gvp;
8980 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
8990 clear_re(pTHX_ void *r)
8993 ReREFCNT_dec((regexp *)r);
8999 S_put_byte(pTHX_ SV *sv, int c)
9001 if (isCNTRL(c) || c == 255 || !isPRINT(c))
9002 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
9003 else if (c == '-' || c == ']' || c == '\\' || c == '^')
9004 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
9006 Perl_sv_catpvf(aTHX_ sv, "%c", c);
9010 #define CLEAR_OPTSTART \
9011 if (optstart) STMT_START { \
9012 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
9016 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
9018 STATIC const regnode *
9019 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
9020 const regnode *last, const regnode *plast,
9021 SV* sv, I32 indent, U32 depth)
9024 register U8 op = PSEUDO; /* Arbitrary non-END op. */
9025 register const regnode *next;
9026 const regnode *optstart= NULL;
9028 GET_RE_DEBUG_FLAGS_DECL;
9030 #ifdef DEBUG_DUMPUNTIL
9031 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
9032 last ? last-start : 0,plast ? plast-start : 0);
9035 if (plast && plast < last)
9038 while (PL_regkind[op] != END && (!last || node < last)) {
9039 /* While that wasn't END last time... */
9043 if (op == CLOSE || op == WHILEM)
9045 next = regnext((regnode *)node);
9048 if (OP(node) == OPTIMIZED) {
9049 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
9056 regprop(r, sv, node);
9057 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
9058 (int)(2*indent + 1), "", SvPVX_const(sv));
9060 if (OP(node) != OPTIMIZED) {
9061 if (next == NULL) /* Next ptr. */
9062 PerlIO_printf(Perl_debug_log, "(0)");
9063 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
9064 PerlIO_printf(Perl_debug_log, "(FAIL)");
9066 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
9068 /*if (PL_regkind[(U8)op] != TRIE)*/
9069 (void)PerlIO_putc(Perl_debug_log, '\n');
9073 if (PL_regkind[(U8)op] == BRANCHJ) {
9076 register const regnode *nnode = (OP(next) == LONGJMP
9077 ? regnext((regnode *)next)
9079 if (last && nnode > last)
9081 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
9084 else if (PL_regkind[(U8)op] == BRANCH) {
9086 DUMPUNTIL(NEXTOPER(node), next);
9088 else if ( PL_regkind[(U8)op] == TRIE ) {
9089 const regnode *this_trie = node;
9090 const char op = OP(node);
9091 const I32 n = ARG(node);
9092 const reg_ac_data * const ac = op>=AHOCORASICK ?
9093 (reg_ac_data *)ri->data->data[n] :
9095 const reg_trie_data * const trie =
9096 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
9098 AV *const trie_words = (AV *) ri->data->data[n + TRIE_WORDS_OFFSET];
9100 const regnode *nextbranch= NULL;
9102 sv_setpvn(sv, "", 0);
9103 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
9104 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
9106 PerlIO_printf(Perl_debug_log, "%*s%s ",
9107 (int)(2*(indent+3)), "",
9108 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
9109 PL_colors[0], PL_colors[1],
9110 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
9111 PERL_PV_PRETTY_ELIPSES |
9117 U16 dist= trie->jump[word_idx+1];
9118 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
9119 (UV)((dist ? this_trie + dist : next) - start));
9122 nextbranch= this_trie + trie->jump[0];
9123 DUMPUNTIL(this_trie + dist, nextbranch);
9125 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
9126 nextbranch= regnext((regnode *)nextbranch);
9128 PerlIO_printf(Perl_debug_log, "\n");
9131 if (last && next > last)
9136 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
9137 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
9138 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
9140 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
9142 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
9144 else if ( op == PLUS || op == STAR) {
9145 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
9147 else if (op == ANYOF) {
9148 /* arglen 1 + class block */
9149 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
9150 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
9151 node = NEXTOPER(node);
9153 else if (PL_regkind[(U8)op] == EXACT) {
9154 /* Literal string, where present. */
9155 node += NODE_SZ_STR(node) - 1;
9156 node = NEXTOPER(node);
9159 node = NEXTOPER(node);
9160 node += regarglen[(U8)op];
9162 if (op == CURLYX || op == OPEN)
9166 #ifdef DEBUG_DUMPUNTIL
9167 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
9172 #endif /* DEBUGGING */
9176 * c-indentation-style: bsd
9178 * indent-tabs-mode: t
9181 * ex: set ts=8 sts=4 sw=4 noet: