5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
8 /* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
33 /* need to replace pregcomp et al, so enable that */
34 # ifndef PERL_IN_XSUB_RE
35 # define PERL_IN_XSUB_RE
37 /* need access to debugger hooks */
38 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
43 #ifdef PERL_IN_XSUB_RE
44 /* We *really* need to overwrite these symbols: */
45 # define Perl_pregcomp my_regcomp
46 # define Perl_regdump my_regdump
47 # define Perl_regprop my_regprop
48 # define Perl_pregfree my_regfree
49 # define Perl_re_intuit_string my_re_intuit_string
50 /* *These* symbols are masked to allow static link. */
51 # define Perl_regnext my_regnext
52 # define Perl_save_re_context my_save_re_context
53 # define Perl_reginitcolors my_reginitcolors
55 # define PERL_NO_GET_CONTEXT
60 * pregcomp and pregexec -- regsub and regerror are not used in perl
62 * Copyright (c) 1986 by University of Toronto.
63 * Written by Henry Spencer. Not derived from licensed software.
65 * Permission is granted to anyone to use this software for any
66 * purpose on any computer system, and to redistribute it freely,
67 * subject to the following restrictions:
69 * 1. The author is not responsible for the consequences of use of
70 * this software, no matter how awful, even if they arise
73 * 2. The origin of this software must not be misrepresented, either
74 * by explicit claim or by omission.
76 * 3. Altered versions must be plainly marked as such, and must not
77 * be misrepresented as being the original software.
80 **** Alterations to Henry's code are...
82 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
83 **** 2000, 2001, 2002, 2003, 2004, 2005, by Larry Wall and others
85 **** You may distribute under the terms of either the GNU General Public
86 **** License or the Artistic License, as specified in the README file.
89 * Beware that some of this code is subtly aware of the way operator
90 * precedence is structured in regular expressions. Serious changes in
91 * regular-expression syntax might require a total rethink.
94 #define PERL_IN_REGCOMP_C
97 #ifndef PERL_IN_XSUB_RE
109 # if defined(BUGGY_MSC6)
110 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
111 # pragma optimize("a",off)
112 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
113 # pragma optimize("w",on )
114 # endif /* BUGGY_MSC6 */
118 #define STATIC static
121 typedef struct RExC_state_t {
122 U32 flags; /* are we folding, multilining? */
123 char *precomp; /* uncompiled string. */
125 char *start; /* Start of input for compile */
126 char *end; /* End of input for compile */
127 char *parse; /* Input-scan pointer. */
128 I32 whilem_seen; /* number of WHILEM in this expr */
129 regnode *emit_start; /* Start of emitted-code area */
130 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
131 I32 naughty; /* How bad is this pattern? */
132 I32 sawback; /* Did we see \1, ...? */
134 I32 size; /* Code size. */
135 I32 npar; /* () count. */
141 char *starttry; /* -Dr: where regtry was called. */
142 #define RExC_starttry (pRExC_state->starttry)
146 #define RExC_flags (pRExC_state->flags)
147 #define RExC_precomp (pRExC_state->precomp)
148 #define RExC_rx (pRExC_state->rx)
149 #define RExC_start (pRExC_state->start)
150 #define RExC_end (pRExC_state->end)
151 #define RExC_parse (pRExC_state->parse)
152 #define RExC_whilem_seen (pRExC_state->whilem_seen)
153 #define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
154 #define RExC_emit (pRExC_state->emit)
155 #define RExC_emit_start (pRExC_state->emit_start)
156 #define RExC_naughty (pRExC_state->naughty)
157 #define RExC_sawback (pRExC_state->sawback)
158 #define RExC_seen (pRExC_state->seen)
159 #define RExC_size (pRExC_state->size)
160 #define RExC_npar (pRExC_state->npar)
161 #define RExC_extralen (pRExC_state->extralen)
162 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
163 #define RExC_seen_evals (pRExC_state->seen_evals)
164 #define RExC_utf8 (pRExC_state->utf8)
166 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
167 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
168 ((*s) == '{' && regcurly(s)))
171 #undef SPSTART /* dratted cpp namespace... */
174 * Flags to be passed up and down.
176 #define WORST 0 /* Worst case. */
177 #define HASWIDTH 0x1 /* Known to match non-null strings. */
178 #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
179 #define SPSTART 0x4 /* Starts with * or +. */
180 #define TRYAGAIN 0x8 /* Weeded out a declaration. */
182 /* Length of a variant. */
184 typedef struct scan_data_t {
190 I32 last_end; /* min value, <0 unless valid. */
193 SV **longest; /* Either &l_fixed, or &l_float. */
197 I32 offset_float_min;
198 I32 offset_float_max;
202 struct regnode_charclass_class *start_class;
206 * Forward declarations for pregcomp()'s friends.
209 static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
212 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
213 #define SF_BEFORE_SEOL 0x1
214 #define SF_BEFORE_MEOL 0x2
215 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
216 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
219 # define SF_FIX_SHIFT_EOL (0+2)
220 # define SF_FL_SHIFT_EOL (0+4)
222 # define SF_FIX_SHIFT_EOL (+2)
223 # define SF_FL_SHIFT_EOL (+4)
226 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
227 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
229 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
230 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
231 #define SF_IS_INF 0x40
232 #define SF_HAS_PAR 0x80
233 #define SF_IN_PAR 0x100
234 #define SF_HAS_EVAL 0x200
235 #define SCF_DO_SUBSTR 0x400
236 #define SCF_DO_STCLASS_AND 0x0800
237 #define SCF_DO_STCLASS_OR 0x1000
238 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
239 #define SCF_WHILEM_VISITED_POS 0x2000
241 #define UTF (RExC_utf8 != 0)
242 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
243 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
245 #define OOB_UNICODE 12345678
246 #define OOB_NAMEDCLASS -1
248 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
249 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
252 /* length of regex to show in messages that don't mark a position within */
253 #define RegexLengthToShowInErrorMessages 127
256 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
257 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
258 * op/pragma/warn/regcomp.
260 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
261 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
263 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
266 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
267 * arg. Show regex, up to a maximum length. If it's too long, chop and add
270 #define FAIL(msg) STMT_START { \
271 const char *ellipses = ""; \
272 IV len = RExC_end - RExC_precomp; \
275 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
276 if (len > RegexLengthToShowInErrorMessages) { \
277 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
278 len = RegexLengthToShowInErrorMessages - 10; \
281 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
282 msg, (int)len, RExC_precomp, ellipses); \
286 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
287 * args. Show regex, up to a maximum length. If it's too long, chop and add
290 #define FAIL2(pat,msg) STMT_START { \
291 const char *ellipses = ""; \
292 IV len = RExC_end - RExC_precomp; \
295 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
296 if (len > RegexLengthToShowInErrorMessages) { \
297 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
298 len = RegexLengthToShowInErrorMessages - 10; \
301 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
302 msg, (int)len, RExC_precomp, ellipses); \
307 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
309 #define Simple_vFAIL(m) STMT_START { \
310 IV offset = RExC_parse - RExC_precomp; \
311 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
312 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
316 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
318 #define vFAIL(m) STMT_START { \
320 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
325 * Like Simple_vFAIL(), but accepts two arguments.
327 #define Simple_vFAIL2(m,a1) STMT_START { \
328 IV offset = RExC_parse - RExC_precomp; \
329 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
330 (int)offset, RExC_precomp, RExC_precomp + offset); \
334 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
336 #define vFAIL2(m,a1) STMT_START { \
338 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
339 Simple_vFAIL2(m, a1); \
344 * Like Simple_vFAIL(), but accepts three arguments.
346 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
347 IV offset = RExC_parse - RExC_precomp; \
348 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
349 (int)offset, RExC_precomp, RExC_precomp + offset); \
353 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
355 #define vFAIL3(m,a1,a2) STMT_START { \
357 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
358 Simple_vFAIL3(m, a1, a2); \
362 * Like Simple_vFAIL(), but accepts four arguments.
364 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
365 IV offset = RExC_parse - RExC_precomp; \
366 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
367 (int)offset, RExC_precomp, RExC_precomp + offset); \
371 * Like Simple_vFAIL(), but accepts five arguments.
373 #define Simple_vFAIL5(m, a1, a2, a3, a4) STMT_START { \
374 IV offset = RExC_parse - RExC_precomp; \
375 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4, \
376 (int)offset, RExC_precomp, RExC_precomp + offset); \
380 #define vWARN(loc,m) STMT_START { \
381 IV offset = loc - RExC_precomp; \
382 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
383 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
386 #define vWARNdep(loc,m) STMT_START { \
387 IV offset = loc - RExC_precomp; \
388 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
389 "%s" REPORT_LOCATION, \
390 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
394 #define vWARN2(loc, m, a1) STMT_START { \
395 IV offset = loc - RExC_precomp; \
396 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
397 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
400 #define vWARN3(loc, m, a1, a2) STMT_START { \
401 IV offset = loc - RExC_precomp; \
402 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
403 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
406 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
407 IV offset = loc - RExC_precomp; \
408 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
409 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
412 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
413 IV offset = loc - RExC_precomp; \
414 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
415 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
419 /* Allow for side effects in s */
420 #define REGC(c,s) STMT_START { \
421 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
424 /* Macros for recording node offsets. 20001227 mjd@plover.com
425 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
426 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
427 * Element 0 holds the number n.
430 #define MJD_OFFSET_DEBUG(x)
431 /* #define MJD_OFFSET_DEBUG(x) DEBUG_r(Perl_warn_nocontext x) */
434 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
436 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
437 __LINE__, (node), (byte))); \
439 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
441 RExC_offsets[2*(node)-1] = (byte); \
446 #define Set_Node_Offset(node,byte) \
447 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
448 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
450 #define Set_Node_Length_To_R(node,len) STMT_START { \
452 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
453 __LINE__, (node), (len))); \
455 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
457 RExC_offsets[2*(node)] = (len); \
462 #define Set_Node_Length(node,len) \
463 Set_Node_Length_To_R((node)-RExC_emit_start, len)
464 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
465 #define Set_Node_Cur_Length(node) \
466 Set_Node_Length(node, RExC_parse - parse_start)
468 /* Get offsets and lengths */
469 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
470 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
472 static void clear_re(pTHX_ void *r);
474 /* Mark that we cannot extend a found fixed substring at this point.
475 Updata the longest found anchored substring and the longest found
476 floating substrings if needed. */
479 S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
481 const STRLEN l = CHR_SVLEN(data->last_found);
482 const STRLEN old_l = CHR_SVLEN(*data->longest);
484 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
485 SvSetMagicSV(*data->longest, data->last_found);
486 if (*data->longest == data->longest_fixed) {
487 data->offset_fixed = l ? data->last_start_min : data->pos_min;
488 if (data->flags & SF_BEFORE_EOL)
490 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
492 data->flags &= ~SF_FIX_BEFORE_EOL;
495 data->offset_float_min = l ? data->last_start_min : data->pos_min;
496 data->offset_float_max = (l
497 ? data->last_start_max
498 : data->pos_min + data->pos_delta);
499 if ((U32)data->offset_float_max > (U32)I32_MAX)
500 data->offset_float_max = I32_MAX;
501 if (data->flags & SF_BEFORE_EOL)
503 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
505 data->flags &= ~SF_FL_BEFORE_EOL;
508 SvCUR_set(data->last_found, 0);
510 SV * sv = data->last_found;
512 SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL;
513 if (mg && mg->mg_len > 0)
517 data->flags &= ~SF_BEFORE_EOL;
520 /* Can match anything (initialization) */
522 S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
524 ANYOF_CLASS_ZERO(cl);
525 ANYOF_BITMAP_SETALL(cl);
526 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
528 cl->flags |= ANYOF_LOCALE;
531 /* Can match anything (initialization) */
533 S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
537 for (value = 0; value <= ANYOF_MAX; value += 2)
538 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
540 if (!(cl->flags & ANYOF_UNICODE_ALL))
542 if (!ANYOF_BITMAP_TESTALLSET(cl))
547 /* Can match anything (initialization) */
549 S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
551 Zero(cl, 1, struct regnode_charclass_class);
553 cl_anything(pRExC_state, cl);
557 S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
559 Zero(cl, 1, struct regnode_charclass_class);
561 cl_anything(pRExC_state, cl);
563 cl->flags |= ANYOF_LOCALE;
566 /* 'And' a given class with another one. Can create false positives */
567 /* We assume that cl is not inverted */
569 S_cl_and(pTHX_ struct regnode_charclass_class *cl,
570 struct regnode_charclass_class *and_with)
572 if (!(and_with->flags & ANYOF_CLASS)
573 && !(cl->flags & ANYOF_CLASS)
574 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
575 && !(and_with->flags & ANYOF_FOLD)
576 && !(cl->flags & ANYOF_FOLD)) {
579 if (and_with->flags & ANYOF_INVERT)
580 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
581 cl->bitmap[i] &= ~and_with->bitmap[i];
583 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
584 cl->bitmap[i] &= and_with->bitmap[i];
585 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
586 if (!(and_with->flags & ANYOF_EOS))
587 cl->flags &= ~ANYOF_EOS;
589 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
590 !(and_with->flags & ANYOF_INVERT)) {
591 cl->flags &= ~ANYOF_UNICODE_ALL;
592 cl->flags |= ANYOF_UNICODE;
593 ARG_SET(cl, ARG(and_with));
595 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
596 !(and_with->flags & ANYOF_INVERT))
597 cl->flags &= ~ANYOF_UNICODE_ALL;
598 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
599 !(and_with->flags & ANYOF_INVERT))
600 cl->flags &= ~ANYOF_UNICODE;
603 /* 'OR' a given class with another one. Can create false positives */
604 /* We assume that cl is not inverted */
606 S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
608 if (or_with->flags & ANYOF_INVERT) {
610 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
611 * <= (B1 | !B2) | (CL1 | !CL2)
612 * which is wasteful if CL2 is small, but we ignore CL2:
613 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
614 * XXXX Can we handle case-fold? Unclear:
615 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
616 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
618 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
619 && !(or_with->flags & ANYOF_FOLD)
620 && !(cl->flags & ANYOF_FOLD) ) {
623 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
624 cl->bitmap[i] |= ~or_with->bitmap[i];
625 } /* XXXX: logic is complicated otherwise */
627 cl_anything(pRExC_state, cl);
630 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
631 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
632 && (!(or_with->flags & ANYOF_FOLD)
633 || (cl->flags & ANYOF_FOLD)) ) {
636 /* OR char bitmap and class bitmap separately */
637 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
638 cl->bitmap[i] |= or_with->bitmap[i];
639 if (or_with->flags & ANYOF_CLASS) {
640 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
641 cl->classflags[i] |= or_with->classflags[i];
642 cl->flags |= ANYOF_CLASS;
645 else { /* XXXX: logic is complicated, leave it along for a moment. */
646 cl_anything(pRExC_state, cl);
649 if (or_with->flags & ANYOF_EOS)
650 cl->flags |= ANYOF_EOS;
652 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
653 ARG(cl) != ARG(or_with)) {
654 cl->flags |= ANYOF_UNICODE_ALL;
655 cl->flags &= ~ANYOF_UNICODE;
657 if (or_with->flags & ANYOF_UNICODE_ALL) {
658 cl->flags |= ANYOF_UNICODE_ALL;
659 cl->flags &= ~ANYOF_UNICODE;
665 make_trie(startbranch,first,last,tail,flags)
666 startbranch: the first branch in the whole branch sequence
667 first : start branch of sequence of branch-exact nodes.
668 May be the same as startbranch
669 last : Thing following the last branch.
670 May be the same as tail.
671 tail : item following the branch sequence
672 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
674 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
676 A trie is an N'ary tree where the branches are determined by digital
677 decomposition of the key. IE, at the root node you look up the 1st character and
678 follow that branch repeat until you find the end of the branches. Nodes can be
679 marked as "accepting" meaning they represent a complete word. Eg:
683 would convert into the following structure. Numbers represent states, letters
684 following numbers represent valid transitions on the letter from that state, if
685 the number is in square brackets it represents an accepting state, otherwise it
686 will be in parenthesis.
688 +-h->+-e->[3]-+-r->(8)-+-s->[9]
692 (1) +-i->(6)-+-s->[7]
694 +-s->(3)-+-h->(4)-+-e->[5]
696 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
698 This shows that when matching against the string 'hers' we will begin at state 1
699 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
700 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
701 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
702 single traverse. We store a mapping from accepting to state to which word was
703 matched, and then when we have multiple possibilities we try to complete the
704 rest of the regex in the order in which they occured in the alternation.
706 The only prior NFA like behaviour that would be changed by the TRIE support is
707 the silent ignoring of duplicate alternations which are of the form:
709 / (DUPE|DUPE) X? (?{ ... }) Y /x
711 Thus EVAL blocks follwing a trie may be called a different number of times with
712 and without the optimisation. With the optimisations dupes will be silently
713 ignored. This inconsistant behaviour of EVAL type nodes is well established as
714 the following demonstrates:
716 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
718 which prints out 'word' three times, but
720 'words'=~/(word|word|word)(?{ print $1 })S/
722 which doesnt print it out at all. This is due to other optimisations kicking in.
724 Example of what happens on a structural level:
726 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
728 1: CURLYM[1] {1,32767}(18)
739 This would be optimizable with startbranch=5, first=5, last=16, tail=16
740 and should turn into:
742 1: CURLYM[1] {1,32767}(18)
744 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
752 Cases where tail != last would be like /(?foo|bar)baz/:
762 which would be optimizable with startbranch=1, first=1, last=7, tail=8
763 and would end up looking like:
766 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
775 #define TRIE_DEBUG_CHAR \
776 DEBUG_TRIE_COMPILE_r({ \
779 tmp = newSVpv( "", 0 ); \
780 pv_uni_display( tmp, uc, len, 60, UNI_DISPLAY_REGEX ); \
782 tmp = Perl_newSVpvf_nocontext( "%c", uvc ); \
784 av_push( trie->revcharmap, tmp ); \
787 #define TRIE_READ_CHAR STMT_START { \
790 if ( foldlen > 0 ) { \
791 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
796 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
797 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
798 foldlen -= UNISKIP( uvc ); \
799 scan = foldbuf + UNISKIP( uvc ); \
802 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
811 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
812 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
813 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
814 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
816 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
817 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
818 TRIE_LIST_LEN( state ) *= 2; \
819 Renew( trie->states[ state ].trans.list, \
820 TRIE_LIST_LEN( state ), reg_trie_trans_le ); \
822 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
823 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
824 TRIE_LIST_CUR( state )++; \
827 #define TRIE_LIST_NEW(state) STMT_START { \
828 Newz( 1023, trie->states[ state ].trans.list, \
829 4, reg_trie_trans_le ); \
830 TRIE_LIST_CUR( state ) = 1; \
831 TRIE_LIST_LEN( state ) = 4; \
835 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 flags)
837 /* first pass, loop through and scan words */
840 const U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
845 /* we just use folder as a flag in utf8 */
846 const U8 * const folder = ( flags == EXACTF
854 const U32 data_slot = add_data( pRExC_state, 1, "t" );
857 GET_RE_DEBUG_FLAGS_DECL;
859 Newz( 848200, trie, 1, reg_trie_data );
861 RExC_rx->data->data[ data_slot ] = (void*)trie;
862 Newz( 848201, trie->charmap, 256, U16 );
864 trie->words = newAV();
865 trie->revcharmap = newAV();
869 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
870 if (!SvIOK(re_trie_maxbuff)) {
871 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
874 /* -- First loop and Setup --
876 We first traverse the branches and scan each word to determine if it
877 contains widechars, and how many unique chars there are, this is
878 important as we have to build a table with at least as many columns as we
881 We use an array of integers to represent the character codes 0..255
882 (trie->charmap) and we use a an HV* to store unicode characters. We use the
883 native representation of the character value as the key and IV's for the
886 *TODO* If we keep track of how many times each character is used we can
887 remap the columns so that the table compression later on is more
888 efficient in terms of memory by ensuring most common value is in the
889 middle and the least common are on the outside. IMO this would be better
890 than a most to least common mapping as theres a decent chance the most
891 common letter will share a node with the least common, meaning the node
892 will not be compressable. With a middle is most common approach the worst
893 case is when we have the least common nodes twice.
898 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
899 regnode *noper = NEXTOPER( cur );
900 const U8 *uc = (U8*)STRING( noper );
901 const U8 *e = uc + STR_LEN( noper );
903 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
906 for ( ; uc < e ; uc += len ) {
910 if ( !trie->charmap[ uvc ] ) {
911 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
913 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
918 if ( !trie->widecharmap )
919 trie->widecharmap = newHV();
921 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
924 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%X", uvc );
926 if ( !SvTRUE( *svpp ) ) {
927 sv_setiv( *svpp, ++trie->uniquecharcount );
933 } /* end first pass */
934 DEBUG_TRIE_COMPILE_r(
935 PerlIO_printf( Perl_debug_log, "TRIE(%s): W:%d C:%d Uq:%d \n",
936 ( trie->widecharmap ? "UTF8" : "NATIVE" ), trie->wordcount,
937 trie->charcount, trie->uniquecharcount )
942 We now know what we are dealing with in terms of unique chars and
943 string sizes so we can calculate how much memory a naive
944 representation using a flat table will take. If it's over a reasonable
945 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
946 conservative but potentially much slower representation using an array
949 At the end we convert both representations into the same compressed
950 form that will be used in regexec.c for matching with. The latter
951 is a form that cannot be used to construct with but has memory
952 properties similar to the list form and access properties similar
953 to the table form making it both suitable for fast searches and
954 small enough that its feasable to store for the duration of a program.
956 See the comment in the code where the compressed table is produced
957 inplace from the flat tabe representation for an explanation of how
958 the compression works.
963 if ( (IV)( ( trie->charcount + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
965 Second Pass -- Array Of Lists Representation
967 Each state will be represented by a list of charid:state records
968 (reg_trie_trans_le) the first such element holds the CUR and LEN
969 points of the allocated array. (See defines above).
971 We build the initial structure using the lists, and then convert
972 it into the compressed table form which allows faster lookups
973 (but cant be modified once converted).
979 STRLEN transcount = 1;
981 Newz( 848204, trie->states, trie->charcount + 2, reg_trie_state );
985 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
987 regnode *noper = NEXTOPER( cur );
988 U8 *uc = (U8*)STRING( noper );
989 U8 *e = uc + STR_LEN( noper );
990 U32 state = 1; /* required init */
991 U16 charid = 0; /* sanity init */
992 U8 *scan = (U8*)NULL; /* sanity init */
993 STRLEN foldlen = 0; /* required init */
994 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
997 for ( ; uc < e ; uc += len ) {
1002 charid = trie->charmap[ uvc ];
1004 SV** svpp=(SV**)NULL;
1005 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1009 charid=(U16)SvIV( *svpp );
1018 if ( !trie->states[ state ].trans.list ) {
1019 TRIE_LIST_NEW( state );
1021 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1022 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1023 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1028 newstate = next_alloc++;
1029 TRIE_LIST_PUSH( state, charid, newstate );
1035 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %d", uvc );
1037 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1040 if ( !trie->states[ state ].wordnum ) {
1041 /* we havent inserted this word into the structure yet. */
1042 trie->states[ state ].wordnum = ++curword;
1045 /* store the word for dumping */
1046 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
1047 if ( UTF ) SvUTF8_on( tmp );
1048 av_push( trie->words, tmp );
1052 /* Its a dupe. So ignore it. */
1055 } /* end second pass */
1057 trie->laststate = next_alloc;
1058 Renew( trie->states, next_alloc, reg_trie_state );
1060 DEBUG_TRIE_COMPILE_MORE_r({
1065 print out the table precompression.
1068 PerlIO_printf( Perl_debug_log, "\nState :Word | Transition Data\n" );
1069 PerlIO_printf( Perl_debug_log, "------:-----+-----------------" );
1071 for( state=1 ; state < next_alloc ; state ++ ) {
1073 PerlIO_printf( Perl_debug_log, "\n %04X :", state );
1074 if ( ! trie->states[ state ].wordnum ) {
1075 PerlIO_printf( Perl_debug_log, "%5s| ","");
1077 PerlIO_printf( Perl_debug_log, "W%04X| ",
1078 trie->states[ state ].wordnum
1081 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1082 SV **tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1083 PerlIO_printf( Perl_debug_log, "%s:%3X=%04X | ",
1085 TRIE_LIST_ITEM(state,charid).forid,
1086 TRIE_LIST_ITEM(state,charid).newstate
1091 PerlIO_printf( Perl_debug_log, "\n\n" );
1094 Newz( 848203, trie->trans, transcount ,reg_trie_trans );
1102 for( state=1 ; state < next_alloc ; state ++ ) {
1106 DEBUG_TRIE_COMPILE_MORE_r(
1107 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1111 if (trie->states[state].trans.list) {
1112 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1116 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1117 if ( TRIE_LIST_ITEM( state, idx).forid < minid ) {
1118 minid=TRIE_LIST_ITEM( state, idx).forid;
1119 } else if ( TRIE_LIST_ITEM( state, idx).forid > maxid ) {
1120 maxid=TRIE_LIST_ITEM( state, idx).forid;
1123 if ( transcount < tp + maxid - minid + 1) {
1125 Renew( trie->trans, transcount, reg_trie_trans );
1126 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1128 base = trie->uniquecharcount + tp - minid;
1129 if ( maxid == minid ) {
1131 for ( ; zp < tp ; zp++ ) {
1132 if ( ! trie->trans[ zp ].next ) {
1133 base = trie->uniquecharcount + zp - minid;
1134 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1135 trie->trans[ zp ].check = state;
1141 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1142 trie->trans[ tp ].check = state;
1147 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1148 U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1149 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1150 trie->trans[ tid ].check = state;
1152 tp += ( maxid - minid + 1 );
1154 Safefree(trie->states[ state ].trans.list);
1157 DEBUG_TRIE_COMPILE_MORE_r(
1158 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1161 trie->states[ state ].trans.base=base;
1163 trie->lasttrans = tp + 1;
1167 Second Pass -- Flat Table Representation.
1169 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1170 We know that we will need Charcount+1 trans at most to store the data
1171 (one row per char at worst case) So we preallocate both structures
1172 assuming worst case.
1174 We then construct the trie using only the .next slots of the entry
1177 We use the .check field of the first entry of the node temporarily to
1178 make compression both faster and easier by keeping track of how many non
1179 zero fields are in the node.
1181 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1184 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1185 number representing the first entry of the node, and state as a
1186 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1187 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1188 are 2 entrys per node. eg:
1196 The table is internally in the right hand, idx form. However as we also
1197 have to deal with the states array which is indexed by nodenum we have to
1198 use TRIE_NODENUM() to convert.
1202 Newz( 848203, trie->trans, ( trie->charcount + 1 ) * trie->uniquecharcount + 1,
1204 Newz( 848204, trie->states, trie->charcount + 2, reg_trie_state );
1205 next_alloc = trie->uniquecharcount + 1;
1207 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1209 regnode *noper = NEXTOPER( cur );
1210 U8 *uc = (U8*)STRING( noper );
1211 U8 *e = uc + STR_LEN( noper );
1213 U32 state = 1; /* required init */
1215 U16 charid = 0; /* sanity init */
1216 U32 accept_state = 0; /* sanity init */
1217 U8 *scan = (U8*)NULL; /* sanity init */
1219 STRLEN foldlen = 0; /* required init */
1220 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1223 for ( ; uc < e ; uc += len ) {
1228 charid = trie->charmap[ uvc ];
1230 SV** svpp=(SV**)NULL;
1231 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1235 charid=(U16)SvIV( *svpp );
1240 if ( !trie->trans[ state + charid ].next ) {
1241 trie->trans[ state + charid ].next = next_alloc;
1242 trie->trans[ state ].check++;
1243 next_alloc += trie->uniquecharcount;
1245 state = trie->trans[ state + charid ].next;
1247 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %d", uvc );
1249 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1252 accept_state = TRIE_NODENUM( state );
1253 if ( !trie->states[ accept_state ].wordnum ) {
1254 /* we havent inserted this word into the structure yet. */
1255 trie->states[ accept_state ].wordnum = ++curword;
1258 /* store the word for dumping */
1259 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
1260 if ( UTF ) SvUTF8_on( tmp );
1261 av_push( trie->words, tmp );
1265 /* Its a dupe. So ignore it. */
1268 } /* end second pass */
1270 DEBUG_TRIE_COMPILE_MORE_r({
1272 print out the table precompression so that we can do a visual check
1273 that they are identical.
1277 PerlIO_printf( Perl_debug_log, "\nChar : " );
1279 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1280 SV **tmp = av_fetch( trie->revcharmap, charid, 0);
1282 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen( *tmp ) );
1286 PerlIO_printf( Perl_debug_log, "\nState+-" );
1288 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1289 PerlIO_printf( Perl_debug_log, "%4s-", "----" );
1292 PerlIO_printf( Perl_debug_log, "\n" );
1294 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1296 PerlIO_printf( Perl_debug_log, "%04X : ", TRIE_NODENUM( state ) );
1298 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1299 PerlIO_printf( Perl_debug_log, "%04X ",
1300 SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ) );
1302 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1303 PerlIO_printf( Perl_debug_log, " (%04X)\n", trie->trans[ state ].check );
1305 PerlIO_printf( Perl_debug_log, " (%04X) W%04X\n", trie->trans[ state ].check,
1306 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1309 PerlIO_printf( Perl_debug_log, "\n\n" );
1313 * Inplace compress the table.*
1315 For sparse data sets the table constructed by the trie algorithm will
1316 be mostly 0/FAIL transitions or to put it another way mostly empty.
1317 (Note that leaf nodes will not contain any transitions.)
1319 This algorithm compresses the tables by eliminating most such
1320 transitions, at the cost of a modest bit of extra work during lookup:
1322 - Each states[] entry contains a .base field which indicates the
1323 index in the state[] array wheres its transition data is stored.
1325 - If .base is 0 there are no valid transitions from that node.
1327 - If .base is nonzero then charid is added to it to find an entry in
1330 -If trans[states[state].base+charid].check!=state then the
1331 transition is taken to be a 0/Fail transition. Thus if there are fail
1332 transitions at the front of the node then the .base offset will point
1333 somewhere inside the previous nodes data (or maybe even into a node
1334 even earlier), but the .check field determines if the transition is
1337 The following process inplace converts the table to the compressed
1338 table: We first do not compress the root node 1,and mark its all its
1339 .check pointers as 1 and set its .base pointer as 1 as well. This
1340 allows to do a DFA construction from the compressed table later, and
1341 ensures that any .base pointers we calculate later are greater than
1344 - We set 'pos' to indicate the first entry of the second node.
1346 - We then iterate over the columns of the node, finding the first and
1347 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1348 and set the .check pointers accordingly, and advance pos
1349 appropriately and repreat for the next node. Note that when we copy
1350 the next pointers we have to convert them from the original
1351 NODEIDX form to NODENUM form as the former is not valid post
1354 - If a node has no transitions used we mark its base as 0 and do not
1355 advance the pos pointer.
1357 - If a node only has one transition we use a second pointer into the
1358 structure to fill in allocated fail transitions from other states.
1359 This pointer is independent of the main pointer and scans forward
1360 looking for null transitions that are allocated to a state. When it
1361 finds one it writes the single transition into the "hole". If the
1362 pointer doesnt find one the single transition is appeneded as normal.
1364 - Once compressed we can Renew/realloc the structures to release the
1367 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1368 specifically Fig 3.47 and the associated pseudocode.
1372 U32 laststate = TRIE_NODENUM( next_alloc );
1373 U32 used , state, charid;
1375 trie->laststate = laststate;
1377 for ( state = 1 ; state < laststate ; state++ ) {
1379 U32 stateidx = TRIE_NODEIDX( state );
1380 U32 o_used=trie->trans[ stateidx ].check;
1381 used = trie->trans[ stateidx ].check;
1382 trie->trans[ stateidx ].check = 0;
1384 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1385 if ( flag || trie->trans[ stateidx + charid ].next ) {
1386 if ( trie->trans[ stateidx + charid ].next ) {
1388 for ( ; zp < pos ; zp++ ) {
1389 if ( ! trie->trans[ zp ].next ) {
1393 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1394 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1395 trie->trans[ zp ].check = state;
1396 if ( ++zp > pos ) pos = zp;
1403 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1405 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1406 trie->trans[ pos ].check = state;
1411 trie->lasttrans = pos + 1;
1412 Renew( trie->states, laststate + 1, reg_trie_state);
1413 DEBUG_TRIE_COMPILE_MORE_r(
1414 PerlIO_printf( Perl_debug_log, " Alloc: %d Orig: %d elements, Final:%d. Savings of %%%5.2f\n",
1415 ( ( trie->charcount + 1 ) * trie->uniquecharcount + 1 ), next_alloc, pos,
1416 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1419 } /* end table compress */
1421 /* resize the trans array to remove unused space */
1422 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
1424 DEBUG_TRIE_COMPILE_r({
1427 Now we print it out again, in a slightly different form as there is additional
1428 info we want to be able to see when its compressed. They are close enough for
1429 visual comparison though.
1431 PerlIO_printf( Perl_debug_log, "\nChar : %-6s%-6s%-4s ","Match","Base","Ofs" );
1433 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1434 SV **tmp = av_fetch( trie->revcharmap, state, 0);
1436 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen( *tmp ) );
1439 PerlIO_printf( Perl_debug_log, "\n-----:-----------------------");
1441 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1442 PerlIO_printf( Perl_debug_log, "-----");
1443 PerlIO_printf( Perl_debug_log, "\n");
1445 for( state = 1 ; state < trie->laststate ; state++ ) {
1446 U32 base = trie->states[ state ].trans.base;
1448 PerlIO_printf( Perl_debug_log, "#%04X ", state);
1450 if ( trie->states[ state ].wordnum ) {
1451 PerlIO_printf( Perl_debug_log, " W%04X", trie->states[ state ].wordnum );
1453 PerlIO_printf( Perl_debug_log, "%6s", "" );
1456 PerlIO_printf( Perl_debug_log, " @%04X ", base );
1461 while( ( base + ofs < trie->uniquecharcount ) ||
1462 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1463 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1466 PerlIO_printf( Perl_debug_log, "+%02X[ ", ofs);
1468 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1469 if ( ( base + ofs >= trie->uniquecharcount ) &&
1470 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1471 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1473 PerlIO_printf( Perl_debug_log, "%04X ",
1474 trie->trans[ base + ofs - trie->uniquecharcount ].next );
1476 PerlIO_printf( Perl_debug_log, "%4s "," 0" );
1480 PerlIO_printf( Perl_debug_log, "]", ofs);
1483 PerlIO_printf( Perl_debug_log, "\n" );
1488 /* now finally we "stitch in" the new TRIE node
1489 This means we convert either the first branch or the first Exact,
1490 depending on whether the thing following (in 'last') is a branch
1491 or not and whther first is the startbranch (ie is it a sub part of
1492 the alternation or is it the whole thing.)
1493 Assuming its a sub part we conver the EXACT otherwise we convert
1494 the whole branch sequence, including the first.
1501 if ( first == startbranch && OP( last ) != BRANCH ) {
1504 convert = NEXTOPER( first );
1505 NEXT_OFF( first ) = (U16)(last - first);
1508 OP( convert ) = TRIE + (U8)( flags - EXACT );
1509 NEXT_OFF( convert ) = (U16)(tail - convert);
1510 ARG_SET( convert, data_slot );
1512 /* tells us if we need to handle accept buffers specially */
1513 convert->flags = ( RExC_seen_evals ? 1 : 0 );
1516 /* needed for dumping*/
1518 regnode *optimize = convert + NODE_STEP_REGNODE + regarglen[ TRIE ];
1519 /* We now need to mark all of the space originally used by the
1520 branches as optimized away. This keeps the dumpuntil from
1521 throwing a wobbly as it doesnt use regnext() to traverse the
1524 while( optimize < last ) {
1525 OP( optimize ) = OPTIMIZED;
1529 } /* end node insert */
1536 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
1537 * These need to be revisited when a newer toolchain becomes available.
1539 #if defined(__sparc64__) && defined(__GNUC__)
1540 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
1541 # undef SPARC64_GCC_WORKAROUND
1542 # define SPARC64_GCC_WORKAROUND 1
1546 /* REx optimizer. Converts nodes into quickier variants "in place".
1547 Finds fixed substrings. */
1549 /* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
1550 to the position after last scanned or to NULL. */
1554 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags, U32 depth)
1555 /* scanp: Start here (read-write). */
1556 /* deltap: Write maxlen-minlen here. */
1557 /* last: Stop before this one. */
1559 I32 min = 0, pars = 0, code;
1560 regnode *scan = *scanp, *next;
1562 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
1563 int is_inf_internal = 0; /* The studied chunk is infinite */
1564 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
1565 scan_data_t data_fake;
1566 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
1567 SV *re_trie_maxbuff = NULL;
1569 GET_RE_DEBUG_FLAGS_DECL;
1571 while (scan && OP(scan) != END && scan < last) {
1572 /* Peephole optimizer: */
1574 SV *mysv=sv_newmortal();
1575 regprop( mysv, scan);
1576 PerlIO_printf(Perl_debug_log, "%*speep: %s (0x%08X)\n",depth*2,"",SvPV_nolen(mysv),scan);
1579 if (PL_regkind[(U8)OP(scan)] == EXACT) {
1580 /* Merge several consecutive EXACTish nodes into one. */
1581 regnode *n = regnext(scan);
1584 regnode *stop = scan;
1587 next = scan + NODE_SZ_STR(scan);
1588 /* Skip NOTHING, merge EXACT*. */
1590 ( PL_regkind[(U8)OP(n)] == NOTHING ||
1591 (stringok && (OP(n) == OP(scan))))
1593 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
1594 if (OP(n) == TAIL || n > next)
1596 if (PL_regkind[(U8)OP(n)] == NOTHING) {
1597 NEXT_OFF(scan) += NEXT_OFF(n);
1598 next = n + NODE_STEP_REGNODE;
1605 else if (stringok) {
1606 int oldl = STR_LEN(scan);
1607 regnode *nnext = regnext(n);
1609 if (oldl + STR_LEN(n) > U8_MAX)
1611 NEXT_OFF(scan) += NEXT_OFF(n);
1612 STR_LEN(scan) += STR_LEN(n);
1613 next = n + NODE_SZ_STR(n);
1614 /* Now we can overwrite *n : */
1615 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
1623 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
1625 Two problematic code points in Unicode casefolding of EXACT nodes:
1627 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
1628 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
1634 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
1635 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
1637 This means that in case-insensitive matching (or "loose matching",
1638 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
1639 length of the above casefolded versions) can match a target string
1640 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
1641 This would rather mess up the minimum length computation.
1643 What we'll do is to look for the tail four bytes, and then peek
1644 at the preceding two bytes to see whether we need to decrease
1645 the minimum length by four (six minus two).
1647 Thanks to the design of UTF-8, there cannot be false matches:
1648 A sequence of valid UTF-8 bytes cannot be a subsequence of
1649 another valid sequence of UTF-8 bytes.
1652 char *s0 = STRING(scan), *s, *t;
1653 char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
1654 const char *t0 = "\xcc\x88\xcc\x81";
1655 const char *t1 = t0 + 3;
1658 s < s2 && (t = ninstr(s, s1, t0, t1));
1660 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
1661 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
1668 n = scan + NODE_SZ_STR(scan);
1670 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
1681 /* Follow the next-chain of the current node and optimize
1682 away all the NOTHINGs from it. */
1683 if (OP(scan) != CURLYX) {
1684 int max = (reg_off_by_arg[OP(scan)]
1686 /* I32 may be smaller than U16 on CRAYs! */
1687 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
1688 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
1692 /* Skip NOTHING and LONGJMP. */
1693 while ((n = regnext(n))
1694 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
1695 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
1696 && off + noff < max)
1698 if (reg_off_by_arg[OP(scan)])
1701 NEXT_OFF(scan) = off;
1704 /* The principal pseudo-switch. Cannot be a switch, since we
1705 look into several different things. */
1706 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
1707 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
1708 next = regnext(scan);
1710 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
1712 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
1713 I32 max1 = 0, min1 = I32_MAX, num = 0;
1714 struct regnode_charclass_class accum;
1715 regnode *startbranch=scan;
1717 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
1718 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
1719 if (flags & SCF_DO_STCLASS)
1720 cl_init_zero(pRExC_state, &accum);
1722 while (OP(scan) == code) {
1723 I32 deltanext, minnext, f = 0, fake;
1724 struct regnode_charclass_class this_class;
1727 data_fake.flags = 0;
1729 data_fake.whilem_c = data->whilem_c;
1730 data_fake.last_closep = data->last_closep;
1733 data_fake.last_closep = &fake;
1734 next = regnext(scan);
1735 scan = NEXTOPER(scan);
1737 scan = NEXTOPER(scan);
1738 if (flags & SCF_DO_STCLASS) {
1739 cl_init(pRExC_state, &this_class);
1740 data_fake.start_class = &this_class;
1741 f = SCF_DO_STCLASS_AND;
1743 if (flags & SCF_WHILEM_VISITED_POS)
1744 f |= SCF_WHILEM_VISITED_POS;
1746 /* we suppose the run is continuous, last=next...*/
1747 minnext = study_chunk(pRExC_state, &scan, &deltanext,
1748 next, &data_fake, f,depth+1);
1751 if (max1 < minnext + deltanext)
1752 max1 = minnext + deltanext;
1753 if (deltanext == I32_MAX)
1754 is_inf = is_inf_internal = 1;
1756 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1758 if (data && (data_fake.flags & SF_HAS_EVAL))
1759 data->flags |= SF_HAS_EVAL;
1761 data->whilem_c = data_fake.whilem_c;
1762 if (flags & SCF_DO_STCLASS)
1763 cl_or(pRExC_state, &accum, &this_class);
1764 if (code == SUSPEND)
1767 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
1769 if (flags & SCF_DO_SUBSTR) {
1770 data->pos_min += min1;
1771 data->pos_delta += max1 - min1;
1772 if (max1 != min1 || is_inf)
1773 data->longest = &(data->longest_float);
1776 delta += max1 - min1;
1777 if (flags & SCF_DO_STCLASS_OR) {
1778 cl_or(pRExC_state, data->start_class, &accum);
1780 cl_and(data->start_class, &and_with);
1781 flags &= ~SCF_DO_STCLASS;
1784 else if (flags & SCF_DO_STCLASS_AND) {
1786 cl_and(data->start_class, &accum);
1787 flags &= ~SCF_DO_STCLASS;
1790 /* Switch to OR mode: cache the old value of
1791 * data->start_class */
1792 StructCopy(data->start_class, &and_with,
1793 struct regnode_charclass_class);
1794 flags &= ~SCF_DO_STCLASS_AND;
1795 StructCopy(&accum, data->start_class,
1796 struct regnode_charclass_class);
1797 flags |= SCF_DO_STCLASS_OR;
1798 data->start_class->flags |= ANYOF_EOS;
1804 Assuming this was/is a branch we are dealing with: 'scan' now
1805 points at the item that follows the branch sequence, whatever
1806 it is. We now start at the beginning of the sequence and look
1812 which would be constructed from a pattern like /A|LIST|OF|WORDS/
1814 If we can find such a subseqence we need to turn the first
1815 element into a trie and then add the subsequent branch exact
1816 strings to the trie.
1820 1. patterns where the whole set of branch can be converted to a trie,
1822 2. patterns where only a subset of the alternations can be
1823 converted to a trie.
1825 In case 1 we can replace the whole set with a single regop
1826 for the trie. In case 2 we need to keep the start and end
1829 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
1830 becomes BRANCH TRIE; BRANCH X;
1832 Hypthetically when we know the regex isnt anchored we can
1833 turn a case 1 into a DFA and let it rip... Every time it finds a match
1834 it would just call its tail, no WHILEM/CURLY needed.
1838 if (!re_trie_maxbuff) {
1839 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1840 if (!SvIOK(re_trie_maxbuff))
1841 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1843 if ( SvIV(re_trie_maxbuff)>=0 && OP( startbranch )==BRANCH ) {
1845 regnode *first = (regnode *)NULL;
1846 regnode *last = (regnode *)NULL;
1847 regnode *tail = scan;
1852 SV *mysv = sv_newmortal(); /* for dumping */
1854 /* var tail is used because there may be a TAIL
1855 regop in the way. Ie, the exacts will point to the
1856 thing following the TAIL, but the last branch will
1857 point at the TAIL. So we advance tail. If we
1858 have nested (?:) we may have to move through several
1862 while ( OP( tail ) == TAIL ) {
1863 /* this is the TAIL generated by (?:) */
1864 tail = regnext( tail );
1868 regprop( mysv, tail );
1869 PerlIO_printf( Perl_debug_log, "%*s%s%s%s\n",
1870 depth * 2 + 2, "", "Tail node is:", SvPV_nolen( mysv ),
1871 (RExC_seen_evals) ? "[EVAL]" : ""
1876 step through the branches, cur represents each
1877 branch, noper is the first thing to be matched
1878 as part of that branch and noper_next is the
1879 regnext() of that node. if noper is an EXACT
1880 and noper_next is the same as scan (our current
1881 position in the regex) then the EXACT branch is
1882 a possible optimization target. Once we have
1883 two or more consequetive such branches we can
1884 create a trie of the EXACT's contents and stich
1885 it in place. If the sequence represents all of
1886 the branches we eliminate the whole thing and
1887 replace it with a single TRIE. If it is a
1888 subsequence then we need to stitch it in. This
1889 means the first branch has to remain, and needs
1890 to be repointed at the item on the branch chain
1891 following the last branch optimized. This could
1892 be either a BRANCH, in which case the
1893 subsequence is internal, or it could be the
1894 item following the branch sequence in which
1895 case the subsequence is at the end.
1899 /* dont use tail as the end marker for this traverse */
1900 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
1901 regnode *noper = NEXTOPER( cur );
1902 regnode *noper_next = regnext( noper );
1905 regprop( mysv, cur);
1906 PerlIO_printf( Perl_debug_log, "%*s%s",
1907 depth * 2 + 2," ", SvPV_nolen( mysv ) );
1909 regprop( mysv, noper);
1910 PerlIO_printf( Perl_debug_log, " -> %s",
1914 regprop( mysv, noper_next );
1915 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
1918 PerlIO_printf( Perl_debug_log, "0x%p,0x%p,0x%p)\n",
1921 if ( ( first ? OP( noper ) == optype
1922 : PL_regkind[ (U8)OP( noper ) ] == EXACT )
1923 && noper_next == tail && count<U16_MAX)
1928 optype = OP( noper );
1932 regprop( mysv, first);
1933 PerlIO_printf( Perl_debug_log, "%*s%s",
1934 depth * 2 + 2, "F:", SvPV_nolen( mysv ) );
1935 regprop( mysv, NEXTOPER(first) );
1936 PerlIO_printf( Perl_debug_log, " -> %s\n",
1937 SvPV_nolen( mysv ) );
1942 regprop( mysv, cur);
1943 PerlIO_printf( Perl_debug_log, "%*s%s",
1944 depth * 2 + 2, "N:", SvPV_nolen( mysv ) );
1945 regprop( mysv, noper );
1946 PerlIO_printf( Perl_debug_log, " -> %s\n",
1947 SvPV_nolen( mysv ) );
1953 PerlIO_printf( Perl_debug_log, "%*s%s\n",
1954 depth * 2 + 2, "E:", "**END**" );
1956 make_trie( pRExC_state, startbranch, first, cur, tail, optype );
1958 if ( PL_regkind[ (U8)OP( noper ) ] == EXACT
1959 && noper_next == tail )
1963 optype = OP( noper );
1973 regprop( mysv, cur);
1974 PerlIO_printf( Perl_debug_log,
1975 "%*s%s\t(0x%p,0x%p,0x%p)\n", depth * 2 + 2,
1976 " ", SvPV_nolen( mysv ), first, last, cur);
1981 PerlIO_printf( Perl_debug_log, "%*s%s\n",
1982 depth * 2 + 2, "E:", "==END==" );
1984 make_trie( pRExC_state, startbranch, first, scan, tail, optype );
1989 else if ( code == BRANCHJ ) { /* single branch is optimized. */
1990 scan = NEXTOPER(NEXTOPER(scan));
1991 } else /* single branch is optimized. */
1992 scan = NEXTOPER(scan);
1995 else if (OP(scan) == EXACT) {
1996 I32 l = STR_LEN(scan);
1997 UV uc = *((U8*)STRING(scan));
1999 U8 *s = (U8*)STRING(scan);
2000 l = utf8_length(s, s + l);
2001 uc = utf8_to_uvchr(s, NULL);
2004 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2005 /* The code below prefers earlier match for fixed
2006 offset, later match for variable offset. */
2007 if (data->last_end == -1) { /* Update the start info. */
2008 data->last_start_min = data->pos_min;
2009 data->last_start_max = is_inf
2010 ? I32_MAX : data->pos_min + data->pos_delta;
2012 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2014 SV * sv = data->last_found;
2015 MAGIC *mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2016 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2017 if (mg && mg->mg_len >= 0)
2018 mg->mg_len += utf8_length((U8*)STRING(scan),
2019 (U8*)STRING(scan)+STR_LEN(scan));
2022 SvUTF8_on(data->last_found);
2023 data->last_end = data->pos_min + l;
2024 data->pos_min += l; /* As in the first entry. */
2025 data->flags &= ~SF_BEFORE_EOL;
2027 if (flags & SCF_DO_STCLASS_AND) {
2028 /* Check whether it is compatible with what we know already! */
2032 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2033 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2034 && (!(data->start_class->flags & ANYOF_FOLD)
2035 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2038 ANYOF_CLASS_ZERO(data->start_class);
2039 ANYOF_BITMAP_ZERO(data->start_class);
2041 ANYOF_BITMAP_SET(data->start_class, uc);
2042 data->start_class->flags &= ~ANYOF_EOS;
2044 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2046 else if (flags & SCF_DO_STCLASS_OR) {
2047 /* false positive possible if the class is case-folded */
2049 ANYOF_BITMAP_SET(data->start_class, uc);
2051 data->start_class->flags |= ANYOF_UNICODE_ALL;
2052 data->start_class->flags &= ~ANYOF_EOS;
2053 cl_and(data->start_class, &and_with);
2055 flags &= ~SCF_DO_STCLASS;
2057 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
2058 I32 l = STR_LEN(scan);
2059 UV uc = *((U8*)STRING(scan));
2061 /* Search for fixed substrings supports EXACT only. */
2062 if (flags & SCF_DO_SUBSTR)
2063 scan_commit(pRExC_state, data);
2065 U8 *s = (U8 *)STRING(scan);
2066 l = utf8_length(s, s + l);
2067 uc = utf8_to_uvchr(s, NULL);
2070 if (data && (flags & SCF_DO_SUBSTR))
2072 if (flags & SCF_DO_STCLASS_AND) {
2073 /* Check whether it is compatible with what we know already! */
2077 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2078 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2079 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2081 ANYOF_CLASS_ZERO(data->start_class);
2082 ANYOF_BITMAP_ZERO(data->start_class);
2084 ANYOF_BITMAP_SET(data->start_class, uc);
2085 data->start_class->flags &= ~ANYOF_EOS;
2086 data->start_class->flags |= ANYOF_FOLD;
2087 if (OP(scan) == EXACTFL)
2088 data->start_class->flags |= ANYOF_LOCALE;
2091 else if (flags & SCF_DO_STCLASS_OR) {
2092 if (data->start_class->flags & ANYOF_FOLD) {
2093 /* false positive possible if the class is case-folded.
2094 Assume that the locale settings are the same... */
2096 ANYOF_BITMAP_SET(data->start_class, uc);
2097 data->start_class->flags &= ~ANYOF_EOS;
2099 cl_and(data->start_class, &and_with);
2101 flags &= ~SCF_DO_STCLASS;
2103 else if (strchr((const char*)PL_varies,OP(scan))) {
2104 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2105 I32 f = flags, pos_before = 0;
2106 regnode *oscan = scan;
2107 struct regnode_charclass_class this_class;
2108 struct regnode_charclass_class *oclass = NULL;
2109 I32 next_is_eval = 0;
2111 switch (PL_regkind[(U8)OP(scan)]) {
2112 case WHILEM: /* End of (?:...)* . */
2113 scan = NEXTOPER(scan);
2116 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2117 next = NEXTOPER(scan);
2118 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2120 maxcount = REG_INFTY;
2121 next = regnext(scan);
2122 scan = NEXTOPER(scan);
2126 if (flags & SCF_DO_SUBSTR)
2131 if (flags & SCF_DO_STCLASS) {
2133 maxcount = REG_INFTY;
2134 next = regnext(scan);
2135 scan = NEXTOPER(scan);
2138 is_inf = is_inf_internal = 1;
2139 scan = regnext(scan);
2140 if (flags & SCF_DO_SUBSTR) {
2141 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
2142 data->longest = &(data->longest_float);
2144 goto optimize_curly_tail;
2146 mincount = ARG1(scan);
2147 maxcount = ARG2(scan);
2148 next = regnext(scan);
2149 if (OP(scan) == CURLYX) {
2150 I32 lp = (data ? *(data->last_closep) : 0);
2151 scan->flags = ((lp <= U8_MAX) ? (U8)lp : U8_MAX);
2153 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2154 next_is_eval = (OP(scan) == EVAL);
2156 if (flags & SCF_DO_SUBSTR) {
2157 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
2158 pos_before = data->pos_min;
2162 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2164 data->flags |= SF_IS_INF;
2166 if (flags & SCF_DO_STCLASS) {
2167 cl_init(pRExC_state, &this_class);
2168 oclass = data->start_class;
2169 data->start_class = &this_class;
2170 f |= SCF_DO_STCLASS_AND;
2171 f &= ~SCF_DO_STCLASS_OR;
2173 /* These are the cases when once a subexpression
2174 fails at a particular position, it cannot succeed
2175 even after backtracking at the enclosing scope.
2177 XXXX what if minimal match and we are at the
2178 initial run of {n,m}? */
2179 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2180 f &= ~SCF_WHILEM_VISITED_POS;
2182 /* This will finish on WHILEM, setting scan, or on NULL: */
2183 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
2185 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
2187 if (flags & SCF_DO_STCLASS)
2188 data->start_class = oclass;
2189 if (mincount == 0 || minnext == 0) {
2190 if (flags & SCF_DO_STCLASS_OR) {
2191 cl_or(pRExC_state, data->start_class, &this_class);
2193 else if (flags & SCF_DO_STCLASS_AND) {
2194 /* Switch to OR mode: cache the old value of
2195 * data->start_class */
2196 StructCopy(data->start_class, &and_with,
2197 struct regnode_charclass_class);
2198 flags &= ~SCF_DO_STCLASS_AND;
2199 StructCopy(&this_class, data->start_class,
2200 struct regnode_charclass_class);
2201 flags |= SCF_DO_STCLASS_OR;
2202 data->start_class->flags |= ANYOF_EOS;
2204 } else { /* Non-zero len */
2205 if (flags & SCF_DO_STCLASS_OR) {
2206 cl_or(pRExC_state, data->start_class, &this_class);
2207 cl_and(data->start_class, &and_with);
2209 else if (flags & SCF_DO_STCLASS_AND)
2210 cl_and(data->start_class, &this_class);
2211 flags &= ~SCF_DO_STCLASS;
2213 if (!scan) /* It was not CURLYX, but CURLY. */
2215 if (ckWARN(WARN_REGEXP)
2216 /* ? quantifier ok, except for (?{ ... }) */
2217 && (next_is_eval || !(mincount == 0 && maxcount == 1))
2218 && (minnext == 0) && (deltanext == 0)
2219 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
2220 && maxcount <= REG_INFTY/3) /* Complement check for big count */
2223 "Quantifier unexpected on zero-length expression");
2226 min += minnext * mincount;
2227 is_inf_internal |= ((maxcount == REG_INFTY
2228 && (minnext + deltanext) > 0)
2229 || deltanext == I32_MAX);
2230 is_inf |= is_inf_internal;
2231 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2233 /* Try powerful optimization CURLYX => CURLYN. */
2234 if ( OP(oscan) == CURLYX && data
2235 && data->flags & SF_IN_PAR
2236 && !(data->flags & SF_HAS_EVAL)
2237 && !deltanext && minnext == 1 ) {
2238 /* Try to optimize to CURLYN. */
2239 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
2240 regnode *nxt1 = nxt;
2247 if (!strchr((const char*)PL_simple,OP(nxt))
2248 && !(PL_regkind[(U8)OP(nxt)] == EXACT
2249 && STR_LEN(nxt) == 1))
2255 if (OP(nxt) != CLOSE)
2257 /* Now we know that nxt2 is the only contents: */
2258 oscan->flags = (U8)ARG(nxt);
2260 OP(nxt1) = NOTHING; /* was OPEN. */
2262 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2263 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2264 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2265 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2266 OP(nxt + 1) = OPTIMIZED; /* was count. */
2267 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
2272 /* Try optimization CURLYX => CURLYM. */
2273 if ( OP(oscan) == CURLYX && data
2274 && !(data->flags & SF_HAS_PAR)
2275 && !(data->flags & SF_HAS_EVAL)
2276 && !deltanext /* atom is fixed width */
2277 && minnext != 0 /* CURLYM can't handle zero width */
2279 /* XXXX How to optimize if data == 0? */
2280 /* Optimize to a simpler form. */
2281 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2285 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
2286 && (OP(nxt2) != WHILEM))
2288 OP(nxt2) = SUCCEED; /* Whas WHILEM */
2289 /* Need to optimize away parenths. */
2290 if (data->flags & SF_IN_PAR) {
2291 /* Set the parenth number. */
2292 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2294 if (OP(nxt) != CLOSE)
2295 FAIL("Panic opt close");
2296 oscan->flags = (U8)ARG(nxt);
2297 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2298 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2300 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2301 OP(nxt + 1) = OPTIMIZED; /* was count. */
2302 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2303 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
2306 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2307 regnode *nnxt = regnext(nxt1);
2310 if (reg_off_by_arg[OP(nxt1)])
2311 ARG_SET(nxt1, nxt2 - nxt1);
2312 else if (nxt2 - nxt1 < U16_MAX)
2313 NEXT_OFF(nxt1) = nxt2 - nxt1;
2315 OP(nxt) = NOTHING; /* Cannot beautify */
2320 /* Optimize again: */
2321 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
2327 else if ((OP(oscan) == CURLYX)
2328 && (flags & SCF_WHILEM_VISITED_POS)
2329 /* See the comment on a similar expression above.
2330 However, this time it not a subexpression
2331 we care about, but the expression itself. */
2332 && (maxcount == REG_INFTY)
2333 && data && ++data->whilem_c < 16) {
2334 /* This stays as CURLYX, we can put the count/of pair. */
2335 /* Find WHILEM (as in regexec.c) */
2336 regnode *nxt = oscan + NEXT_OFF(oscan);
2338 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2340 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2341 | (RExC_whilem_seen << 4)); /* On WHILEM */
2343 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
2345 if (flags & SCF_DO_SUBSTR) {
2346 SV *last_str = Nullsv;
2347 int counted = mincount != 0;
2349 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
2350 #if defined(SPARC64_GCC_WORKAROUND)
2356 if (pos_before >= data->last_start_min)
2359 b = data->last_start_min;
2362 s = SvPV(data->last_found, l);
2363 old = b - data->last_start_min;
2366 I32 b = pos_before >= data->last_start_min
2367 ? pos_before : data->last_start_min;
2369 char *s = SvPV(data->last_found, l);
2370 I32 old = b - data->last_start_min;
2374 old = utf8_hop((U8*)s, old) - (U8*)s;
2377 /* Get the added string: */
2378 last_str = newSVpvn(s + old, l);
2380 SvUTF8_on(last_str);
2381 if (deltanext == 0 && pos_before == b) {
2382 /* What was added is a constant string */
2384 SvGROW(last_str, (mincount * l) + 1);
2385 repeatcpy(SvPVX(last_str) + l,
2386 SvPVX(last_str), l, mincount - 1);
2387 SvCUR(last_str) *= mincount;
2388 /* Add additional parts. */
2389 SvCUR_set(data->last_found,
2390 SvCUR(data->last_found) - l);
2391 sv_catsv(data->last_found, last_str);
2393 SV * sv = data->last_found;
2395 SvUTF8(sv) && SvMAGICAL(sv) ?
2396 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2397 if (mg && mg->mg_len >= 0)
2398 mg->mg_len += CHR_SVLEN(last_str);
2400 data->last_end += l * (mincount - 1);
2403 /* start offset must point into the last copy */
2404 data->last_start_min += minnext * (mincount - 1);
2405 data->last_start_max += is_inf ? I32_MAX
2406 : (maxcount - 1) * (minnext + data->pos_delta);
2409 /* It is counted once already... */
2410 data->pos_min += minnext * (mincount - counted);
2411 data->pos_delta += - counted * deltanext +
2412 (minnext + deltanext) * maxcount - minnext * mincount;
2413 if (mincount != maxcount) {
2414 /* Cannot extend fixed substrings found inside
2416 scan_commit(pRExC_state,data);
2417 if (mincount && last_str) {
2418 sv_setsv(data->last_found, last_str);
2419 data->last_end = data->pos_min;
2420 data->last_start_min =
2421 data->pos_min - CHR_SVLEN(last_str);
2422 data->last_start_max = is_inf
2424 : data->pos_min + data->pos_delta
2425 - CHR_SVLEN(last_str);
2427 data->longest = &(data->longest_float);
2429 SvREFCNT_dec(last_str);
2431 if (data && (fl & SF_HAS_EVAL))
2432 data->flags |= SF_HAS_EVAL;
2433 optimize_curly_tail:
2434 if (OP(oscan) != CURLYX) {
2435 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
2437 NEXT_OFF(oscan) += NEXT_OFF(next);
2440 default: /* REF and CLUMP only? */
2441 if (flags & SCF_DO_SUBSTR) {
2442 scan_commit(pRExC_state,data); /* Cannot expect anything... */
2443 data->longest = &(data->longest_float);
2445 is_inf = is_inf_internal = 1;
2446 if (flags & SCF_DO_STCLASS_OR)
2447 cl_anything(pRExC_state, data->start_class);
2448 flags &= ~SCF_DO_STCLASS;
2452 else if (strchr((const char*)PL_simple,OP(scan))) {
2455 if (flags & SCF_DO_SUBSTR) {
2456 scan_commit(pRExC_state,data);
2460 if (flags & SCF_DO_STCLASS) {
2461 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
2463 /* Some of the logic below assumes that switching
2464 locale on will only add false positives. */
2465 switch (PL_regkind[(U8)OP(scan)]) {
2469 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
2470 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2471 cl_anything(pRExC_state, data->start_class);
2474 if (OP(scan) == SANY)
2476 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
2477 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
2478 || (data->start_class->flags & ANYOF_CLASS));
2479 cl_anything(pRExC_state, data->start_class);
2481 if (flags & SCF_DO_STCLASS_AND || !value)
2482 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
2485 if (flags & SCF_DO_STCLASS_AND)
2486 cl_and(data->start_class,
2487 (struct regnode_charclass_class*)scan);
2489 cl_or(pRExC_state, data->start_class,
2490 (struct regnode_charclass_class*)scan);
2493 if (flags & SCF_DO_STCLASS_AND) {
2494 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2495 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2496 for (value = 0; value < 256; value++)
2497 if (!isALNUM(value))
2498 ANYOF_BITMAP_CLEAR(data->start_class, value);
2502 if (data->start_class->flags & ANYOF_LOCALE)
2503 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2505 for (value = 0; value < 256; value++)
2507 ANYOF_BITMAP_SET(data->start_class, value);
2512 if (flags & SCF_DO_STCLASS_AND) {
2513 if (data->start_class->flags & ANYOF_LOCALE)
2514 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2517 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2518 data->start_class->flags |= ANYOF_LOCALE;
2522 if (flags & SCF_DO_STCLASS_AND) {
2523 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2524 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2525 for (value = 0; value < 256; value++)
2527 ANYOF_BITMAP_CLEAR(data->start_class, value);
2531 if (data->start_class->flags & ANYOF_LOCALE)
2532 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2534 for (value = 0; value < 256; value++)
2535 if (!isALNUM(value))
2536 ANYOF_BITMAP_SET(data->start_class, value);
2541 if (flags & SCF_DO_STCLASS_AND) {
2542 if (data->start_class->flags & ANYOF_LOCALE)
2543 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2546 data->start_class->flags |= ANYOF_LOCALE;
2547 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2551 if (flags & SCF_DO_STCLASS_AND) {
2552 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2553 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2554 for (value = 0; value < 256; value++)
2555 if (!isSPACE(value))
2556 ANYOF_BITMAP_CLEAR(data->start_class, value);
2560 if (data->start_class->flags & ANYOF_LOCALE)
2561 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2563 for (value = 0; value < 256; value++)
2565 ANYOF_BITMAP_SET(data->start_class, value);
2570 if (flags & SCF_DO_STCLASS_AND) {
2571 if (data->start_class->flags & ANYOF_LOCALE)
2572 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2575 data->start_class->flags |= ANYOF_LOCALE;
2576 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2580 if (flags & SCF_DO_STCLASS_AND) {
2581 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2582 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2583 for (value = 0; value < 256; value++)
2585 ANYOF_BITMAP_CLEAR(data->start_class, value);
2589 if (data->start_class->flags & ANYOF_LOCALE)
2590 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2592 for (value = 0; value < 256; value++)
2593 if (!isSPACE(value))
2594 ANYOF_BITMAP_SET(data->start_class, value);
2599 if (flags & SCF_DO_STCLASS_AND) {
2600 if (data->start_class->flags & ANYOF_LOCALE) {
2601 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2602 for (value = 0; value < 256; value++)
2603 if (!isSPACE(value))
2604 ANYOF_BITMAP_CLEAR(data->start_class, value);
2608 data->start_class->flags |= ANYOF_LOCALE;
2609 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2613 if (flags & SCF_DO_STCLASS_AND) {
2614 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
2615 for (value = 0; value < 256; value++)
2616 if (!isDIGIT(value))
2617 ANYOF_BITMAP_CLEAR(data->start_class, value);
2620 if (data->start_class->flags & ANYOF_LOCALE)
2621 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
2623 for (value = 0; value < 256; value++)
2625 ANYOF_BITMAP_SET(data->start_class, value);
2630 if (flags & SCF_DO_STCLASS_AND) {
2631 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
2632 for (value = 0; value < 256; value++)
2634 ANYOF_BITMAP_CLEAR(data->start_class, value);
2637 if (data->start_class->flags & ANYOF_LOCALE)
2638 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
2640 for (value = 0; value < 256; value++)
2641 if (!isDIGIT(value))
2642 ANYOF_BITMAP_SET(data->start_class, value);
2647 if (flags & SCF_DO_STCLASS_OR)
2648 cl_and(data->start_class, &and_with);
2649 flags &= ~SCF_DO_STCLASS;
2652 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
2653 data->flags |= (OP(scan) == MEOL
2657 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
2658 /* Lookbehind, or need to calculate parens/evals/stclass: */
2659 && (scan->flags || data || (flags & SCF_DO_STCLASS))
2660 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
2661 /* Lookahead/lookbehind */
2662 I32 deltanext, minnext, fake = 0;
2664 struct regnode_charclass_class intrnl;
2667 data_fake.flags = 0;
2669 data_fake.whilem_c = data->whilem_c;
2670 data_fake.last_closep = data->last_closep;
2673 data_fake.last_closep = &fake;
2674 if ( flags & SCF_DO_STCLASS && !scan->flags
2675 && OP(scan) == IFMATCH ) { /* Lookahead */
2676 cl_init(pRExC_state, &intrnl);
2677 data_fake.start_class = &intrnl;
2678 f |= SCF_DO_STCLASS_AND;
2680 if (flags & SCF_WHILEM_VISITED_POS)
2681 f |= SCF_WHILEM_VISITED_POS;
2682 next = regnext(scan);
2683 nscan = NEXTOPER(NEXTOPER(scan));
2684 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f,depth+1);
2687 vFAIL("Variable length lookbehind not implemented");
2689 else if (minnext > U8_MAX) {
2690 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
2692 scan->flags = (U8)minnext;
2694 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2696 if (data && (data_fake.flags & SF_HAS_EVAL))
2697 data->flags |= SF_HAS_EVAL;
2699 data->whilem_c = data_fake.whilem_c;
2700 if (f & SCF_DO_STCLASS_AND) {
2701 int was = (data->start_class->flags & ANYOF_EOS);
2703 cl_and(data->start_class, &intrnl);
2705 data->start_class->flags |= ANYOF_EOS;
2708 else if (OP(scan) == OPEN) {
2711 else if (OP(scan) == CLOSE) {
2712 if ((I32)ARG(scan) == is_par) {
2713 next = regnext(scan);
2715 if ( next && (OP(next) != WHILEM) && next < last)
2716 is_par = 0; /* Disable optimization */
2719 *(data->last_closep) = ARG(scan);
2721 else if (OP(scan) == EVAL) {
2723 data->flags |= SF_HAS_EVAL;
2725 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
2726 if (flags & SCF_DO_SUBSTR) {
2727 scan_commit(pRExC_state,data);
2728 data->longest = &(data->longest_float);
2730 is_inf = is_inf_internal = 1;
2731 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2732 cl_anything(pRExC_state, data->start_class);
2733 flags &= ~SCF_DO_STCLASS;
2735 /* Else: zero-length, ignore. */
2736 scan = regnext(scan);
2741 *deltap = is_inf_internal ? I32_MAX : delta;
2742 if (flags & SCF_DO_SUBSTR && is_inf)
2743 data->pos_delta = I32_MAX - data->pos_min;
2744 if (is_par > U8_MAX)
2746 if (is_par && pars==1 && data) {
2747 data->flags |= SF_IN_PAR;
2748 data->flags &= ~SF_HAS_PAR;
2750 else if (pars && data) {
2751 data->flags |= SF_HAS_PAR;
2752 data->flags &= ~SF_IN_PAR;
2754 if (flags & SCF_DO_STCLASS_OR)
2755 cl_and(data->start_class, &and_with);
2760 S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, const char *s)
2762 if (RExC_rx->data) {
2763 Renewc(RExC_rx->data,
2764 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
2765 char, struct reg_data);
2766 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
2767 RExC_rx->data->count += n;
2770 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
2771 char, struct reg_data);
2772 New(1208, RExC_rx->data->what, n, U8);
2773 RExC_rx->data->count = n;
2775 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
2776 return RExC_rx->data->count - n;
2780 Perl_reginitcolors(pTHX)
2783 char *s = PerlEnv_getenv("PERL_RE_COLORS");
2786 PL_colors[0] = s = savepv(s);
2788 s = strchr(s, '\t');
2794 PL_colors[i] = s = (char *)"";
2798 PL_colors[i++] = (char *)"";
2805 - pregcomp - compile a regular expression into internal code
2807 * We can't allocate space until we know how big the compiled form will be,
2808 * but we can't compile it (and thus know how big it is) until we've got a
2809 * place to put the code. So we cheat: we compile it twice, once with code
2810 * generation turned off and size counting turned on, and once "for real".
2811 * This also means that we don't allocate space until we are sure that the
2812 * thing really will compile successfully, and we never have to move the
2813 * code and thus invalidate pointers into it. (Note that it has to be in
2814 * one piece because free() must be able to free it all.) [NB: not true in perl]
2816 * Beware that the optimization-preparation code in here knows about some
2817 * of the structure of the compiled regexp. [I'll say.]
2820 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
2830 RExC_state_t RExC_state;
2831 RExC_state_t *pRExC_state = &RExC_state;
2833 GET_RE_DEBUG_FLAGS_DECL;
2836 FAIL("NULL regexp argument");
2838 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
2841 DEBUG_r(if (!PL_colorset) reginitcolors());
2843 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s \"%s%*s%s\"\n",
2844 PL_colors[4],PL_colors[5],PL_colors[0],
2845 (int)(xend - exp), RExC_precomp, PL_colors[1]);
2847 RExC_flags = pm->op_pmflags;
2851 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
2852 RExC_seen_evals = 0;
2855 /* First pass: determine size, legality. */
2862 RExC_emit = &PL_regdummy;
2863 RExC_whilem_seen = 0;
2864 #if 0 /* REGC() is (currently) a NOP at the first pass.
2865 * Clever compilers notice this and complain. --jhi */
2866 REGC((U8)REG_MAGIC, (char*)RExC_emit);
2868 if (reg(pRExC_state, 0, &flags) == NULL) {
2869 RExC_precomp = Nullch;
2872 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
2874 /* Small enough for pointer-storage convention?
2875 If extralen==0, this means that we will not need long jumps. */
2876 if (RExC_size >= 0x10000L && RExC_extralen)
2877 RExC_size += RExC_extralen;
2880 if (RExC_whilem_seen > 15)
2881 RExC_whilem_seen = 15;
2883 /* Allocate space and initialize. */
2884 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
2887 FAIL("Regexp out of space");
2890 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
2891 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
2894 r->prelen = xend - exp;
2895 r->precomp = savepvn(RExC_precomp, r->prelen);
2897 #ifdef PERL_COPY_ON_WRITE
2898 r->saved_copy = Nullsv;
2900 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
2901 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
2903 r->substrs = 0; /* Useful during FAIL. */
2904 r->startp = 0; /* Useful during FAIL. */
2905 r->endp = 0; /* Useful during FAIL. */
2907 Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
2909 r->offsets[0] = RExC_size;
2911 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2912 "%s %"UVuf" bytes for offset annotations.\n",
2913 r->offsets ? "Got" : "Couldn't get",
2914 (UV)((2*RExC_size+1) * sizeof(U32))));
2918 /* Second pass: emit code. */
2919 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
2924 RExC_emit_start = r->program;
2925 RExC_emit = r->program;
2926 /* Store the count of eval-groups for security checks: */
2927 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
2928 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
2930 if (reg(pRExC_state, 0, &flags) == NULL)
2934 /* Dig out information for optimizations. */
2935 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
2936 pm->op_pmflags = RExC_flags;
2938 r->reganch |= ROPT_UTF8; /* Unicode in it? */
2939 r->regstclass = NULL;
2940 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
2941 r->reganch |= ROPT_NAUGHTY;
2942 scan = r->program + 1; /* First BRANCH. */
2944 /* XXXX To minimize changes to RE engine we always allocate
2945 3-units-long substrs field. */
2946 Newz(1004, r->substrs, 1, struct reg_substr_data);
2948 StructCopy(&zero_scan_data, &data, scan_data_t);
2949 /* XXXX Should not we check for something else? Usually it is OPEN1... */
2950 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
2952 STRLEN longest_float_length, longest_fixed_length;
2953 struct regnode_charclass_class ch_class;
2958 /* Skip introductions and multiplicators >= 1. */
2959 while ((OP(first) == OPEN && (sawopen = 1)) ||
2960 /* An OR of *one* alternative - should not happen now. */
2961 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
2962 (OP(first) == PLUS) ||
2963 (OP(first) == MINMOD) ||
2964 /* An {n,m} with n>0 */
2965 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
2966 if (OP(first) == PLUS)
2969 first += regarglen[(U8)OP(first)];
2970 first = NEXTOPER(first);
2973 /* Starting-point info. */
2975 if (PL_regkind[(U8)OP(first)] == EXACT) {
2976 if (OP(first) == EXACT)
2977 ; /* Empty, get anchored substr later. */
2978 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
2979 r->regstclass = first;
2981 else if (strchr((const char*)PL_simple,OP(first)))
2982 r->regstclass = first;
2983 else if (PL_regkind[(U8)OP(first)] == BOUND ||
2984 PL_regkind[(U8)OP(first)] == NBOUND)
2985 r->regstclass = first;
2986 else if (PL_regkind[(U8)OP(first)] == BOL) {
2987 r->reganch |= (OP(first) == MBOL
2989 : (OP(first) == SBOL
2992 first = NEXTOPER(first);
2995 else if (OP(first) == GPOS) {
2996 r->reganch |= ROPT_ANCH_GPOS;
2997 first = NEXTOPER(first);
3000 else if (!sawopen && (OP(first) == STAR &&
3001 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
3002 !(r->reganch & ROPT_ANCH) )
3004 /* turn .* into ^.* with an implied $*=1 */
3005 int type = OP(NEXTOPER(first));
3007 if (type == REG_ANY)
3008 type = ROPT_ANCH_MBOL;
3010 type = ROPT_ANCH_SBOL;
3012 r->reganch |= type | ROPT_IMPLICIT;
3013 first = NEXTOPER(first);
3016 if (sawplus && (!sawopen || !RExC_sawback)
3017 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
3018 /* x+ must match at the 1st pos of run of x's */
3019 r->reganch |= ROPT_SKIP;
3021 /* Scan is after the zeroth branch, first is atomic matcher. */
3022 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
3023 (IV)(first - scan + 1)));
3025 * If there's something expensive in the r.e., find the
3026 * longest literal string that must appear and make it the
3027 * regmust. Resolve ties in favor of later strings, since
3028 * the regstart check works with the beginning of the r.e.
3029 * and avoiding duplication strengthens checking. Not a
3030 * strong reason, but sufficient in the absence of others.
3031 * [Now we resolve ties in favor of the earlier string if
3032 * it happens that c_offset_min has been invalidated, since the
3033 * earlier string may buy us something the later one won't.]
3037 data.longest_fixed = newSVpvn("",0);
3038 data.longest_float = newSVpvn("",0);
3039 data.last_found = newSVpvn("",0);
3040 data.longest = &(data.longest_fixed);
3042 if (!r->regstclass) {
3043 cl_init(pRExC_state, &ch_class);
3044 data.start_class = &ch_class;
3045 stclass_flag = SCF_DO_STCLASS_AND;
3046 } else /* XXXX Check for BOUND? */
3048 data.last_closep = &last_close;
3050 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
3051 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
3052 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
3053 && data.last_start_min == 0 && data.last_end > 0
3054 && !RExC_seen_zerolen
3055 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
3056 r->reganch |= ROPT_CHECK_ALL;
3057 scan_commit(pRExC_state, &data);
3058 SvREFCNT_dec(data.last_found);
3060 longest_float_length = CHR_SVLEN(data.longest_float);
3061 if (longest_float_length
3062 || (data.flags & SF_FL_BEFORE_EOL
3063 && (!(data.flags & SF_FL_BEFORE_MEOL)
3064 || (RExC_flags & PMf_MULTILINE)))) {
3067 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
3068 && data.offset_fixed == data.offset_float_min
3069 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
3070 goto remove_float; /* As in (a)+. */
3072 if (SvUTF8(data.longest_float)) {
3073 r->float_utf8 = data.longest_float;
3074 r->float_substr = Nullsv;
3076 r->float_substr = data.longest_float;
3077 r->float_utf8 = Nullsv;
3079 r->float_min_offset = data.offset_float_min;
3080 r->float_max_offset = data.offset_float_max;
3081 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
3082 && (!(data.flags & SF_FL_BEFORE_MEOL)
3083 || (RExC_flags & PMf_MULTILINE)));
3084 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
3088 r->float_substr = r->float_utf8 = Nullsv;
3089 SvREFCNT_dec(data.longest_float);
3090 longest_float_length = 0;
3093 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
3094 if (longest_fixed_length
3095 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
3096 && (!(data.flags & SF_FIX_BEFORE_MEOL)
3097 || (RExC_flags & PMf_MULTILINE)))) {
3100 if (SvUTF8(data.longest_fixed)) {
3101 r->anchored_utf8 = data.longest_fixed;
3102 r->anchored_substr = Nullsv;
3104 r->anchored_substr = data.longest_fixed;
3105 r->anchored_utf8 = Nullsv;
3107 r->anchored_offset = data.offset_fixed;
3108 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
3109 && (!(data.flags & SF_FIX_BEFORE_MEOL)
3110 || (RExC_flags & PMf_MULTILINE)));
3111 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
3114 r->anchored_substr = r->anchored_utf8 = Nullsv;
3115 SvREFCNT_dec(data.longest_fixed);
3116 longest_fixed_length = 0;
3119 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
3120 r->regstclass = NULL;
3121 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
3123 && !(data.start_class->flags & ANYOF_EOS)
3124 && !cl_is_anything(data.start_class))
3126 I32 n = add_data(pRExC_state, 1, "f");
3128 New(1006, RExC_rx->data->data[n], 1,
3129 struct regnode_charclass_class);
3130 StructCopy(data.start_class,
3131 (struct regnode_charclass_class*)RExC_rx->data->data[n],
3132 struct regnode_charclass_class);
3133 r->regstclass = (regnode*)RExC_rx->data->data[n];
3134 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
3135 PL_regdata = r->data; /* for regprop() */
3136 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
3137 regprop(sv, (regnode*)data.start_class);
3138 PerlIO_printf(Perl_debug_log,
3139 "synthetic stclass `%s'.\n",
3143 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
3144 if (longest_fixed_length > longest_float_length) {
3145 r->check_substr = r->anchored_substr;
3146 r->check_utf8 = r->anchored_utf8;
3147 r->check_offset_min = r->check_offset_max = r->anchored_offset;
3148 if (r->reganch & ROPT_ANCH_SINGLE)
3149 r->reganch |= ROPT_NOSCAN;
3152 r->check_substr = r->float_substr;
3153 r->check_utf8 = r->float_utf8;
3154 r->check_offset_min = data.offset_float_min;
3155 r->check_offset_max = data.offset_float_max;
3157 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
3158 This should be changed ASAP! */
3159 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
3160 r->reganch |= RE_USE_INTUIT;
3161 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
3162 r->reganch |= RE_INTUIT_TAIL;
3166 /* Several toplevels. Best we can is to set minlen. */
3168 struct regnode_charclass_class ch_class;
3171 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
3172 scan = r->program + 1;
3173 cl_init(pRExC_state, &ch_class);
3174 data.start_class = &ch_class;
3175 data.last_closep = &last_close;
3176 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
3177 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
3178 = r->float_substr = r->float_utf8 = Nullsv;
3179 if (!(data.start_class->flags & ANYOF_EOS)
3180 && !cl_is_anything(data.start_class))
3182 I32 n = add_data(pRExC_state, 1, "f");
3184 New(1006, RExC_rx->data->data[n], 1,
3185 struct regnode_charclass_class);
3186 StructCopy(data.start_class,
3187 (struct regnode_charclass_class*)RExC_rx->data->data[n],
3188 struct regnode_charclass_class);
3189 r->regstclass = (regnode*)RExC_rx->data->data[n];
3190 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
3191 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
3192 regprop(sv, (regnode*)data.start_class);
3193 PerlIO_printf(Perl_debug_log,
3194 "synthetic stclass `%s'.\n",
3200 if (RExC_seen & REG_SEEN_GPOS)
3201 r->reganch |= ROPT_GPOS_SEEN;
3202 if (RExC_seen & REG_SEEN_LOOKBEHIND)
3203 r->reganch |= ROPT_LOOKBEHIND_SEEN;
3204 if (RExC_seen & REG_SEEN_EVAL)
3205 r->reganch |= ROPT_EVAL_SEEN;
3206 if (RExC_seen & REG_SEEN_CANY)
3207 r->reganch |= ROPT_CANY_SEEN;
3208 Newz(1002, r->startp, RExC_npar, I32);
3209 Newz(1002, r->endp, RExC_npar, I32);
3210 PL_regdata = r->data; /* for regprop() */
3211 DEBUG_COMPILE_r(regdump(r));
3216 - reg - regular expression, i.e. main body or parenthesized thing
3218 * Caller must absorb opening parenthesis.
3220 * Combining parenthesis handling with the base level of regular expression
3221 * is a trifle forced, but the need to tie the tails of the branches to what
3222 * follows makes it hard to avoid.
3225 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
3226 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
3228 register regnode *ret; /* Will be the head of the group. */
3229 register regnode *br;
3230 register regnode *lastbr;
3231 register regnode *ender = 0;
3232 register I32 parno = 0;
3233 I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0;
3235 /* for (?g), (?gc), and (?o) warnings; warning
3236 about (?c) will warn about (?g) -- japhy */
3238 I32 wastedflags = 0x00,
3241 wasted_gc = 0x02 | 0x04,
3244 char * parse_start = RExC_parse; /* MJD */
3245 char *oregcomp_parse = RExC_parse;
3248 *flagp = 0; /* Tentatively. */
3251 /* Make an OPEN node, if parenthesized. */
3253 if (*RExC_parse == '?') { /* (?...) */
3254 U32 posflags = 0, negflags = 0;
3255 U32 *flagsp = &posflags;
3257 char *seqstart = RExC_parse;
3260 paren = *RExC_parse++;
3261 ret = NULL; /* For look-ahead/behind. */
3263 case '<': /* (?<...) */
3264 RExC_seen |= REG_SEEN_LOOKBEHIND;
3265 if (*RExC_parse == '!')
3267 if (*RExC_parse != '=' && *RExC_parse != '!')
3270 case '=': /* (?=...) */
3271 case '!': /* (?!...) */
3272 RExC_seen_zerolen++;
3273 case ':': /* (?:...) */
3274 case '>': /* (?>...) */
3276 case '$': /* (?$...) */
3277 case '@': /* (?@...) */
3278 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
3280 case '#': /* (?#...) */
3281 while (*RExC_parse && *RExC_parse != ')')
3283 if (*RExC_parse != ')')
3284 FAIL("Sequence (?#... not terminated");
3285 nextchar(pRExC_state);
3288 case 'p': /* (?p...) */
3289 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
3290 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
3292 case '?': /* (??...) */
3294 if (*RExC_parse != '{')
3296 paren = *RExC_parse++;
3298 case '{': /* (?{...}) */
3300 I32 count = 1, n = 0;
3302 char *s = RExC_parse;
3304 OP_4tree *sop, *rop;
3306 RExC_seen_zerolen++;
3307 RExC_seen |= REG_SEEN_EVAL;
3308 while (count && (c = *RExC_parse)) {
3309 if (c == '\\' && RExC_parse[1])
3317 if (*RExC_parse != ')')
3320 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
3325 if (RExC_parse - 1 - s)
3326 sv = newSVpvn(s, RExC_parse - 1 - s);
3328 sv = newSVpvn("", 0);
3331 Perl_save_re_context(aTHX);
3332 rop = sv_compile_2op(sv, &sop, "re", &pad);
3333 sop->op_private |= OPpREFCOUNTED;
3334 /* re_dup will OpREFCNT_inc */
3335 OpREFCNT_set(sop, 1);
3338 n = add_data(pRExC_state, 3, "nop");
3339 RExC_rx->data->data[n] = (void*)rop;
3340 RExC_rx->data->data[n+1] = (void*)sop;
3341 RExC_rx->data->data[n+2] = (void*)pad;
3344 else { /* First pass */
3345 if (PL_reginterp_cnt < ++RExC_seen_evals
3347 /* No compiled RE interpolated, has runtime
3348 components ===> unsafe. */
3349 FAIL("Eval-group not allowed at runtime, use re 'eval'");
3350 if (PL_tainting && PL_tainted)
3351 FAIL("Eval-group in insecure regular expression");
3352 if (IN_PERL_COMPILETIME)
3356 nextchar(pRExC_state);
3358 ret = reg_node(pRExC_state, LOGICAL);
3361 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
3362 /* deal with the length of this later - MJD */
3365 ret = reganode(pRExC_state, EVAL, n);
3366 Set_Node_Length(ret, RExC_parse - parse_start + 1);
3367 Set_Node_Offset(ret, parse_start);
3370 case '(': /* (?(?{...})...) and (?(?=...)...) */
3372 if (RExC_parse[0] == '?') { /* (?(?...)) */
3373 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
3374 || RExC_parse[1] == '<'
3375 || RExC_parse[1] == '{') { /* Lookahead or eval. */
3378 ret = reg_node(pRExC_state, LOGICAL);
3381 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
3385 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
3387 parno = atoi(RExC_parse++);
3389 while (isDIGIT(*RExC_parse))
3391 ret = reganode(pRExC_state, GROUPP, parno);
3393 if ((c = *nextchar(pRExC_state)) != ')')
3394 vFAIL("Switch condition not recognized");
3396 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
3397 br = regbranch(pRExC_state, &flags, 1);
3399 br = reganode(pRExC_state, LONGJMP, 0);
3401 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
3402 c = *nextchar(pRExC_state);
3406 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3407 regbranch(pRExC_state, &flags, 1);
3408 regtail(pRExC_state, ret, lastbr);
3411 c = *nextchar(pRExC_state);
3416 vFAIL("Switch (?(condition)... contains too many branches");
3417 ender = reg_node(pRExC_state, TAIL);
3418 regtail(pRExC_state, br, ender);
3420 regtail(pRExC_state, lastbr, ender);
3421 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
3424 regtail(pRExC_state, ret, ender);
3428 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
3432 RExC_parse--; /* for vFAIL to print correctly */
3433 vFAIL("Sequence (? incomplete");
3437 parse_flags: /* (?i) */
3438 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
3439 /* (?g), (?gc) and (?o) are useless here
3440 and must be globally applied -- japhy */
3442 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
3443 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3444 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
3445 if (! (wastedflags & wflagbit) ) {
3446 wastedflags |= wflagbit;
3449 "Useless (%s%c) - %suse /%c modifier",
3450 flagsp == &negflags ? "?-" : "?",
3452 flagsp == &negflags ? "don't " : "",
3458 else if (*RExC_parse == 'c') {
3459 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3460 if (! (wastedflags & wasted_c) ) {
3461 wastedflags |= wasted_gc;
3464 "Useless (%sc) - %suse /gc modifier",
3465 flagsp == &negflags ? "?-" : "?",
3466 flagsp == &negflags ? "don't " : ""
3471 else { pmflag(flagsp, *RExC_parse); }
3475 if (*RExC_parse == '-') {
3477 wastedflags = 0; /* reset so (?g-c) warns twice */
3481 RExC_flags |= posflags;
3482 RExC_flags &= ~negflags;
3483 if (*RExC_parse == ':') {
3489 if (*RExC_parse != ')') {
3491 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
3493 nextchar(pRExC_state);
3501 ret = reganode(pRExC_state, OPEN, parno);
3502 Set_Node_Length(ret, 1); /* MJD */
3503 Set_Node_Offset(ret, RExC_parse); /* MJD */
3510 /* Pick up the branches, linking them together. */
3511 parse_start = RExC_parse; /* MJD */
3512 br = regbranch(pRExC_state, &flags, 1);
3513 /* branch_len = (paren != 0); */
3517 if (*RExC_parse == '|') {
3518 if (!SIZE_ONLY && RExC_extralen) {
3519 reginsert(pRExC_state, BRANCHJ, br);
3522 reginsert(pRExC_state, BRANCH, br);
3523 Set_Node_Length(br, paren != 0);
3524 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
3528 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
3530 else if (paren == ':') {
3531 *flagp |= flags&SIMPLE;
3533 if (open) { /* Starts with OPEN. */
3534 regtail(pRExC_state, ret, br); /* OPEN -> first. */
3536 else if (paren != '?') /* Not Conditional */
3538 *flagp |= flags & (SPSTART | HASWIDTH);
3540 while (*RExC_parse == '|') {
3541 if (!SIZE_ONLY && RExC_extralen) {
3542 ender = reganode(pRExC_state, LONGJMP,0);
3543 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
3546 RExC_extralen += 2; /* Account for LONGJMP. */
3547 nextchar(pRExC_state);
3548 br = regbranch(pRExC_state, &flags, 0);
3552 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
3556 *flagp |= flags&SPSTART;
3559 if (have_branch || paren != ':') {
3560 /* Make a closing node, and hook it on the end. */
3563 ender = reg_node(pRExC_state, TAIL);
3566 ender = reganode(pRExC_state, CLOSE, parno);
3567 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
3568 Set_Node_Length(ender,1); /* MJD */
3574 *flagp &= ~HASWIDTH;
3577 ender = reg_node(pRExC_state, SUCCEED);
3580 ender = reg_node(pRExC_state, END);
3583 regtail(pRExC_state, lastbr, ender);
3586 /* Hook the tails of the branches to the closing node. */
3587 for (br = ret; br != NULL; br = regnext(br)) {
3588 regoptail(pRExC_state, br, ender);
3595 static const char parens[] = "=!<,>";
3597 if (paren && (p = strchr(parens, paren))) {
3598 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
3599 int flag = (p - parens) > 1;
3602 node = SUSPEND, flag = 0;
3603 reginsert(pRExC_state, node,ret);
3604 Set_Node_Cur_Length(ret);
3605 Set_Node_Offset(ret, parse_start + 1);
3607 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
3611 /* Check for proper termination. */
3613 RExC_flags = oregflags;
3614 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
3615 RExC_parse = oregcomp_parse;
3616 vFAIL("Unmatched (");
3619 else if (!paren && RExC_parse < RExC_end) {
3620 if (*RExC_parse == ')') {
3622 vFAIL("Unmatched )");
3625 FAIL("Junk on end of regexp"); /* "Can't happen". */
3633 - regbranch - one alternative of an | operator
3635 * Implements the concatenation operator.
3638 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
3640 register regnode *ret;
3641 register regnode *chain = NULL;
3642 register regnode *latest;
3643 I32 flags = 0, c = 0;
3648 if (!SIZE_ONLY && RExC_extralen)
3649 ret = reganode(pRExC_state, BRANCHJ,0);
3651 ret = reg_node(pRExC_state, BRANCH);
3652 Set_Node_Length(ret, 1);
3656 if (!first && SIZE_ONLY)
3657 RExC_extralen += 1; /* BRANCHJ */
3659 *flagp = WORST; /* Tentatively. */
3662 nextchar(pRExC_state);
3663 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
3665 latest = regpiece(pRExC_state, &flags);
3666 if (latest == NULL) {
3667 if (flags & TRYAGAIN)
3671 else if (ret == NULL)
3673 *flagp |= flags&HASWIDTH;
3674 if (chain == NULL) /* First piece. */
3675 *flagp |= flags&SPSTART;
3678 regtail(pRExC_state, chain, latest);
3683 if (chain == NULL) { /* Loop ran zero times. */
3684 chain = reg_node(pRExC_state, NOTHING);
3689 *flagp |= flags&SIMPLE;
3696 - regpiece - something followed by possible [*+?]
3698 * Note that the branching code sequences used for ? and the general cases
3699 * of * and + are somewhat optimized: they use the same NOTHING node as
3700 * both the endmarker for their branch list and the body of the last branch.
3701 * It might seem that this node could be dispensed with entirely, but the
3702 * endmarker role is not redundant.
3705 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
3707 register regnode *ret;
3709 register char *next;
3711 char *origparse = RExC_parse;
3714 I32 max = REG_INFTY;
3717 ret = regatom(pRExC_state, &flags);
3719 if (flags & TRYAGAIN)
3726 if (op == '{' && regcurly(RExC_parse)) {
3727 parse_start = RExC_parse; /* MJD */
3728 next = RExC_parse + 1;
3730 while (isDIGIT(*next) || *next == ',') {
3739 if (*next == '}') { /* got one */
3743 min = atoi(RExC_parse);
3747 maxpos = RExC_parse;
3749 if (!max && *maxpos != '0')
3750 max = REG_INFTY; /* meaning "infinity" */
3751 else if (max >= REG_INFTY)
3752 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
3754 nextchar(pRExC_state);
3757 if ((flags&SIMPLE)) {
3758 RExC_naughty += 2 + RExC_naughty / 2;
3759 reginsert(pRExC_state, CURLY, ret);
3760 Set_Node_Offset(ret, parse_start+1); /* MJD */
3761 Set_Node_Cur_Length(ret);
3764 regnode *w = reg_node(pRExC_state, WHILEM);
3767 regtail(pRExC_state, ret, w);
3768 if (!SIZE_ONLY && RExC_extralen) {
3769 reginsert(pRExC_state, LONGJMP,ret);
3770 reginsert(pRExC_state, NOTHING,ret);
3771 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
3773 reginsert(pRExC_state, CURLYX,ret);
3775 Set_Node_Offset(ret, parse_start+1);
3776 Set_Node_Length(ret,
3777 op == '{' ? (RExC_parse - parse_start) : 1);
3779 if (!SIZE_ONLY && RExC_extralen)
3780 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3781 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
3783 RExC_whilem_seen++, RExC_extralen += 3;
3784 RExC_naughty += 4 + RExC_naughty; /* compound interest */
3792 if (max && max < min)
3793 vFAIL("Can't do {n,m} with n > m");
3795 ARG1_SET(ret, (U16)min);
3796 ARG2_SET(ret, (U16)max);
3808 #if 0 /* Now runtime fix should be reliable. */
3810 /* if this is reinstated, don't forget to put this back into perldiag:
3812 =item Regexp *+ operand could be empty at {#} in regex m/%s/
3814 (F) The part of the regexp subject to either the * or + quantifier
3815 could match an empty string. The {#} shows in the regular
3816 expression about where the problem was discovered.
3820 if (!(flags&HASWIDTH) && op != '?')
3821 vFAIL("Regexp *+ operand could be empty");
3824 parse_start = RExC_parse;
3825 nextchar(pRExC_state);
3827 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
3829 if (op == '*' && (flags&SIMPLE)) {
3830 reginsert(pRExC_state, STAR, ret);
3834 else if (op == '*') {
3838 else if (op == '+' && (flags&SIMPLE)) {
3839 reginsert(pRExC_state, PLUS, ret);
3843 else if (op == '+') {
3847 else if (op == '?') {
3852 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
3854 "%.*s matches null string many times",
3855 RExC_parse - origparse,
3859 if (*RExC_parse == '?') {
3860 nextchar(pRExC_state);
3861 reginsert(pRExC_state, MINMOD, ret);
3862 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
3864 if (ISMULT2(RExC_parse)) {
3866 vFAIL("Nested quantifiers");
3873 - regatom - the lowest level
3875 * Optimization: gobbles an entire sequence of ordinary characters so that
3876 * it can turn them into a single node, which is smaller to store and
3877 * faster to run. Backslashed characters are exceptions, each becoming a
3878 * separate node; the code is simpler that way and it's not worth fixing.
3880 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
3882 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
3884 register regnode *ret = 0;
3886 char *parse_start = RExC_parse;
3888 *flagp = WORST; /* Tentatively. */
3891 switch (*RExC_parse) {
3893 RExC_seen_zerolen++;
3894 nextchar(pRExC_state);
3895 if (RExC_flags & PMf_MULTILINE)
3896 ret = reg_node(pRExC_state, MBOL);
3897 else if (RExC_flags & PMf_SINGLELINE)
3898 ret = reg_node(pRExC_state, SBOL);
3900 ret = reg_node(pRExC_state, BOL);
3901 Set_Node_Length(ret, 1); /* MJD */
3904 nextchar(pRExC_state);
3906 RExC_seen_zerolen++;
3907 if (RExC_flags & PMf_MULTILINE)
3908 ret = reg_node(pRExC_state, MEOL);
3909 else if (RExC_flags & PMf_SINGLELINE)
3910 ret = reg_node(pRExC_state, SEOL);
3912 ret = reg_node(pRExC_state, EOL);
3913 Set_Node_Length(ret, 1); /* MJD */
3916 nextchar(pRExC_state);
3917 if (RExC_flags & PMf_SINGLELINE)
3918 ret = reg_node(pRExC_state, SANY);
3920 ret = reg_node(pRExC_state, REG_ANY);
3921 *flagp |= HASWIDTH|SIMPLE;
3923 Set_Node_Length(ret, 1); /* MJD */
3927 char *oregcomp_parse = ++RExC_parse;
3928 ret = regclass(pRExC_state);
3929 if (*RExC_parse != ']') {
3930 RExC_parse = oregcomp_parse;
3931 vFAIL("Unmatched [");
3933 nextchar(pRExC_state);
3934 *flagp |= HASWIDTH|SIMPLE;
3935 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
3939 nextchar(pRExC_state);
3940 ret = reg(pRExC_state, 1, &flags);
3942 if (flags & TRYAGAIN) {
3943 if (RExC_parse == RExC_end) {
3944 /* Make parent create an empty node if needed. */
3952 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
3956 if (flags & TRYAGAIN) {
3960 vFAIL("Internal urp");
3961 /* Supposed to be caught earlier. */
3964 if (!regcurly(RExC_parse)) {
3973 vFAIL("Quantifier follows nothing");
3976 switch (*++RExC_parse) {
3978 RExC_seen_zerolen++;
3979 ret = reg_node(pRExC_state, SBOL);
3981 nextchar(pRExC_state);
3982 Set_Node_Length(ret, 2); /* MJD */
3985 ret = reg_node(pRExC_state, GPOS);
3986 RExC_seen |= REG_SEEN_GPOS;
3988 nextchar(pRExC_state);
3989 Set_Node_Length(ret, 2); /* MJD */
3992 ret = reg_node(pRExC_state, SEOL);
3994 RExC_seen_zerolen++; /* Do not optimize RE away */
3995 nextchar(pRExC_state);
3998 ret = reg_node(pRExC_state, EOS);
4000 RExC_seen_zerolen++; /* Do not optimize RE away */
4001 nextchar(pRExC_state);
4002 Set_Node_Length(ret, 2); /* MJD */
4005 ret = reg_node(pRExC_state, CANY);
4006 RExC_seen |= REG_SEEN_CANY;
4007 *flagp |= HASWIDTH|SIMPLE;
4008 nextchar(pRExC_state);
4009 Set_Node_Length(ret, 2); /* MJD */
4012 ret = reg_node(pRExC_state, CLUMP);
4014 nextchar(pRExC_state);
4015 Set_Node_Length(ret, 2); /* MJD */
4018 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
4019 *flagp |= HASWIDTH|SIMPLE;
4020 nextchar(pRExC_state);
4021 Set_Node_Length(ret, 2); /* MJD */
4024 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
4025 *flagp |= HASWIDTH|SIMPLE;
4026 nextchar(pRExC_state);
4027 Set_Node_Length(ret, 2); /* MJD */
4030 RExC_seen_zerolen++;
4031 RExC_seen |= REG_SEEN_LOOKBEHIND;
4032 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
4034 nextchar(pRExC_state);
4035 Set_Node_Length(ret, 2); /* MJD */
4038 RExC_seen_zerolen++;
4039 RExC_seen |= REG_SEEN_LOOKBEHIND;
4040 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
4042 nextchar(pRExC_state);
4043 Set_Node_Length(ret, 2); /* MJD */
4046 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
4047 *flagp |= HASWIDTH|SIMPLE;
4048 nextchar(pRExC_state);
4049 Set_Node_Length(ret, 2); /* MJD */
4052 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
4053 *flagp |= HASWIDTH|SIMPLE;
4054 nextchar(pRExC_state);
4055 Set_Node_Length(ret, 2); /* MJD */
4058 ret = reg_node(pRExC_state, DIGIT);
4059 *flagp |= HASWIDTH|SIMPLE;
4060 nextchar(pRExC_state);
4061 Set_Node_Length(ret, 2); /* MJD */
4064 ret = reg_node(pRExC_state, NDIGIT);
4065 *flagp |= HASWIDTH|SIMPLE;
4066 nextchar(pRExC_state);
4067 Set_Node_Length(ret, 2); /* MJD */
4072 char* oldregxend = RExC_end;
4073 char* parse_start = RExC_parse - 2;
4075 if (RExC_parse[1] == '{') {
4076 /* a lovely hack--pretend we saw [\pX] instead */
4077 RExC_end = strchr(RExC_parse, '}');
4079 U8 c = (U8)*RExC_parse;
4081 RExC_end = oldregxend;
4082 vFAIL2("Missing right brace on \\%c{}", c);
4087 RExC_end = RExC_parse + 2;
4088 if (RExC_end > oldregxend)
4089 RExC_end = oldregxend;
4093 ret = regclass(pRExC_state);
4095 RExC_end = oldregxend;
4098 Set_Node_Offset(ret, parse_start + 2);
4099 Set_Node_Cur_Length(ret);
4100 nextchar(pRExC_state);
4101 *flagp |= HASWIDTH|SIMPLE;
4114 case '1': case '2': case '3': case '4':
4115 case '5': case '6': case '7': case '8': case '9':
4117 I32 num = atoi(RExC_parse);
4119 if (num > 9 && num >= RExC_npar)
4122 char * parse_start = RExC_parse - 1; /* MJD */
4123 while (isDIGIT(*RExC_parse))
4126 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
4127 vFAIL("Reference to nonexistent group");
4129 ret = reganode(pRExC_state,
4130 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
4134 /* override incorrect value set in reganode MJD */
4135 Set_Node_Offset(ret, parse_start+1);
4136 Set_Node_Cur_Length(ret); /* MJD */
4138 nextchar(pRExC_state);
4143 if (RExC_parse >= RExC_end)
4144 FAIL("Trailing \\");
4147 /* Do not generate `unrecognized' warnings here, we fall
4148 back into the quick-grab loop below */
4155 if (RExC_flags & PMf_EXTENDED) {
4156 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
4157 if (RExC_parse < RExC_end)
4163 register STRLEN len;
4169 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
4171 parse_start = RExC_parse - 1;
4177 ret = reg_node(pRExC_state,
4178 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
4180 for (len = 0, p = RExC_parse - 1;
4181 len < 127 && p < RExC_end;
4186 if (RExC_flags & PMf_EXTENDED)
4187 p = regwhite(p, RExC_end);
4234 ender = ASCII_TO_NATIVE('\033');
4238 ender = ASCII_TO_NATIVE('\007');
4243 char* e = strchr(p, '}');
4247 vFAIL("Missing right brace on \\x{}");
4250 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4251 | PERL_SCAN_DISALLOW_PREFIX;
4253 ender = grok_hex(p + 1, &numlen, &flags, NULL);
4260 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
4262 ender = grok_hex(p, &numlen, &flags, NULL);
4268 ender = UCHARAT(p++);
4269 ender = toCTRL(ender);
4271 case '0': case '1': case '2': case '3':case '4':
4272 case '5': case '6': case '7': case '8':case '9':
4274 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
4277 ender = grok_oct(p, &numlen, &flags, NULL);
4287 FAIL("Trailing \\");
4290 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
4291 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
4292 goto normal_default;
4297 if (UTF8_IS_START(*p) && UTF) {
4298 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
4306 if (RExC_flags & PMf_EXTENDED)
4307 p = regwhite(p, RExC_end);
4309 /* Prime the casefolded buffer. */
4310 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
4312 if (ISMULT2(p)) { /* Back off on ?+*. */
4319 /* Emit all the Unicode characters. */
4320 for (foldbuf = tmpbuf;
4322 foldlen -= numlen) {
4323 ender = utf8_to_uvchr(foldbuf, &numlen);
4325 reguni(pRExC_state, ender, s, &unilen);
4328 /* In EBCDIC the numlen
4329 * and unilen can differ. */
4331 if (numlen >= foldlen)
4335 break; /* "Can't happen." */
4339 reguni(pRExC_state, ender, s, &unilen);
4348 REGC((char)ender, s++);
4356 /* Emit all the Unicode characters. */
4357 for (foldbuf = tmpbuf;
4359 foldlen -= numlen) {
4360 ender = utf8_to_uvchr(foldbuf, &numlen);
4362 reguni(pRExC_state, ender, s, &unilen);
4365 /* In EBCDIC the numlen
4366 * and unilen can differ. */
4368 if (numlen >= foldlen)
4376 reguni(pRExC_state, ender, s, &unilen);
4385 REGC((char)ender, s++);
4389 Set_Node_Cur_Length(ret); /* MJD */
4390 nextchar(pRExC_state);
4392 /* len is STRLEN which is unsigned, need to copy to signed */
4395 vFAIL("Internal disaster");
4399 if (len == 1 && UNI_IS_INVARIANT(ender))
4404 RExC_size += STR_SZ(len);
4406 RExC_emit += STR_SZ(len);
4411 /* If the encoding pragma is in effect recode the text of
4412 * any EXACT-kind nodes. */
4413 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
4414 STRLEN oldlen = STR_LEN(ret);
4415 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
4419 if (sv_utf8_downgrade(sv, TRUE)) {
4420 char *s = sv_recode_to_utf8(sv, PL_encoding);
4421 STRLEN newlen = SvCUR(sv);
4426 GET_RE_DEBUG_FLAGS_DECL;
4427 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
4428 (int)oldlen, STRING(ret),
4430 Copy(s, STRING(ret), newlen, char);
4431 STR_LEN(ret) += newlen - oldlen;
4432 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
4434 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
4442 S_regwhite(pTHX_ char *p, char *e)
4447 else if (*p == '#') {
4450 } while (p < e && *p != '\n');
4458 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
4459 Character classes ([:foo:]) can also be negated ([:^foo:]).
4460 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
4461 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
4462 but trigger failures because they are currently unimplemented. */
4464 #define POSIXCC_DONE(c) ((c) == ':')
4465 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
4466 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
4469 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
4472 I32 namedclass = OOB_NAMEDCLASS;
4474 if (value == '[' && RExC_parse + 1 < RExC_end &&
4475 /* I smell either [: or [= or [. -- POSIX has been here, right? */
4476 POSIXCC(UCHARAT(RExC_parse))) {
4477 char c = UCHARAT(RExC_parse);
4478 char* s = RExC_parse++;
4480 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
4482 if (RExC_parse == RExC_end)
4483 /* Grandfather lone [:, [=, [. */
4486 char* t = RExC_parse++; /* skip over the c */
4490 if (UCHARAT(RExC_parse) == ']') {
4491 RExC_parse++; /* skip over the ending ] */
4494 I32 complement = *posixcc == '^' ? *posixcc++ : 0;
4495 I32 skip = t - posixcc;
4497 /* Initially switch on the length of the name. */
4500 if (memEQ(posixcc, "word", 4)) {
4501 /* this is not POSIX, this is the Perl \w */;
4503 = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
4507 /* Names all of length 5. */
4508 /* alnum alpha ascii blank cntrl digit graph lower
4509 print punct space upper */
4510 /* Offset 4 gives the best switch position. */
4511 switch (posixcc[4]) {
4513 if (memEQ(posixcc, "alph", 4)) {
4516 = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
4520 if (memEQ(posixcc, "spac", 4)) {
4523 = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
4527 if (memEQ(posixcc, "grap", 4)) {
4530 = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
4534 if (memEQ(posixcc, "asci", 4)) {
4537 = complement ? ANYOF_NASCII : ANYOF_ASCII;
4541 if (memEQ(posixcc, "blan", 4)) {
4544 = complement ? ANYOF_NBLANK : ANYOF_BLANK;
4548 if (memEQ(posixcc, "cntr", 4)) {
4551 = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
4555 if (memEQ(posixcc, "alnu", 4)) {
4558 = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
4562 if (memEQ(posixcc, "lowe", 4)) {
4565 = complement ? ANYOF_NLOWER : ANYOF_LOWER;
4567 if (memEQ(posixcc, "uppe", 4)) {
4570 = complement ? ANYOF_NUPPER : ANYOF_UPPER;
4574 if (memEQ(posixcc, "digi", 4)) {
4577 = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
4579 if (memEQ(posixcc, "prin", 4)) {
4582 = complement ? ANYOF_NPRINT : ANYOF_PRINT;
4584 if (memEQ(posixcc, "punc", 4)) {
4587 = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
4593 if (memEQ(posixcc, "xdigit", 6)) {
4595 = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
4600 if (namedclass == OOB_NAMEDCLASS)
4602 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
4605 assert (posixcc[skip] == ':');
4606 assert (posixcc[skip+1] == ']');
4607 } else if (!SIZE_ONLY) {
4608 /* [[=foo=]] and [[.foo.]] are still future. */
4610 /* adjust RExC_parse so the warning shows after
4612 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
4614 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
4617 /* Maternal grandfather:
4618 * "[:" ending in ":" but not in ":]" */
4628 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
4630 if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
4631 char *s = RExC_parse;
4634 while(*s && isALNUM(*s))
4636 if (*s && c == *s && s[1] == ']') {
4637 if (ckWARN(WARN_REGEXP))
4639 "POSIX syntax [%c %c] belongs inside character classes",
4642 /* [[=foo=]] and [[.foo.]] are still future. */
4643 if (POSIXCC_NOTYET(c)) {
4644 /* adjust RExC_parse so the error shows after
4646 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
4648 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
4655 S_regclass(pTHX_ RExC_state_t *pRExC_state)
4658 register UV nextvalue;
4659 register IV prevvalue = OOB_UNICODE;
4660 register IV range = 0;
4661 register regnode *ret;
4664 char *rangebegin = 0;
4665 bool need_class = 0;
4666 SV *listsv = Nullsv;
4669 bool optimize_invert = TRUE;
4670 AV* unicode_alternate = 0;
4672 UV literal_endpoint = 0;
4675 ret = reganode(pRExC_state, ANYOF, 0);
4678 ANYOF_FLAGS(ret) = 0;
4680 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
4684 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
4688 RExC_size += ANYOF_SKIP;
4690 RExC_emit += ANYOF_SKIP;
4692 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
4694 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
4695 ANYOF_BITMAP_ZERO(ret);
4696 listsv = newSVpvn("# comment\n", 10);
4699 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
4701 if (!SIZE_ONLY && POSIXCC(nextvalue))
4702 checkposixcc(pRExC_state);
4704 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
4705 if (UCHARAT(RExC_parse) == ']')
4708 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
4712 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
4715 rangebegin = RExC_parse;
4717 value = utf8n_to_uvchr((U8*)RExC_parse,
4718 RExC_end - RExC_parse,
4720 RExC_parse += numlen;
4723 value = UCHARAT(RExC_parse++);
4724 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
4725 if (value == '[' && POSIXCC(nextvalue))
4726 namedclass = regpposixcc(pRExC_state, value);
4727 else if (value == '\\') {
4729 value = utf8n_to_uvchr((U8*)RExC_parse,
4730 RExC_end - RExC_parse,
4732 RExC_parse += numlen;
4735 value = UCHARAT(RExC_parse++);
4736 /* Some compilers cannot handle switching on 64-bit integer
4737 * values, therefore value cannot be an UV. Yes, this will
4738 * be a problem later if we want switch on Unicode.
4739 * A similar issue a little bit later when switching on
4740 * namedclass. --jhi */
4741 switch ((I32)value) {
4742 case 'w': namedclass = ANYOF_ALNUM; break;
4743 case 'W': namedclass = ANYOF_NALNUM; break;
4744 case 's': namedclass = ANYOF_SPACE; break;
4745 case 'S': namedclass = ANYOF_NSPACE; break;
4746 case 'd': namedclass = ANYOF_DIGIT; break;
4747 case 'D': namedclass = ANYOF_NDIGIT; break;
4750 if (RExC_parse >= RExC_end)
4751 vFAIL2("Empty \\%c{}", (U8)value);
4752 if (*RExC_parse == '{') {
4754 e = strchr(RExC_parse++, '}');
4756 vFAIL2("Missing right brace on \\%c{}", c);
4757 while (isSPACE(UCHARAT(RExC_parse)))
4759 if (e == RExC_parse)
4760 vFAIL2("Empty \\%c{}", c);
4762 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
4770 if (UCHARAT(RExC_parse) == '^') {
4773 value = value == 'p' ? 'P' : 'p'; /* toggle */
4774 while (isSPACE(UCHARAT(RExC_parse))) {
4780 Perl_sv_catpvf(aTHX_ listsv,
4781 "+utf8::%.*s\n", (int)n, RExC_parse);
4783 Perl_sv_catpvf(aTHX_ listsv,
4784 "!utf8::%.*s\n", (int)n, RExC_parse);
4787 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4788 namedclass = ANYOF_MAX; /* no official name, but it's named */
4790 case 'n': value = '\n'; break;
4791 case 'r': value = '\r'; break;
4792 case 't': value = '\t'; break;
4793 case 'f': value = '\f'; break;
4794 case 'b': value = '\b'; break;
4795 case 'e': value = ASCII_TO_NATIVE('\033');break;
4796 case 'a': value = ASCII_TO_NATIVE('\007');break;
4798 if (*RExC_parse == '{') {
4799 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4800 | PERL_SCAN_DISALLOW_PREFIX;
4801 e = strchr(RExC_parse++, '}');
4803 vFAIL("Missing right brace on \\x{}");
4805 numlen = e - RExC_parse;
4806 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
4810 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
4812 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
4813 RExC_parse += numlen;
4817 value = UCHARAT(RExC_parse++);
4818 value = toCTRL(value);
4820 case '0': case '1': case '2': case '3': case '4':
4821 case '5': case '6': case '7': case '8': case '9':
4825 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
4826 RExC_parse += numlen;
4830 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
4832 "Unrecognized escape \\%c in character class passed through",
4836 } /* end of \blah */
4842 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
4844 if (!SIZE_ONLY && !need_class)
4845 ANYOF_CLASS_ZERO(ret);
4849 /* a bad range like a-\d, a-[:digit:] ? */
4852 if (ckWARN(WARN_REGEXP))
4854 "False [] range \"%*.*s\"",
4855 RExC_parse - rangebegin,
4856 RExC_parse - rangebegin,
4858 if (prevvalue < 256) {
4859 ANYOF_BITMAP_SET(ret, prevvalue);
4860 ANYOF_BITMAP_SET(ret, '-');
4863 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4864 Perl_sv_catpvf(aTHX_ listsv,
4865 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
4869 range = 0; /* this was not a true range */
4873 const char *what = NULL;
4876 if (namedclass > OOB_NAMEDCLASS)
4877 optimize_invert = FALSE;
4878 /* Possible truncation here but in some 64-bit environments
4879 * the compiler gets heartburn about switch on 64-bit values.
4880 * A similar issue a little earlier when switching on value.
4882 switch ((I32)namedclass) {
4885 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
4887 for (value = 0; value < 256; value++)
4889 ANYOF_BITMAP_SET(ret, value);
4896 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
4898 for (value = 0; value < 256; value++)
4899 if (!isALNUM(value))
4900 ANYOF_BITMAP_SET(ret, value);
4907 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
4909 for (value = 0; value < 256; value++)
4910 if (isALNUMC(value))
4911 ANYOF_BITMAP_SET(ret, value);
4918 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
4920 for (value = 0; value < 256; value++)
4921 if (!isALNUMC(value))
4922 ANYOF_BITMAP_SET(ret, value);
4929 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
4931 for (value = 0; value < 256; value++)
4933 ANYOF_BITMAP_SET(ret, value);
4940 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
4942 for (value = 0; value < 256; value++)
4943 if (!isALPHA(value))
4944 ANYOF_BITMAP_SET(ret, value);
4951 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
4954 for (value = 0; value < 128; value++)
4955 ANYOF_BITMAP_SET(ret, value);
4957 for (value = 0; value < 256; value++) {
4959 ANYOF_BITMAP_SET(ret, value);
4968 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
4971 for (value = 128; value < 256; value++)
4972 ANYOF_BITMAP_SET(ret, value);
4974 for (value = 0; value < 256; value++) {
4975 if (!isASCII(value))
4976 ANYOF_BITMAP_SET(ret, value);
4985 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
4987 for (value = 0; value < 256; value++)
4989 ANYOF_BITMAP_SET(ret, value);
4996 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
4998 for (value = 0; value < 256; value++)
4999 if (!isBLANK(value))
5000 ANYOF_BITMAP_SET(ret, value);
5007 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
5009 for (value = 0; value < 256; value++)
5011 ANYOF_BITMAP_SET(ret, value);
5018 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
5020 for (value = 0; value < 256; value++)
5021 if (!isCNTRL(value))
5022 ANYOF_BITMAP_SET(ret, value);
5029 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
5031 /* consecutive digits assumed */
5032 for (value = '0'; value <= '9'; value++)
5033 ANYOF_BITMAP_SET(ret, value);
5040 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
5042 /* consecutive digits assumed */
5043 for (value = 0; value < '0'; value++)
5044 ANYOF_BITMAP_SET(ret, value);
5045 for (value = '9' + 1; value < 256; value++)
5046 ANYOF_BITMAP_SET(ret, value);
5053 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
5055 for (value = 0; value < 256; value++)
5057 ANYOF_BITMAP_SET(ret, value);
5064 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
5066 for (value = 0; value < 256; value++)
5067 if (!isGRAPH(value))
5068 ANYOF_BITMAP_SET(ret, value);
5075 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
5077 for (value = 0; value < 256; value++)
5079 ANYOF_BITMAP_SET(ret, value);
5086 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
5088 for (value = 0; value < 256; value++)
5089 if (!isLOWER(value))
5090 ANYOF_BITMAP_SET(ret, value);
5097 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
5099 for (value = 0; value < 256; value++)
5101 ANYOF_BITMAP_SET(ret, value);
5108 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
5110 for (value = 0; value < 256; value++)
5111 if (!isPRINT(value))
5112 ANYOF_BITMAP_SET(ret, value);
5119 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
5121 for (value = 0; value < 256; value++)
5122 if (isPSXSPC(value))
5123 ANYOF_BITMAP_SET(ret, value);
5130 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
5132 for (value = 0; value < 256; value++)
5133 if (!isPSXSPC(value))
5134 ANYOF_BITMAP_SET(ret, value);
5141 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
5143 for (value = 0; value < 256; value++)
5145 ANYOF_BITMAP_SET(ret, value);
5152 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
5154 for (value = 0; value < 256; value++)
5155 if (!isPUNCT(value))
5156 ANYOF_BITMAP_SET(ret, value);
5163 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
5165 for (value = 0; value < 256; value++)
5167 ANYOF_BITMAP_SET(ret, value);
5174 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
5176 for (value = 0; value < 256; value++)
5177 if (!isSPACE(value))
5178 ANYOF_BITMAP_SET(ret, value);
5185 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
5187 for (value = 0; value < 256; value++)
5189 ANYOF_BITMAP_SET(ret, value);
5196 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
5198 for (value = 0; value < 256; value++)
5199 if (!isUPPER(value))
5200 ANYOF_BITMAP_SET(ret, value);
5207 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
5209 for (value = 0; value < 256; value++)
5210 if (isXDIGIT(value))
5211 ANYOF_BITMAP_SET(ret, value);
5218 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
5220 for (value = 0; value < 256; value++)
5221 if (!isXDIGIT(value))
5222 ANYOF_BITMAP_SET(ret, value);
5228 /* this is to handle \p and \P */
5231 vFAIL("Invalid [::] class");
5235 /* Strings such as "+utf8::isWord\n" */
5236 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
5239 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
5242 } /* end of namedclass \blah */
5245 if (prevvalue > (IV)value) /* b-a */ {
5246 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
5247 RExC_parse - rangebegin,
5248 RExC_parse - rangebegin,
5250 range = 0; /* not a valid range */
5254 prevvalue = value; /* save the beginning of the range */
5255 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
5256 RExC_parse[1] != ']') {
5259 /* a bad range like \w-, [:word:]- ? */
5260 if (namedclass > OOB_NAMEDCLASS) {
5261 if (ckWARN(WARN_REGEXP))
5263 "False [] range \"%*.*s\"",
5264 RExC_parse - rangebegin,
5265 RExC_parse - rangebegin,
5268 ANYOF_BITMAP_SET(ret, '-');
5270 range = 1; /* yeah, it's a range! */
5271 continue; /* but do it the next time */
5275 /* now is the next time */
5279 if (prevvalue < 256) {
5280 IV ceilvalue = value < 256 ? value : 255;
5283 /* In EBCDIC [\x89-\x91] should include
5284 * the \x8e but [i-j] should not. */
5285 if (literal_endpoint == 2 &&
5286 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
5287 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
5289 if (isLOWER(prevvalue)) {
5290 for (i = prevvalue; i <= ceilvalue; i++)
5292 ANYOF_BITMAP_SET(ret, i);
5294 for (i = prevvalue; i <= ceilvalue; i++)
5296 ANYOF_BITMAP_SET(ret, i);
5301 for (i = prevvalue; i <= ceilvalue; i++)
5302 ANYOF_BITMAP_SET(ret, i);
5304 if (value > 255 || UTF) {
5305 UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
5306 UV natvalue = NATIVE_TO_UNI(value);
5308 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
5309 if (prevnatvalue < natvalue) { /* what about > ? */
5310 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
5311 prevnatvalue, natvalue);
5313 else if (prevnatvalue == natvalue) {
5314 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
5316 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
5318 UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
5320 /* If folding and foldable and a single
5321 * character, insert also the folded version
5322 * to the charclass. */
5324 if (foldlen == (STRLEN)UNISKIP(f))
5325 Perl_sv_catpvf(aTHX_ listsv,
5328 /* Any multicharacter foldings
5329 * require the following transform:
5330 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
5331 * where E folds into "pq" and F folds
5332 * into "rst", all other characters
5333 * fold to single characters. We save
5334 * away these multicharacter foldings,
5335 * to be later saved as part of the
5336 * additional "s" data. */
5339 if (!unicode_alternate)
5340 unicode_alternate = newAV();
5341 sv = newSVpvn((char*)foldbuf, foldlen);
5343 av_push(unicode_alternate, sv);
5347 /* If folding and the value is one of the Greek
5348 * sigmas insert a few more sigmas to make the
5349 * folding rules of the sigmas to work right.
5350 * Note that not all the possible combinations
5351 * are handled here: some of them are handled
5352 * by the standard folding rules, and some of
5353 * them (literal or EXACTF cases) are handled
5354 * during runtime in regexec.c:S_find_byclass(). */
5355 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
5356 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5357 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
5358 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5359 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
5361 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
5362 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5363 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
5368 literal_endpoint = 0;
5372 range = 0; /* this range (if it was one) is done now */
5376 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
5378 RExC_size += ANYOF_CLASS_ADD_SKIP;
5380 RExC_emit += ANYOF_CLASS_ADD_SKIP;
5383 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
5385 /* If the only flag is folding (plus possibly inversion). */
5386 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
5388 for (value = 0; value < 256; ++value) {
5389 if (ANYOF_BITMAP_TEST(ret, value)) {
5390 UV fold = PL_fold[value];
5393 ANYOF_BITMAP_SET(ret, fold);
5396 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
5399 /* optimize inverted simple patterns (e.g. [^a-z]) */
5400 if (!SIZE_ONLY && optimize_invert &&
5401 /* If the only flag is inversion. */
5402 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
5403 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
5404 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
5405 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
5412 /* The 0th element stores the character class description
5413 * in its textual form: used later (regexec.c:Perl_regclass_swash())
5414 * to initialize the appropriate swash (which gets stored in
5415 * the 1st element), and also useful for dumping the regnode.
5416 * The 2nd element stores the multicharacter foldings,
5417 * used later (regexec.c:S_reginclass()). */
5418 av_store(av, 0, listsv);
5419 av_store(av, 1, NULL);
5420 av_store(av, 2, (SV*)unicode_alternate);
5421 rv = newRV_noinc((SV*)av);
5422 n = add_data(pRExC_state, 1, "s");
5423 RExC_rx->data->data[n] = (void*)rv;
5431 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
5433 char* retval = RExC_parse++;
5436 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
5437 RExC_parse[2] == '#') {
5438 while (*RExC_parse != ')') {
5439 if (RExC_parse == RExC_end)
5440 FAIL("Sequence (?#... not terminated");
5446 if (RExC_flags & PMf_EXTENDED) {
5447 if (isSPACE(*RExC_parse)) {
5451 else if (*RExC_parse == '#') {
5452 while (RExC_parse < RExC_end)
5453 if (*RExC_parse++ == '\n') break;
5462 - reg_node - emit a node
5464 STATIC regnode * /* Location. */
5465 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
5467 register regnode *ret;
5468 register regnode *ptr;
5472 SIZE_ALIGN(RExC_size);
5477 NODE_ALIGN_FILL(ret);
5479 FILL_ADVANCE_NODE(ptr, op);
5480 if (RExC_offsets) { /* MJD */
5481 MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
5482 "reg_node", __LINE__,
5484 RExC_emit - RExC_emit_start > RExC_offsets[0]
5485 ? "Overwriting end of array!\n" : "OK",
5486 RExC_emit - RExC_emit_start,
5487 RExC_parse - RExC_start,
5489 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
5498 - reganode - emit a node with an argument
5500 STATIC regnode * /* Location. */
5501 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
5503 register regnode *ret;
5504 register regnode *ptr;
5508 SIZE_ALIGN(RExC_size);
5513 NODE_ALIGN_FILL(ret);
5515 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
5516 if (RExC_offsets) { /* MJD */
5517 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
5521 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
5522 "Overwriting end of array!\n" : "OK",
5523 RExC_emit - RExC_emit_start,
5524 RExC_parse - RExC_start,
5526 Set_Cur_Node_Offset;
5535 - reguni - emit (if appropriate) a Unicode character
5538 S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
5540 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
5544 - reginsert - insert an operator in front of already-emitted operand
5546 * Means relocating the operand.
5549 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
5551 register regnode *src;
5552 register regnode *dst;
5553 register regnode *place;
5554 register int offset = regarglen[(U8)op];
5556 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
5559 RExC_size += NODE_STEP_REGNODE + offset;
5564 RExC_emit += NODE_STEP_REGNODE + offset;
5566 while (src > opnd) {
5567 StructCopy(--src, --dst, regnode);
5568 if (RExC_offsets) { /* MJD 20010112 */
5569 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
5573 dst - RExC_emit_start > RExC_offsets[0]
5574 ? "Overwriting end of array!\n" : "OK",
5575 src - RExC_emit_start,
5576 dst - RExC_emit_start,
5578 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
5579 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
5584 place = opnd; /* Op node, where operand used to be. */
5585 if (RExC_offsets) { /* MJD */
5586 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
5590 place - RExC_emit_start > RExC_offsets[0]
5591 ? "Overwriting end of array!\n" : "OK",
5592 place - RExC_emit_start,
5593 RExC_parse - RExC_start,
5595 Set_Node_Offset(place, RExC_parse);
5596 Set_Node_Length(place, 1);
5598 src = NEXTOPER(place);
5599 FILL_ADVANCE_NODE(place, op);
5600 Zero(src, offset, regnode);
5604 - regtail - set the next-pointer at the end of a node chain of p to val.
5607 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
5609 register regnode *scan;
5610 register regnode *temp;
5615 /* Find last node. */
5618 temp = regnext(scan);
5624 if (reg_off_by_arg[OP(scan)]) {
5625 ARG_SET(scan, val - scan);
5628 NEXT_OFF(scan) = val - scan;
5633 - regoptail - regtail on operand of first argument; nop if operandless
5636 S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
5638 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
5639 if (p == NULL || SIZE_ONLY)
5641 if (PL_regkind[(U8)OP(p)] == BRANCH) {
5642 regtail(pRExC_state, NEXTOPER(p), val);
5644 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
5645 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
5652 - regcurly - a little FSA that accepts {\d+,?\d*}
5655 S_regcurly(pTHX_ register char *s)
5676 S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
5678 register U8 op = EXACT; /* Arbitrary non-END op. */
5679 register regnode *next;
5681 while (op != END && (!last || node < last)) {
5682 /* While that wasn't END last time... */
5688 next = regnext(node);
5690 if (OP(node) == OPTIMIZED)
5693 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
5694 (int)(2*l + 1), "", SvPVX(sv));
5695 if (next == NULL) /* Next ptr. */
5696 PerlIO_printf(Perl_debug_log, "(0)");
5698 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
5699 (void)PerlIO_putc(Perl_debug_log, '\n');
5701 if (PL_regkind[(U8)op] == BRANCHJ) {
5702 register regnode *nnode = (OP(next) == LONGJMP
5705 if (last && nnode > last)
5707 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
5709 else if (PL_regkind[(U8)op] == BRANCH) {
5710 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
5712 else if ( PL_regkind[(U8)op] == TRIE ) {
5713 const I32 n = ARG(node);
5714 const reg_trie_data *trie = (reg_trie_data*)PL_regdata->data[n];
5715 const I32 arry_len = av_len(trie->words)+1;
5717 PerlIO_printf(Perl_debug_log,
5718 "%*s[Words:%d Chars Stored:%d Unique Chars:%d States:%d%s]\n",
5722 trie->uniquecharcount,
5724 node->flags ? " EVAL mode" : "");
5726 for (word_idx=0; word_idx < arry_len; word_idx++) {
5727 SV **elem_ptr=av_fetch(trie->words,word_idx,0);
5729 PerlIO_printf(Perl_debug_log, "%*s<%s%s%s>\n",
5732 SvPV_nolen(*elem_ptr),
5737 PerlIO_printf(Perl_debug_log, "(0)\n");
5739 PerlIO_printf(Perl_debug_log, "(%"IVdf")\n", (IV)(next - start));
5745 node = NEXTOPER(node);
5746 node += regarglen[(U8)op];
5749 else if ( op == CURLY) { /* `next' might be very big: optimizer */
5750 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
5751 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
5753 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
5754 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
5757 else if ( op == PLUS || op == STAR) {
5758 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
5760 else if (op == ANYOF) {
5761 /* arglen 1 + class block */
5762 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
5763 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
5764 node = NEXTOPER(node);
5766 else if (PL_regkind[(U8)op] == EXACT) {
5767 /* Literal string, where present. */
5768 node += NODE_SZ_STR(node) - 1;
5769 node = NEXTOPER(node);
5772 node = NEXTOPER(node);
5773 node += regarglen[(U8)op];
5775 if (op == CURLYX || op == OPEN)
5777 else if (op == WHILEM)
5783 #endif /* DEBUGGING */
5786 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
5789 Perl_regdump(pTHX_ regexp *r)
5792 SV *sv = sv_newmortal();
5794 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
5796 /* Header fields of interest. */
5797 if (r->anchored_substr)
5798 PerlIO_printf(Perl_debug_log,
5799 "anchored `%s%.*s%s'%s at %"IVdf" ",
5801 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
5802 SvPVX(r->anchored_substr),
5804 SvTAIL(r->anchored_substr) ? "$" : "",
5805 (IV)r->anchored_offset);
5806 else if (r->anchored_utf8)
5807 PerlIO_printf(Perl_debug_log,
5808 "anchored utf8 `%s%.*s%s'%s at %"IVdf" ",
5810 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
5811 SvPVX(r->anchored_utf8),
5813 SvTAIL(r->anchored_utf8) ? "$" : "",
5814 (IV)r->anchored_offset);
5815 if (r->float_substr)
5816 PerlIO_printf(Perl_debug_log,
5817 "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
5819 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
5820 SvPVX(r->float_substr),
5822 SvTAIL(r->float_substr) ? "$" : "",
5823 (IV)r->float_min_offset, (UV)r->float_max_offset);
5824 else if (r->float_utf8)
5825 PerlIO_printf(Perl_debug_log,
5826 "floating utf8 `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
5828 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
5829 SvPVX(r->float_utf8),
5831 SvTAIL(r->float_utf8) ? "$" : "",
5832 (IV)r->float_min_offset, (UV)r->float_max_offset);
5833 if (r->check_substr || r->check_utf8)
5834 PerlIO_printf(Perl_debug_log,
5835 r->check_substr == r->float_substr
5836 && r->check_utf8 == r->float_utf8
5837 ? "(checking floating" : "(checking anchored");
5838 if (r->reganch & ROPT_NOSCAN)
5839 PerlIO_printf(Perl_debug_log, " noscan");
5840 if (r->reganch & ROPT_CHECK_ALL)
5841 PerlIO_printf(Perl_debug_log, " isall");
5842 if (r->check_substr || r->check_utf8)
5843 PerlIO_printf(Perl_debug_log, ") ");
5845 if (r->regstclass) {
5846 regprop(sv, r->regstclass);
5847 PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
5849 if (r->reganch & ROPT_ANCH) {
5850 PerlIO_printf(Perl_debug_log, "anchored");
5851 if (r->reganch & ROPT_ANCH_BOL)
5852 PerlIO_printf(Perl_debug_log, "(BOL)");
5853 if (r->reganch & ROPT_ANCH_MBOL)
5854 PerlIO_printf(Perl_debug_log, "(MBOL)");
5855 if (r->reganch & ROPT_ANCH_SBOL)
5856 PerlIO_printf(Perl_debug_log, "(SBOL)");
5857 if (r->reganch & ROPT_ANCH_GPOS)
5858 PerlIO_printf(Perl_debug_log, "(GPOS)");
5859 PerlIO_putc(Perl_debug_log, ' ');
5861 if (r->reganch & ROPT_GPOS_SEEN)
5862 PerlIO_printf(Perl_debug_log, "GPOS ");
5863 if (r->reganch & ROPT_SKIP)
5864 PerlIO_printf(Perl_debug_log, "plus ");
5865 if (r->reganch & ROPT_IMPLICIT)
5866 PerlIO_printf(Perl_debug_log, "implicit ");
5867 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
5868 if (r->reganch & ROPT_EVAL_SEEN)
5869 PerlIO_printf(Perl_debug_log, "with eval ");
5870 PerlIO_printf(Perl_debug_log, "\n");
5873 const U32 len = r->offsets[0];
5874 GET_RE_DEBUG_FLAGS_DECL;
5876 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
5877 for (i = 1; i <= len; i++)
5878 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
5879 (UV)r->offsets[i*2-1],
5880 (UV)r->offsets[i*2]);
5881 PerlIO_printf(Perl_debug_log, "\n");
5884 #endif /* DEBUGGING */
5890 S_put_byte(pTHX_ SV *sv, int c)
5892 if (isCNTRL(c) || c == 255 || !isPRINT(c))
5893 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
5894 else if (c == '-' || c == ']' || c == '\\' || c == '^')
5895 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
5897 Perl_sv_catpvf(aTHX_ sv, "%c", c);
5900 #endif /* DEBUGGING */
5904 - regprop - printable representation of opcode
5907 Perl_regprop(pTHX_ SV *sv, regnode *o)
5912 sv_setpvn(sv, "", 0);
5913 if (OP(o) >= reg_num) /* regnode.type is unsigned */
5914 /* It would be nice to FAIL() here, but this may be called from
5915 regexec.c, and it would be hard to supply pRExC_state. */
5916 Perl_croak(aTHX_ "Corrupted regexp opcode");
5917 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
5919 k = PL_regkind[(U8)OP(o)];
5922 SV *dsv = sv_2mortal(newSVpvn("", 0));
5923 /* Using is_utf8_string() is a crude hack but it may
5924 * be the best for now since we have no flag "this EXACTish
5925 * node was UTF-8" --jhi */
5926 bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
5928 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
5929 UNI_DISPLAY_REGEX) :
5931 const int len = do_utf8 ?
5934 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
5938 } else if (k == TRIE) {/*
5939 this isn't always safe, as Pl_regdata may not be for this regex yet
5940 (depending on where its called from) so its being moved to dumpuntil
5942 reg_trie_data *trie=(reg_trie_data*)PL_regdata->data[n];
5943 Perl_sv_catpvf(aTHX_ sv, " (W:%d L:%d C:%d S:%d)",
5946 trie->uniquecharcount,
5949 } else if (k == CURLY) {
5950 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
5951 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
5952 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
5954 else if (k == WHILEM && o->flags) /* Ordinal/of */
5955 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
5956 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
5957 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
5958 else if (k == LOGICAL)
5959 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
5960 else if (k == ANYOF) {
5961 int i, rangestart = -1;
5962 U8 flags = ANYOF_FLAGS(o);
5963 const char * const anyofs[] = { /* Should be synchronized with
5964 * ANYOF_ #xdefines in regcomp.h */
5997 if (flags & ANYOF_LOCALE)
5998 sv_catpv(sv, "{loc}");
5999 if (flags & ANYOF_FOLD)
6000 sv_catpv(sv, "{i}");
6001 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
6002 if (flags & ANYOF_INVERT)
6004 for (i = 0; i <= 256; i++) {
6005 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
6006 if (rangestart == -1)
6008 } else if (rangestart != -1) {
6009 if (i <= rangestart + 3)
6010 for (; rangestart < i; rangestart++)
6011 put_byte(sv, rangestart);
6013 put_byte(sv, rangestart);
6015 put_byte(sv, i - 1);
6021 if (o->flags & ANYOF_CLASS)
6022 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
6023 if (ANYOF_CLASS_TEST(o,i))
6024 sv_catpv(sv, anyofs[i]);
6026 if (flags & ANYOF_UNICODE)
6027 sv_catpv(sv, "{unicode}");
6028 else if (flags & ANYOF_UNICODE_ALL)
6029 sv_catpv(sv, "{unicode_all}");
6033 SV *sw = regclass_swash(o, FALSE, &lv, 0);
6037 U8 s[UTF8_MAXBYTES_CASE+1];
6039 for (i = 0; i <= 256; i++) { /* just the first 256 */
6040 U8 *e = uvchr_to_utf8(s, i);
6042 if (i < 256 && swash_fetch(sw, s, TRUE)) {
6043 if (rangestart == -1)
6045 } else if (rangestart != -1) {
6048 if (i <= rangestart + 3)
6049 for (; rangestart < i; rangestart++) {
6050 for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
6054 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
6057 for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
6064 sv_catpv(sv, "..."); /* et cetera */
6068 char *s = savesvpv(lv);
6071 while(*s && *s != '\n') s++;
6092 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
6094 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
6095 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
6096 #endif /* DEBUGGING */
6100 Perl_re_intuit_string(pTHX_ regexp *prog)
6101 { /* Assume that RE_INTUIT is set */
6102 GET_RE_DEBUG_FLAGS_DECL;
6105 char *s = SvPV(prog->check_substr
6106 ? prog->check_substr : prog->check_utf8, n_a);
6108 if (!PL_colorset) reginitcolors();
6109 PerlIO_printf(Perl_debug_log,
6110 "%sUsing REx %ssubstr:%s `%s%.60s%s%s'\n",
6112 prog->check_substr ? "" : "utf8 ",
6113 PL_colors[5],PL_colors[0],
6116 (strlen(s) > 60 ? "..." : ""));
6119 return prog->check_substr ? prog->check_substr : prog->check_utf8;
6123 Perl_pregfree(pTHX_ struct regexp *r)
6126 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
6127 SV *re_debug_flags=get_sv(RE_DEBUG_FLAGS,0);
6131 if (!r || (--r->refcnt > 0))
6133 DEBUG_r(if (re_debug_flags && (SvIV(re_debug_flags) & RE_DEBUG_COMPILE)) {
6134 const char *s = (r->reganch & ROPT_UTF8)
6135 ? pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60, UNI_DISPLAY_REGEX)
6136 : pv_display(dsv, r->precomp, r->prelen, 0, 60);
6137 const int len = SvCUR(dsv);
6140 PerlIO_printf(Perl_debug_log,
6141 "%sFreeing REx:%s %s%*.*s%s%s\n",
6142 PL_colors[4],PL_colors[5],PL_colors[0],
6145 len > 60 ? "..." : "");
6149 Safefree(r->precomp);
6150 if (r->offsets) /* 20010421 MJD */
6151 Safefree(r->offsets);
6152 RX_MATCH_COPY_FREE(r);
6153 #ifdef PERL_COPY_ON_WRITE
6155 SvREFCNT_dec(r->saved_copy);
6158 if (r->anchored_substr)
6159 SvREFCNT_dec(r->anchored_substr);
6160 if (r->anchored_utf8)
6161 SvREFCNT_dec(r->anchored_utf8);
6162 if (r->float_substr)
6163 SvREFCNT_dec(r->float_substr);
6165 SvREFCNT_dec(r->float_utf8);
6166 Safefree(r->substrs);
6169 int n = r->data->count;
6170 PAD* new_comppad = NULL;
6175 /* If you add a ->what type here, update the comment in regcomp.h */
6176 switch (r->data->what[n]) {
6178 SvREFCNT_dec((SV*)r->data->data[n]);
6181 Safefree(r->data->data[n]);
6184 new_comppad = (AV*)r->data->data[n];
6187 if (new_comppad == NULL)
6188 Perl_croak(aTHX_ "panic: pregfree comppad");
6189 PAD_SAVE_LOCAL(old_comppad,
6190 /* Watch out for global destruction's random ordering. */
6191 (SvTYPE(new_comppad) == SVt_PVAV) ?
6192 new_comppad : Null(PAD *)
6195 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
6198 op_free((OP_4tree*)r->data->data[n]);
6200 PAD_RESTORE_LOCAL(old_comppad);
6201 SvREFCNT_dec((SV*)new_comppad);
6208 reg_trie_data *trie=(reg_trie_data*)r->data->data[n];
6211 refcount = trie->refcount--;
6215 Safefree(trie->charmap);
6216 if (trie->widecharmap)
6217 SvREFCNT_dec((SV*)trie->widecharmap);
6219 Safefree(trie->states);
6221 Safefree(trie->trans);
6224 SvREFCNT_dec((SV*)trie->words);
6225 if (trie->revcharmap)
6226 SvREFCNT_dec((SV*)trie->revcharmap);
6228 Safefree(r->data->data[n]); /* do this last!!!! */
6233 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
6236 Safefree(r->data->what);
6239 Safefree(r->startp);
6245 - regnext - dig the "next" pointer out of a node
6248 Perl_regnext(pTHX_ register regnode *p)
6250 register I32 offset;
6252 if (p == &PL_regdummy)
6255 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
6263 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
6266 STRLEN l1 = strlen(pat1);
6267 STRLEN l2 = strlen(pat2);
6270 const char *message;
6276 Copy(pat1, buf, l1 , char);
6277 Copy(pat2, buf + l1, l2 , char);
6278 buf[l1 + l2] = '\n';
6279 buf[l1 + l2 + 1] = '\0';
6281 /* ANSI variant takes additional second argument */
6282 va_start(args, pat2);
6286 msv = vmess(buf, &args);
6288 message = SvPV(msv,l1);
6291 Copy(message, buf, l1 , char);
6292 buf[l1-1] = '\0'; /* Overwrite \n */
6293 Perl_croak(aTHX_ "%s", buf);
6296 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
6299 Perl_save_re_context(pTHX)
6301 SAVEI32(PL_reg_flags); /* from regexec.c */
6303 SAVEPPTR(PL_reginput); /* String-input pointer. */
6304 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
6305 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
6306 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
6307 SAVEVPTR(PL_regendp); /* Ditto for endp. */
6308 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
6309 SAVEVPTR(PL_reglastcloseparen); /* Similarly for lastcloseparen. */
6310 SAVEPPTR(PL_regtill); /* How far we are required to go. */
6311 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
6312 PL_reg_start_tmp = 0;
6313 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
6314 PL_reg_start_tmpl = 0;
6315 SAVEVPTR(PL_regdata);
6316 SAVEI32(PL_reg_eval_set); /* from regexec.c */
6317 SAVEI32(PL_regnarrate); /* from regexec.c */
6318 SAVEVPTR(PL_regprogram); /* from regexec.c */
6319 SAVEINT(PL_regindent); /* from regexec.c */
6320 SAVEVPTR(PL_regcc); /* from regexec.c */
6321 SAVEVPTR(PL_curcop);
6322 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
6323 SAVEVPTR(PL_reg_re); /* from regexec.c */
6324 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
6325 SAVESPTR(PL_reg_sv); /* from regexec.c */
6326 SAVEBOOL(PL_reg_match_utf8); /* from regexec.c */
6327 SAVEVPTR(PL_reg_magic); /* from regexec.c */
6328 SAVEI32(PL_reg_oldpos); /* from regexec.c */
6329 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
6330 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
6331 SAVEPPTR(PL_reg_oldsaved); /* old saved substr during match */
6332 PL_reg_oldsaved = Nullch;
6333 SAVEI32(PL_reg_oldsavedlen); /* old length of saved substr during match */
6334 PL_reg_oldsavedlen = 0;
6335 #ifdef PERL_COPY_ON_WRITE
6339 SAVEI32(PL_reg_maxiter); /* max wait until caching pos */
6341 SAVEI32(PL_reg_leftiter); /* wait until caching pos */
6342 PL_reg_leftiter = 0;
6343 SAVEGENERICPV(PL_reg_poscache); /* cache of pos of WHILEM */
6344 PL_reg_poscache = Nullch;
6345 SAVEI32(PL_reg_poscache_size); /* size of pos cache of WHILEM */
6346 PL_reg_poscache_size = 0;
6347 SAVEPPTR(PL_regprecomp); /* uncompiled string. */
6348 SAVEI32(PL_regnpar); /* () count. */
6349 SAVEI32(PL_regsize); /* from regexec.c */
6352 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
6356 char digits[TYPE_CHARS(long)];
6358 if (PL_curpm && (rx = PM_GETRE(PL_curpm))) {
6359 for (i = 1; i <= rx->nparens; i++) {
6360 sprintf(digits, "%lu", (long)i);
6361 if ((mgv = gv_fetchpv(digits, FALSE, SVt_PV)))
6368 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
6373 clear_re(pTHX_ void *r)
6375 ReREFCNT_dec((regexp *)r);
6380 * c-indentation-style: bsd
6382 * indent-tabs-mode: t
6385 * vim: shiftwidth=4: