5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
8 /* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
33 /* need to replace pregcomp et al, so enable that */
34 # ifndef PERL_IN_XSUB_RE
35 # define PERL_IN_XSUB_RE
37 /* need access to debugger hooks */
38 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
43 #ifdef PERL_IN_XSUB_RE
44 /* We *really* need to overwrite these symbols: */
45 # define Perl_pregcomp my_regcomp
46 # define Perl_regdump my_regdump
47 # define Perl_regprop my_regprop
48 # define Perl_pregfree my_regfree
49 # define Perl_re_intuit_string my_re_intuit_string
50 /* *These* symbols are masked to allow static link. */
51 # define Perl_regnext my_regnext
52 # define Perl_save_re_context my_save_re_context
53 # define Perl_reginitcolors my_reginitcolors
55 # define PERL_NO_GET_CONTEXT
60 * pregcomp and pregexec -- regsub and regerror are not used in perl
62 * Copyright (c) 1986 by University of Toronto.
63 * Written by Henry Spencer. Not derived from licensed software.
65 * Permission is granted to anyone to use this software for any
66 * purpose on any computer system, and to redistribute it freely,
67 * subject to the following restrictions:
69 * 1. The author is not responsible for the consequences of use of
70 * this software, no matter how awful, even if they arise
73 * 2. The origin of this software must not be misrepresented, either
74 * by explicit claim or by omission.
76 * 3. Altered versions must be plainly marked as such, and must not
77 * be misrepresented as being the original software.
80 **** Alterations to Henry's code are...
82 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
83 **** 2000, 2001, 2002, 2003, 2004, 2005, by Larry Wall and others
85 **** You may distribute under the terms of either the GNU General Public
86 **** License or the Artistic License, as specified in the README file.
89 * Beware that some of this code is subtly aware of the way operator
90 * precedence is structured in regular expressions. Serious changes in
91 * regular-expression syntax might require a total rethink.
94 #define PERL_IN_REGCOMP_C
97 #ifndef PERL_IN_XSUB_RE
109 # if defined(BUGGY_MSC6)
110 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
111 # pragma optimize("a",off)
112 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
113 # pragma optimize("w",on )
114 # endif /* BUGGY_MSC6 */
118 #define STATIC static
121 typedef struct RExC_state_t {
122 U32 flags; /* are we folding, multilining? */
123 char *precomp; /* uncompiled string. */
125 char *start; /* Start of input for compile */
126 char *end; /* End of input for compile */
127 char *parse; /* Input-scan pointer. */
128 I32 whilem_seen; /* number of WHILEM in this expr */
129 regnode *emit_start; /* Start of emitted-code area */
130 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
131 I32 naughty; /* How bad is this pattern? */
132 I32 sawback; /* Did we see \1, ...? */
134 I32 size; /* Code size. */
135 I32 npar; /* () count. */
141 char *starttry; /* -Dr: where regtry was called. */
142 #define RExC_starttry (pRExC_state->starttry)
146 #define RExC_flags (pRExC_state->flags)
147 #define RExC_precomp (pRExC_state->precomp)
148 #define RExC_rx (pRExC_state->rx)
149 #define RExC_start (pRExC_state->start)
150 #define RExC_end (pRExC_state->end)
151 #define RExC_parse (pRExC_state->parse)
152 #define RExC_whilem_seen (pRExC_state->whilem_seen)
153 #define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
154 #define RExC_emit (pRExC_state->emit)
155 #define RExC_emit_start (pRExC_state->emit_start)
156 #define RExC_naughty (pRExC_state->naughty)
157 #define RExC_sawback (pRExC_state->sawback)
158 #define RExC_seen (pRExC_state->seen)
159 #define RExC_size (pRExC_state->size)
160 #define RExC_npar (pRExC_state->npar)
161 #define RExC_extralen (pRExC_state->extralen)
162 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
163 #define RExC_seen_evals (pRExC_state->seen_evals)
164 #define RExC_utf8 (pRExC_state->utf8)
166 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
167 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
168 ((*s) == '{' && regcurly(s)))
171 #undef SPSTART /* dratted cpp namespace... */
174 * Flags to be passed up and down.
176 #define WORST 0 /* Worst case. */
177 #define HASWIDTH 0x1 /* Known to match non-null strings. */
178 #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
179 #define SPSTART 0x4 /* Starts with * or +. */
180 #define TRYAGAIN 0x8 /* Weeded out a declaration. */
182 /* Length of a variant. */
184 typedef struct scan_data_t {
190 I32 last_end; /* min value, <0 unless valid. */
193 SV **longest; /* Either &l_fixed, or &l_float. */
197 I32 offset_float_min;
198 I32 offset_float_max;
202 struct regnode_charclass_class *start_class;
206 * Forward declarations for pregcomp()'s friends.
209 static const scan_data_t zero_scan_data =
210 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
212 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
213 #define SF_BEFORE_SEOL 0x1
214 #define SF_BEFORE_MEOL 0x2
215 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
216 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
219 # define SF_FIX_SHIFT_EOL (0+2)
220 # define SF_FL_SHIFT_EOL (0+4)
222 # define SF_FIX_SHIFT_EOL (+2)
223 # define SF_FL_SHIFT_EOL (+4)
226 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
227 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
229 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
230 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
231 #define SF_IS_INF 0x40
232 #define SF_HAS_PAR 0x80
233 #define SF_IN_PAR 0x100
234 #define SF_HAS_EVAL 0x200
235 #define SCF_DO_SUBSTR 0x400
236 #define SCF_DO_STCLASS_AND 0x0800
237 #define SCF_DO_STCLASS_OR 0x1000
238 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
239 #define SCF_WHILEM_VISITED_POS 0x2000
241 #define UTF (RExC_utf8 != 0)
242 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
243 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
245 #define OOB_UNICODE 12345678
246 #define OOB_NAMEDCLASS -1
248 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
249 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
252 /* length of regex to show in messages that don't mark a position within */
253 #define RegexLengthToShowInErrorMessages 127
256 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
257 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
258 * op/pragma/warn/regcomp.
260 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
261 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
263 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
266 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
267 * arg. Show regex, up to a maximum length. If it's too long, chop and add
270 #define FAIL(msg) STMT_START { \
271 const char *ellipses = ""; \
272 IV len = RExC_end - RExC_precomp; \
275 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
276 if (len > RegexLengthToShowInErrorMessages) { \
277 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
278 len = RegexLengthToShowInErrorMessages - 10; \
281 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
282 msg, (int)len, RExC_precomp, ellipses); \
286 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
287 * args. Show regex, up to a maximum length. If it's too long, chop and add
290 #define FAIL2(pat,msg) STMT_START { \
291 const char *ellipses = ""; \
292 IV len = RExC_end - RExC_precomp; \
295 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
296 if (len > RegexLengthToShowInErrorMessages) { \
297 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
298 len = RegexLengthToShowInErrorMessages - 10; \
301 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
302 msg, (int)len, RExC_precomp, ellipses); \
307 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
309 #define Simple_vFAIL(m) STMT_START { \
310 IV offset = RExC_parse - RExC_precomp; \
311 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
312 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
316 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
318 #define vFAIL(m) STMT_START { \
320 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
325 * Like Simple_vFAIL(), but accepts two arguments.
327 #define Simple_vFAIL2(m,a1) STMT_START { \
328 IV offset = RExC_parse - RExC_precomp; \
329 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
330 (int)offset, RExC_precomp, RExC_precomp + offset); \
334 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
336 #define vFAIL2(m,a1) STMT_START { \
338 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
339 Simple_vFAIL2(m, a1); \
344 * Like Simple_vFAIL(), but accepts three arguments.
346 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
347 IV offset = RExC_parse - RExC_precomp; \
348 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
349 (int)offset, RExC_precomp, RExC_precomp + offset); \
353 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
355 #define vFAIL3(m,a1,a2) STMT_START { \
357 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
358 Simple_vFAIL3(m, a1, a2); \
362 * Like Simple_vFAIL(), but accepts four arguments.
364 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
365 IV offset = RExC_parse - RExC_precomp; \
366 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
367 (int)offset, RExC_precomp, RExC_precomp + offset); \
371 * Like Simple_vFAIL(), but accepts five arguments.
373 #define Simple_vFAIL5(m, a1, a2, a3, a4) STMT_START { \
374 IV offset = RExC_parse - RExC_precomp; \
375 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4, \
376 (int)offset, RExC_precomp, RExC_precomp + offset); \
380 #define vWARN(loc,m) STMT_START { \
381 IV offset = loc - RExC_precomp; \
382 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
383 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
386 #define vWARNdep(loc,m) STMT_START { \
387 IV offset = loc - RExC_precomp; \
388 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
389 "%s" REPORT_LOCATION, \
390 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
394 #define vWARN2(loc, m, a1) STMT_START { \
395 IV offset = loc - RExC_precomp; \
396 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
397 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
400 #define vWARN3(loc, m, a1, a2) STMT_START { \
401 IV offset = loc - RExC_precomp; \
402 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
403 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
406 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
407 IV offset = loc - RExC_precomp; \
408 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
409 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
412 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
413 IV offset = loc - RExC_precomp; \
414 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
415 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
419 /* Allow for side effects in s */
420 #define REGC(c,s) STMT_START { \
421 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
424 /* Macros for recording node offsets. 20001227 mjd@plover.com
425 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
426 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
427 * Element 0 holds the number n.
430 #define MJD_OFFSET_DEBUG(x)
431 /* #define MJD_OFFSET_DEBUG(x) DEBUG_r(Perl_warn_nocontext x) */
434 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
436 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
437 __LINE__, (node), (byte))); \
439 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
441 RExC_offsets[2*(node)-1] = (byte); \
446 #define Set_Node_Offset(node,byte) \
447 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
448 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
450 #define Set_Node_Length_To_R(node,len) STMT_START { \
452 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
453 __LINE__, (node), (len))); \
455 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
457 RExC_offsets[2*(node)] = (len); \
462 #define Set_Node_Length(node,len) \
463 Set_Node_Length_To_R((node)-RExC_emit_start, len)
464 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
465 #define Set_Node_Cur_Length(node) \
466 Set_Node_Length(node, RExC_parse - parse_start)
468 /* Get offsets and lengths */
469 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
470 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
472 static void clear_re(pTHX_ void *r);
474 /* Mark that we cannot extend a found fixed substring at this point.
475 Updata the longest found anchored substring and the longest found
476 floating substrings if needed. */
479 S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
481 const STRLEN l = CHR_SVLEN(data->last_found);
482 const STRLEN old_l = CHR_SVLEN(*data->longest);
484 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
485 SvSetMagicSV(*data->longest, data->last_found);
486 if (*data->longest == data->longest_fixed) {
487 data->offset_fixed = l ? data->last_start_min : data->pos_min;
488 if (data->flags & SF_BEFORE_EOL)
490 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
492 data->flags &= ~SF_FIX_BEFORE_EOL;
495 data->offset_float_min = l ? data->last_start_min : data->pos_min;
496 data->offset_float_max = (l
497 ? data->last_start_max
498 : data->pos_min + data->pos_delta);
499 if ((U32)data->offset_float_max > (U32)I32_MAX)
500 data->offset_float_max = I32_MAX;
501 if (data->flags & SF_BEFORE_EOL)
503 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
505 data->flags &= ~SF_FL_BEFORE_EOL;
508 SvCUR_set(data->last_found, 0);
510 SV * sv = data->last_found;
512 SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL;
513 if (mg && mg->mg_len > 0)
517 data->flags &= ~SF_BEFORE_EOL;
520 /* Can match anything (initialization) */
522 S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
524 ANYOF_CLASS_ZERO(cl);
525 ANYOF_BITMAP_SETALL(cl);
526 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
528 cl->flags |= ANYOF_LOCALE;
531 /* Can match anything (initialization) */
533 S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
537 for (value = 0; value <= ANYOF_MAX; value += 2)
538 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
540 if (!(cl->flags & ANYOF_UNICODE_ALL))
542 if (!ANYOF_BITMAP_TESTALLSET(cl))
547 /* Can match anything (initialization) */
549 S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
551 Zero(cl, 1, struct regnode_charclass_class);
553 cl_anything(pRExC_state, cl);
557 S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
559 Zero(cl, 1, struct regnode_charclass_class);
561 cl_anything(pRExC_state, cl);
563 cl->flags |= ANYOF_LOCALE;
566 /* 'And' a given class with another one. Can create false positives */
567 /* We assume that cl is not inverted */
569 S_cl_and(pTHX_ struct regnode_charclass_class *cl,
570 struct regnode_charclass_class *and_with)
572 if (!(and_with->flags & ANYOF_CLASS)
573 && !(cl->flags & ANYOF_CLASS)
574 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
575 && !(and_with->flags & ANYOF_FOLD)
576 && !(cl->flags & ANYOF_FOLD)) {
579 if (and_with->flags & ANYOF_INVERT)
580 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
581 cl->bitmap[i] &= ~and_with->bitmap[i];
583 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
584 cl->bitmap[i] &= and_with->bitmap[i];
585 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
586 if (!(and_with->flags & ANYOF_EOS))
587 cl->flags &= ~ANYOF_EOS;
589 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
590 !(and_with->flags & ANYOF_INVERT)) {
591 cl->flags &= ~ANYOF_UNICODE_ALL;
592 cl->flags |= ANYOF_UNICODE;
593 ARG_SET(cl, ARG(and_with));
595 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
596 !(and_with->flags & ANYOF_INVERT))
597 cl->flags &= ~ANYOF_UNICODE_ALL;
598 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
599 !(and_with->flags & ANYOF_INVERT))
600 cl->flags &= ~ANYOF_UNICODE;
603 /* 'OR' a given class with another one. Can create false positives */
604 /* We assume that cl is not inverted */
606 S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
608 if (or_with->flags & ANYOF_INVERT) {
610 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
611 * <= (B1 | !B2) | (CL1 | !CL2)
612 * which is wasteful if CL2 is small, but we ignore CL2:
613 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
614 * XXXX Can we handle case-fold? Unclear:
615 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
616 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
618 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
619 && !(or_with->flags & ANYOF_FOLD)
620 && !(cl->flags & ANYOF_FOLD) ) {
623 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
624 cl->bitmap[i] |= ~or_with->bitmap[i];
625 } /* XXXX: logic is complicated otherwise */
627 cl_anything(pRExC_state, cl);
630 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
631 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
632 && (!(or_with->flags & ANYOF_FOLD)
633 || (cl->flags & ANYOF_FOLD)) ) {
636 /* OR char bitmap and class bitmap separately */
637 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
638 cl->bitmap[i] |= or_with->bitmap[i];
639 if (or_with->flags & ANYOF_CLASS) {
640 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
641 cl->classflags[i] |= or_with->classflags[i];
642 cl->flags |= ANYOF_CLASS;
645 else { /* XXXX: logic is complicated, leave it along for a moment. */
646 cl_anything(pRExC_state, cl);
649 if (or_with->flags & ANYOF_EOS)
650 cl->flags |= ANYOF_EOS;
652 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
653 ARG(cl) != ARG(or_with)) {
654 cl->flags |= ANYOF_UNICODE_ALL;
655 cl->flags &= ~ANYOF_UNICODE;
657 if (or_with->flags & ANYOF_UNICODE_ALL) {
658 cl->flags |= ANYOF_UNICODE_ALL;
659 cl->flags &= ~ANYOF_UNICODE;
665 make_trie(startbranch,first,last,tail,flags)
666 startbranch: the first branch in the whole branch sequence
667 first : start branch of sequence of branch-exact nodes.
668 May be the same as startbranch
669 last : Thing following the last branch.
670 May be the same as tail.
671 tail : item following the branch sequence
672 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
674 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
676 A trie is an N'ary tree where the branches are determined by digital
677 decomposition of the key. IE, at the root node you look up the 1st character and
678 follow that branch repeat until you find the end of the branches. Nodes can be
679 marked as "accepting" meaning they represent a complete word. Eg:
683 would convert into the following structure. Numbers represent states, letters
684 following numbers represent valid transitions on the letter from that state, if
685 the number is in square brackets it represents an accepting state, otherwise it
686 will be in parenthesis.
688 +-h->+-e->[3]-+-r->(8)-+-s->[9]
692 (1) +-i->(6)-+-s->[7]
694 +-s->(3)-+-h->(4)-+-e->[5]
696 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
698 This shows that when matching against the string 'hers' we will begin at state 1
699 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
700 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
701 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
702 single traverse. We store a mapping from accepting to state to which word was
703 matched, and then when we have multiple possibilities we try to complete the
704 rest of the regex in the order in which they occured in the alternation.
706 The only prior NFA like behaviour that would be changed by the TRIE support is
707 the silent ignoring of duplicate alternations which are of the form:
709 / (DUPE|DUPE) X? (?{ ... }) Y /x
711 Thus EVAL blocks follwing a trie may be called a different number of times with
712 and without the optimisation. With the optimisations dupes will be silently
713 ignored. This inconsistant behaviour of EVAL type nodes is well established as
714 the following demonstrates:
716 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
718 which prints out 'word' three times, but
720 'words'=~/(word|word|word)(?{ print $1 })S/
722 which doesnt print it out at all. This is due to other optimisations kicking in.
724 Example of what happens on a structural level:
726 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
728 1: CURLYM[1] {1,32767}(18)
739 This would be optimizable with startbranch=5, first=5, last=16, tail=16
740 and should turn into:
742 1: CURLYM[1] {1,32767}(18)
744 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
752 Cases where tail != last would be like /(?foo|bar)baz/:
762 which would be optimizable with startbranch=1, first=1, last=7, tail=8
763 and would end up looking like:
766 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
775 #define TRIE_DEBUG_CHAR \
776 DEBUG_TRIE_COMPILE_r({ \
779 tmp = newSVpv( "", 0 ); \
780 pv_uni_display( tmp, uc, len, 60, UNI_DISPLAY_REGEX ); \
782 tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc ); \
784 av_push( trie->revcharmap, tmp ); \
787 #define TRIE_READ_CHAR STMT_START { \
790 if ( foldlen > 0 ) { \
791 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
796 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
797 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
798 foldlen -= UNISKIP( uvc ); \
799 scan = foldbuf + UNISKIP( uvc ); \
802 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
811 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
812 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
813 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
814 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
816 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
817 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
818 TRIE_LIST_LEN( state ) *= 2; \
819 Renew( trie->states[ state ].trans.list, \
820 TRIE_LIST_LEN( state ), reg_trie_trans_le ); \
822 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
823 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
824 TRIE_LIST_CUR( state )++; \
827 #define TRIE_LIST_NEW(state) STMT_START { \
828 Newz( 1023, trie->states[ state ].trans.list, \
829 4, reg_trie_trans_le ); \
830 TRIE_LIST_CUR( state ) = 1; \
831 TRIE_LIST_LEN( state ) = 4; \
835 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 flags)
838 /* first pass, loop through and scan words */
841 const U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
846 /* we just use folder as a flag in utf8 */
847 const U8 * const folder = ( flags == EXACTF
855 const U32 data_slot = add_data( pRExC_state, 1, "t" );
858 GET_RE_DEBUG_FLAGS_DECL;
860 Newz( 848200, trie, 1, reg_trie_data );
862 RExC_rx->data->data[ data_slot ] = (void*)trie;
863 Newz( 848201, trie->charmap, 256, U16 );
865 trie->words = newAV();
866 trie->revcharmap = newAV();
870 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
871 if (!SvIOK(re_trie_maxbuff)) {
872 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
875 /* -- First loop and Setup --
877 We first traverse the branches and scan each word to determine if it
878 contains widechars, and how many unique chars there are, this is
879 important as we have to build a table with at least as many columns as we
882 We use an array of integers to represent the character codes 0..255
883 (trie->charmap) and we use a an HV* to store unicode characters. We use the
884 native representation of the character value as the key and IV's for the
887 *TODO* If we keep track of how many times each character is used we can
888 remap the columns so that the table compression later on is more
889 efficient in terms of memory by ensuring most common value is in the
890 middle and the least common are on the outside. IMO this would be better
891 than a most to least common mapping as theres a decent chance the most
892 common letter will share a node with the least common, meaning the node
893 will not be compressable. With a middle is most common approach the worst
894 case is when we have the least common nodes twice.
899 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
900 regnode *noper = NEXTOPER( cur );
901 const U8 *uc = (U8*)STRING( noper );
902 const U8 *e = uc + STR_LEN( noper );
904 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
905 const U8 *scan = (U8*)NULL;
907 for ( ; uc < e ; uc += len ) {
911 if ( !trie->charmap[ uvc ] ) {
912 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
914 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
919 if ( !trie->widecharmap )
920 trie->widecharmap = newHV();
922 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
925 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
927 if ( !SvTRUE( *svpp ) ) {
928 sv_setiv( *svpp, ++trie->uniquecharcount );
934 } /* end first pass */
935 DEBUG_TRIE_COMPILE_r(
936 PerlIO_printf( Perl_debug_log, "TRIE(%s): W:%d C:%d Uq:%d \n",
937 ( trie->widecharmap ? "UTF8" : "NATIVE" ), trie->wordcount,
938 (int)trie->charcount, trie->uniquecharcount )
943 We now know what we are dealing with in terms of unique chars and
944 string sizes so we can calculate how much memory a naive
945 representation using a flat table will take. If it's over a reasonable
946 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
947 conservative but potentially much slower representation using an array
950 At the end we convert both representations into the same compressed
951 form that will be used in regexec.c for matching with. The latter
952 is a form that cannot be used to construct with but has memory
953 properties similar to the list form and access properties similar
954 to the table form making it both suitable for fast searches and
955 small enough that its feasable to store for the duration of a program.
957 See the comment in the code where the compressed table is produced
958 inplace from the flat tabe representation for an explanation of how
959 the compression works.
964 if ( (IV)( ( trie->charcount + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
966 Second Pass -- Array Of Lists Representation
968 Each state will be represented by a list of charid:state records
969 (reg_trie_trans_le) the first such element holds the CUR and LEN
970 points of the allocated array. (See defines above).
972 We build the initial structure using the lists, and then convert
973 it into the compressed table form which allows faster lookups
974 (but cant be modified once converted).
980 STRLEN transcount = 1;
982 Newz( 848204, trie->states, trie->charcount + 2, reg_trie_state );
986 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
988 regnode *noper = NEXTOPER( cur );
989 U8 *uc = (U8*)STRING( noper );
990 U8 *e = uc + STR_LEN( noper );
991 U32 state = 1; /* required init */
992 U16 charid = 0; /* sanity init */
993 U8 *scan = (U8*)NULL; /* sanity init */
994 STRLEN foldlen = 0; /* required init */
995 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
998 for ( ; uc < e ; uc += len ) {
1003 charid = trie->charmap[ uvc ];
1005 SV** svpp=(SV**)NULL;
1006 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1010 charid=(U16)SvIV( *svpp );
1019 if ( !trie->states[ state ].trans.list ) {
1020 TRIE_LIST_NEW( state );
1022 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1023 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1024 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1029 newstate = next_alloc++;
1030 TRIE_LIST_PUSH( state, charid, newstate );
1036 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1038 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1041 if ( !trie->states[ state ].wordnum ) {
1042 /* we havent inserted this word into the structure yet. */
1043 trie->states[ state ].wordnum = ++curword;
1046 /* store the word for dumping */
1047 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
1048 if ( UTF ) SvUTF8_on( tmp );
1049 av_push( trie->words, tmp );
1053 /* Its a dupe. So ignore it. */
1056 } /* end second pass */
1058 trie->laststate = next_alloc;
1059 Renew( trie->states, next_alloc, reg_trie_state );
1061 DEBUG_TRIE_COMPILE_MORE_r({
1066 print out the table precompression.
1069 PerlIO_printf( Perl_debug_log, "\nState :Word | Transition Data\n" );
1070 PerlIO_printf( Perl_debug_log, "------:-----+-----------------" );
1072 for( state=1 ; state < next_alloc ; state ++ ) {
1074 PerlIO_printf( Perl_debug_log, "\n %04"UVXf" :", (UV)state );
1075 if ( ! trie->states[ state ].wordnum ) {
1076 PerlIO_printf( Perl_debug_log, "%5s| ","");
1078 PerlIO_printf( Perl_debug_log, "W%04x| ",
1079 trie->states[ state ].wordnum
1082 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1083 SV **tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1084 PerlIO_printf( Perl_debug_log, "%s:%3X=%04"UVXf" | ",
1086 TRIE_LIST_ITEM(state,charid).forid,
1087 (UV)TRIE_LIST_ITEM(state,charid).newstate
1092 PerlIO_printf( Perl_debug_log, "\n\n" );
1095 Newz( 848203, trie->trans, transcount ,reg_trie_trans );
1103 for( state=1 ; state < next_alloc ; state ++ ) {
1107 DEBUG_TRIE_COMPILE_MORE_r(
1108 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1112 if (trie->states[state].trans.list) {
1113 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1117 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1118 if ( TRIE_LIST_ITEM( state, idx).forid < minid ) {
1119 minid=TRIE_LIST_ITEM( state, idx).forid;
1120 } else if ( TRIE_LIST_ITEM( state, idx).forid > maxid ) {
1121 maxid=TRIE_LIST_ITEM( state, idx).forid;
1124 if ( transcount < tp + maxid - minid + 1) {
1126 Renew( trie->trans, transcount, reg_trie_trans );
1127 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1129 base = trie->uniquecharcount + tp - minid;
1130 if ( maxid == minid ) {
1132 for ( ; zp < tp ; zp++ ) {
1133 if ( ! trie->trans[ zp ].next ) {
1134 base = trie->uniquecharcount + zp - minid;
1135 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1136 trie->trans[ zp ].check = state;
1142 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1143 trie->trans[ tp ].check = state;
1148 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1149 U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1150 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1151 trie->trans[ tid ].check = state;
1153 tp += ( maxid - minid + 1 );
1155 Safefree(trie->states[ state ].trans.list);
1158 DEBUG_TRIE_COMPILE_MORE_r(
1159 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1162 trie->states[ state ].trans.base=base;
1164 trie->lasttrans = tp + 1;
1168 Second Pass -- Flat Table Representation.
1170 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1171 We know that we will need Charcount+1 trans at most to store the data
1172 (one row per char at worst case) So we preallocate both structures
1173 assuming worst case.
1175 We then construct the trie using only the .next slots of the entry
1178 We use the .check field of the first entry of the node temporarily to
1179 make compression both faster and easier by keeping track of how many non
1180 zero fields are in the node.
1182 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1185 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1186 number representing the first entry of the node, and state as a
1187 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1188 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1189 are 2 entrys per node. eg:
1197 The table is internally in the right hand, idx form. However as we also
1198 have to deal with the states array which is indexed by nodenum we have to
1199 use TRIE_NODENUM() to convert.
1203 Newz( 848203, trie->trans, ( trie->charcount + 1 ) * trie->uniquecharcount + 1,
1205 Newz( 848204, trie->states, trie->charcount + 2, reg_trie_state );
1206 next_alloc = trie->uniquecharcount + 1;
1208 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1210 regnode *noper = NEXTOPER( cur );
1211 U8 *uc = (U8*)STRING( noper );
1212 U8 *e = uc + STR_LEN( noper );
1214 U32 state = 1; /* required init */
1216 U16 charid = 0; /* sanity init */
1217 U32 accept_state = 0; /* sanity init */
1218 U8 *scan = (U8*)NULL; /* sanity init */
1220 STRLEN foldlen = 0; /* required init */
1221 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1224 for ( ; uc < e ; uc += len ) {
1229 charid = trie->charmap[ uvc ];
1231 SV** svpp=(SV**)NULL;
1232 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1236 charid=(U16)SvIV( *svpp );
1241 if ( !trie->trans[ state + charid ].next ) {
1242 trie->trans[ state + charid ].next = next_alloc;
1243 trie->trans[ state ].check++;
1244 next_alloc += trie->uniquecharcount;
1246 state = trie->trans[ state + charid ].next;
1248 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1250 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1253 accept_state = TRIE_NODENUM( state );
1254 if ( !trie->states[ accept_state ].wordnum ) {
1255 /* we havent inserted this word into the structure yet. */
1256 trie->states[ accept_state ].wordnum = ++curword;
1259 /* store the word for dumping */
1260 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
1261 if ( UTF ) SvUTF8_on( tmp );
1262 av_push( trie->words, tmp );
1266 /* Its a dupe. So ignore it. */
1269 } /* end second pass */
1271 DEBUG_TRIE_COMPILE_MORE_r({
1273 print out the table precompression so that we can do a visual check
1274 that they are identical.
1278 PerlIO_printf( Perl_debug_log, "\nChar : " );
1280 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1281 SV **tmp = av_fetch( trie->revcharmap, charid, 0);
1283 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen( *tmp ) );
1287 PerlIO_printf( Perl_debug_log, "\nState+-" );
1289 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1290 PerlIO_printf( Perl_debug_log, "%4s-", "----" );
1293 PerlIO_printf( Perl_debug_log, "\n" );
1295 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1297 PerlIO_printf( Perl_debug_log, "%04"UVXf" : ", (UV)TRIE_NODENUM( state ) );
1299 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1300 PerlIO_printf( Perl_debug_log, "%04"UVXf" ",
1301 (UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ) );
1303 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1304 PerlIO_printf( Perl_debug_log, " (%04"UVXf")\n", (UV)trie->trans[ state ].check );
1306 PerlIO_printf( Perl_debug_log, " (%04"UVXf") W%04X\n", (UV)trie->trans[ state ].check,
1307 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1310 PerlIO_printf( Perl_debug_log, "\n\n" );
1314 * Inplace compress the table.*
1316 For sparse data sets the table constructed by the trie algorithm will
1317 be mostly 0/FAIL transitions or to put it another way mostly empty.
1318 (Note that leaf nodes will not contain any transitions.)
1320 This algorithm compresses the tables by eliminating most such
1321 transitions, at the cost of a modest bit of extra work during lookup:
1323 - Each states[] entry contains a .base field which indicates the
1324 index in the state[] array wheres its transition data is stored.
1326 - If .base is 0 there are no valid transitions from that node.
1328 - If .base is nonzero then charid is added to it to find an entry in
1331 -If trans[states[state].base+charid].check!=state then the
1332 transition is taken to be a 0/Fail transition. Thus if there are fail
1333 transitions at the front of the node then the .base offset will point
1334 somewhere inside the previous nodes data (or maybe even into a node
1335 even earlier), but the .check field determines if the transition is
1338 The following process inplace converts the table to the compressed
1339 table: We first do not compress the root node 1,and mark its all its
1340 .check pointers as 1 and set its .base pointer as 1 as well. This
1341 allows to do a DFA construction from the compressed table later, and
1342 ensures that any .base pointers we calculate later are greater than
1345 - We set 'pos' to indicate the first entry of the second node.
1347 - We then iterate over the columns of the node, finding the first and
1348 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1349 and set the .check pointers accordingly, and advance pos
1350 appropriately and repreat for the next node. Note that when we copy
1351 the next pointers we have to convert them from the original
1352 NODEIDX form to NODENUM form as the former is not valid post
1355 - If a node has no transitions used we mark its base as 0 and do not
1356 advance the pos pointer.
1358 - If a node only has one transition we use a second pointer into the
1359 structure to fill in allocated fail transitions from other states.
1360 This pointer is independent of the main pointer and scans forward
1361 looking for null transitions that are allocated to a state. When it
1362 finds one it writes the single transition into the "hole". If the
1363 pointer doesnt find one the single transition is appeneded as normal.
1365 - Once compressed we can Renew/realloc the structures to release the
1368 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1369 specifically Fig 3.47 and the associated pseudocode.
1373 const U32 laststate = TRIE_NODENUM( next_alloc );
1374 U32 used , state, charid;
1376 trie->laststate = laststate;
1378 for ( state = 1 ; state < laststate ; state++ ) {
1380 U32 stateidx = TRIE_NODEIDX( state );
1381 U32 o_used=trie->trans[ stateidx ].check;
1382 used = trie->trans[ stateidx ].check;
1383 trie->trans[ stateidx ].check = 0;
1385 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1386 if ( flag || trie->trans[ stateidx + charid ].next ) {
1387 if ( trie->trans[ stateidx + charid ].next ) {
1389 for ( ; zp < pos ; zp++ ) {
1390 if ( ! trie->trans[ zp ].next ) {
1394 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1395 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1396 trie->trans[ zp ].check = state;
1397 if ( ++zp > pos ) pos = zp;
1404 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1406 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1407 trie->trans[ pos ].check = state;
1412 trie->lasttrans = pos + 1;
1413 Renew( trie->states, laststate + 1, reg_trie_state);
1414 DEBUG_TRIE_COMPILE_MORE_r(
1415 PerlIO_printf( Perl_debug_log,
1416 " Alloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1417 (int)( ( trie->charcount + 1 ) * trie->uniquecharcount + 1 ),
1420 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1423 } /* end table compress */
1425 /* resize the trans array to remove unused space */
1426 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
1428 DEBUG_TRIE_COMPILE_r({
1431 Now we print it out again, in a slightly different form as there is additional
1432 info we want to be able to see when its compressed. They are close enough for
1433 visual comparison though.
1435 PerlIO_printf( Perl_debug_log, "\nChar : %-6s%-6s%-4s ","Match","Base","Ofs" );
1437 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1438 SV **tmp = av_fetch( trie->revcharmap, state, 0);
1440 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen( *tmp ) );
1443 PerlIO_printf( Perl_debug_log, "\n-----:-----------------------");
1445 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1446 PerlIO_printf( Perl_debug_log, "-----");
1447 PerlIO_printf( Perl_debug_log, "\n");
1449 for( state = 1 ; state < trie->laststate ; state++ ) {
1450 U32 base = trie->states[ state ].trans.base;
1452 PerlIO_printf( Perl_debug_log, "#%04"UVXf" ", (UV)state);
1454 if ( trie->states[ state ].wordnum ) {
1455 PerlIO_printf( Perl_debug_log, " W%04X", trie->states[ state ].wordnum );
1457 PerlIO_printf( Perl_debug_log, "%6s", "" );
1460 PerlIO_printf( Perl_debug_log, " @%04"UVXf" ", (UV)base );
1465 while( ( base + ofs < trie->uniquecharcount ) ||
1466 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1467 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1470 PerlIO_printf( Perl_debug_log, "+%02"UVXf"[ ", (UV)ofs);
1472 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1473 if ( ( base + ofs >= trie->uniquecharcount ) &&
1474 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1475 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1477 PerlIO_printf( Perl_debug_log, "%04"UVXf" ",
1478 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1480 PerlIO_printf( Perl_debug_log, "%4s "," 0" );
1484 PerlIO_printf( Perl_debug_log, "]");
1487 PerlIO_printf( Perl_debug_log, "\n" );
1492 /* now finally we "stitch in" the new TRIE node
1493 This means we convert either the first branch or the first Exact,
1494 depending on whether the thing following (in 'last') is a branch
1495 or not and whther first is the startbranch (ie is it a sub part of
1496 the alternation or is it the whole thing.)
1497 Assuming its a sub part we conver the EXACT otherwise we convert
1498 the whole branch sequence, including the first.
1505 if ( first == startbranch && OP( last ) != BRANCH ) {
1508 convert = NEXTOPER( first );
1509 NEXT_OFF( first ) = (U16)(last - first);
1512 OP( convert ) = TRIE + (U8)( flags - EXACT );
1513 NEXT_OFF( convert ) = (U16)(tail - convert);
1514 ARG_SET( convert, data_slot );
1516 /* tells us if we need to handle accept buffers specially */
1517 convert->flags = ( RExC_seen_evals ? 1 : 0 );
1520 /* needed for dumping*/
1522 regnode *optimize = convert + NODE_STEP_REGNODE + regarglen[ TRIE ];
1523 /* We now need to mark all of the space originally used by the
1524 branches as optimized away. This keeps the dumpuntil from
1525 throwing a wobbly as it doesnt use regnext() to traverse the
1528 while( optimize < last ) {
1529 OP( optimize ) = OPTIMIZED;
1533 } /* end node insert */
1540 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
1541 * These need to be revisited when a newer toolchain becomes available.
1543 #if defined(__sparc64__) && defined(__GNUC__)
1544 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
1545 # undef SPARC64_GCC_WORKAROUND
1546 # define SPARC64_GCC_WORKAROUND 1
1550 /* REx optimizer. Converts nodes into quickier variants "in place".
1551 Finds fixed substrings. */
1553 /* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
1554 to the position after last scanned or to NULL. */
1558 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags, U32 depth)
1559 /* scanp: Start here (read-write). */
1560 /* deltap: Write maxlen-minlen here. */
1561 /* last: Stop before this one. */
1563 I32 min = 0, pars = 0, code;
1564 regnode *scan = *scanp, *next;
1566 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
1567 int is_inf_internal = 0; /* The studied chunk is infinite */
1568 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
1569 scan_data_t data_fake;
1570 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
1571 SV *re_trie_maxbuff = NULL;
1573 GET_RE_DEBUG_FLAGS_DECL;
1575 while (scan && OP(scan) != END && scan < last) {
1576 /* Peephole optimizer: */
1578 SV *mysv=sv_newmortal();
1579 regprop( mysv, scan);
1580 PerlIO_printf(Perl_debug_log, "%*speep: %s (0x%08"UVXf")\n",
1581 (int)depth*2, "", SvPV_nolen(mysv), PTR2UV(scan));
1584 if (PL_regkind[(U8)OP(scan)] == EXACT) {
1585 /* Merge several consecutive EXACTish nodes into one. */
1586 regnode *n = regnext(scan);
1589 regnode *stop = scan;
1592 next = scan + NODE_SZ_STR(scan);
1593 /* Skip NOTHING, merge EXACT*. */
1595 ( PL_regkind[(U8)OP(n)] == NOTHING ||
1596 (stringok && (OP(n) == OP(scan))))
1598 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
1599 if (OP(n) == TAIL || n > next)
1601 if (PL_regkind[(U8)OP(n)] == NOTHING) {
1602 NEXT_OFF(scan) += NEXT_OFF(n);
1603 next = n + NODE_STEP_REGNODE;
1610 else if (stringok) {
1611 const int oldl = STR_LEN(scan);
1612 regnode *nnext = regnext(n);
1614 if (oldl + STR_LEN(n) > U8_MAX)
1616 NEXT_OFF(scan) += NEXT_OFF(n);
1617 STR_LEN(scan) += STR_LEN(n);
1618 next = n + NODE_SZ_STR(n);
1619 /* Now we can overwrite *n : */
1620 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
1628 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
1630 Two problematic code points in Unicode casefolding of EXACT nodes:
1632 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
1633 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
1639 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
1640 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
1642 This means that in case-insensitive matching (or "loose matching",
1643 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
1644 length of the above casefolded versions) can match a target string
1645 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
1646 This would rather mess up the minimum length computation.
1648 What we'll do is to look for the tail four bytes, and then peek
1649 at the preceding two bytes to see whether we need to decrease
1650 the minimum length by four (six minus two).
1652 Thanks to the design of UTF-8, there cannot be false matches:
1653 A sequence of valid UTF-8 bytes cannot be a subsequence of
1654 another valid sequence of UTF-8 bytes.
1657 char *s0 = STRING(scan), *s, *t;
1658 char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
1659 const char *t0 = "\xcc\x88\xcc\x81";
1660 const char *t1 = t0 + 3;
1663 s < s2 && (t = ninstr(s, s1, t0, t1));
1665 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
1666 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
1673 n = scan + NODE_SZ_STR(scan);
1675 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
1686 /* Follow the next-chain of the current node and optimize
1687 away all the NOTHINGs from it. */
1688 if (OP(scan) != CURLYX) {
1689 const int max = (reg_off_by_arg[OP(scan)]
1691 /* I32 may be smaller than U16 on CRAYs! */
1692 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
1693 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
1697 /* Skip NOTHING and LONGJMP. */
1698 while ((n = regnext(n))
1699 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
1700 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
1701 && off + noff < max)
1703 if (reg_off_by_arg[OP(scan)])
1706 NEXT_OFF(scan) = off;
1709 /* The principal pseudo-switch. Cannot be a switch, since we
1710 look into several different things. */
1711 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
1712 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
1713 next = regnext(scan);
1715 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
1717 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
1718 I32 max1 = 0, min1 = I32_MAX, num = 0;
1719 struct regnode_charclass_class accum;
1720 regnode *startbranch=scan;
1722 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
1723 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
1724 if (flags & SCF_DO_STCLASS)
1725 cl_init_zero(pRExC_state, &accum);
1727 while (OP(scan) == code) {
1728 I32 deltanext, minnext, f = 0, fake;
1729 struct regnode_charclass_class this_class;
1732 data_fake.flags = 0;
1734 data_fake.whilem_c = data->whilem_c;
1735 data_fake.last_closep = data->last_closep;
1738 data_fake.last_closep = &fake;
1739 next = regnext(scan);
1740 scan = NEXTOPER(scan);
1742 scan = NEXTOPER(scan);
1743 if (flags & SCF_DO_STCLASS) {
1744 cl_init(pRExC_state, &this_class);
1745 data_fake.start_class = &this_class;
1746 f = SCF_DO_STCLASS_AND;
1748 if (flags & SCF_WHILEM_VISITED_POS)
1749 f |= SCF_WHILEM_VISITED_POS;
1751 /* we suppose the run is continuous, last=next...*/
1752 minnext = study_chunk(pRExC_state, &scan, &deltanext,
1753 next, &data_fake, f,depth+1);
1756 if (max1 < minnext + deltanext)
1757 max1 = minnext + deltanext;
1758 if (deltanext == I32_MAX)
1759 is_inf = is_inf_internal = 1;
1761 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1763 if (data && (data_fake.flags & SF_HAS_EVAL))
1764 data->flags |= SF_HAS_EVAL;
1766 data->whilem_c = data_fake.whilem_c;
1767 if (flags & SCF_DO_STCLASS)
1768 cl_or(pRExC_state, &accum, &this_class);
1769 if (code == SUSPEND)
1772 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
1774 if (flags & SCF_DO_SUBSTR) {
1775 data->pos_min += min1;
1776 data->pos_delta += max1 - min1;
1777 if (max1 != min1 || is_inf)
1778 data->longest = &(data->longest_float);
1781 delta += max1 - min1;
1782 if (flags & SCF_DO_STCLASS_OR) {
1783 cl_or(pRExC_state, data->start_class, &accum);
1785 cl_and(data->start_class, &and_with);
1786 flags &= ~SCF_DO_STCLASS;
1789 else if (flags & SCF_DO_STCLASS_AND) {
1791 cl_and(data->start_class, &accum);
1792 flags &= ~SCF_DO_STCLASS;
1795 /* Switch to OR mode: cache the old value of
1796 * data->start_class */
1797 StructCopy(data->start_class, &and_with,
1798 struct regnode_charclass_class);
1799 flags &= ~SCF_DO_STCLASS_AND;
1800 StructCopy(&accum, data->start_class,
1801 struct regnode_charclass_class);
1802 flags |= SCF_DO_STCLASS_OR;
1803 data->start_class->flags |= ANYOF_EOS;
1809 Assuming this was/is a branch we are dealing with: 'scan' now
1810 points at the item that follows the branch sequence, whatever
1811 it is. We now start at the beginning of the sequence and look
1817 which would be constructed from a pattern like /A|LIST|OF|WORDS/
1819 If we can find such a subseqence we need to turn the first
1820 element into a trie and then add the subsequent branch exact
1821 strings to the trie.
1825 1. patterns where the whole set of branch can be converted to a trie,
1827 2. patterns where only a subset of the alternations can be
1828 converted to a trie.
1830 In case 1 we can replace the whole set with a single regop
1831 for the trie. In case 2 we need to keep the start and end
1834 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
1835 becomes BRANCH TRIE; BRANCH X;
1837 Hypthetically when we know the regex isnt anchored we can
1838 turn a case 1 into a DFA and let it rip... Every time it finds a match
1839 it would just call its tail, no WHILEM/CURLY needed.
1843 if (!re_trie_maxbuff) {
1844 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1845 if (!SvIOK(re_trie_maxbuff))
1846 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1848 if ( SvIV(re_trie_maxbuff)>=0 && OP( startbranch )==BRANCH ) {
1850 regnode *first = (regnode *)NULL;
1851 regnode *last = (regnode *)NULL;
1852 regnode *tail = scan;
1857 SV *mysv = sv_newmortal(); /* for dumping */
1859 /* var tail is used because there may be a TAIL
1860 regop in the way. Ie, the exacts will point to the
1861 thing following the TAIL, but the last branch will
1862 point at the TAIL. So we advance tail. If we
1863 have nested (?:) we may have to move through several
1867 while ( OP( tail ) == TAIL ) {
1868 /* this is the TAIL generated by (?:) */
1869 tail = regnext( tail );
1873 regprop( mysv, tail );
1874 PerlIO_printf( Perl_debug_log, "%*s%s%s%s\n",
1875 (int)depth * 2 + 2, "", "Tail node is:", SvPV_nolen( mysv ),
1876 (RExC_seen_evals) ? "[EVAL]" : ""
1881 step through the branches, cur represents each
1882 branch, noper is the first thing to be matched
1883 as part of that branch and noper_next is the
1884 regnext() of that node. if noper is an EXACT
1885 and noper_next is the same as scan (our current
1886 position in the regex) then the EXACT branch is
1887 a possible optimization target. Once we have
1888 two or more consequetive such branches we can
1889 create a trie of the EXACT's contents and stich
1890 it in place. If the sequence represents all of
1891 the branches we eliminate the whole thing and
1892 replace it with a single TRIE. If it is a
1893 subsequence then we need to stitch it in. This
1894 means the first branch has to remain, and needs
1895 to be repointed at the item on the branch chain
1896 following the last branch optimized. This could
1897 be either a BRANCH, in which case the
1898 subsequence is internal, or it could be the
1899 item following the branch sequence in which
1900 case the subsequence is at the end.
1904 /* dont use tail as the end marker for this traverse */
1905 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
1906 regnode *noper = NEXTOPER( cur );
1907 regnode *noper_next = regnext( noper );
1910 regprop( mysv, cur);
1911 PerlIO_printf( Perl_debug_log, "%*s%s",
1912 (int)depth * 2 + 2," ", SvPV_nolen( mysv ) );
1914 regprop( mysv, noper);
1915 PerlIO_printf( Perl_debug_log, " -> %s",
1919 regprop( mysv, noper_next );
1920 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
1923 PerlIO_printf( Perl_debug_log, "0x%p,0x%p,0x%p)\n",
1926 if ( ( first ? OP( noper ) == optype
1927 : PL_regkind[ (U8)OP( noper ) ] == EXACT )
1928 && noper_next == tail && count<U16_MAX)
1933 optype = OP( noper );
1937 regprop( mysv, first);
1938 PerlIO_printf( Perl_debug_log, "%*s%s",
1939 (int)depth * 2 + 2, "F:", SvPV_nolen( mysv ) );
1940 regprop( mysv, NEXTOPER(first) );
1941 PerlIO_printf( Perl_debug_log, " -> %s\n",
1942 SvPV_nolen( mysv ) );
1947 regprop( mysv, cur);
1948 PerlIO_printf( Perl_debug_log, "%*s%s",
1949 (int)depth * 2 + 2, "N:", SvPV_nolen( mysv ) );
1950 regprop( mysv, noper );
1951 PerlIO_printf( Perl_debug_log, " -> %s\n",
1952 SvPV_nolen( mysv ) );
1958 PerlIO_printf( Perl_debug_log, "%*s%s\n",
1959 (int)depth * 2 + 2, "E:", "**END**" );
1961 make_trie( pRExC_state, startbranch, first, cur, tail, optype );
1963 if ( PL_regkind[ (U8)OP( noper ) ] == EXACT
1964 && noper_next == tail )
1968 optype = OP( noper );
1978 regprop( mysv, cur);
1979 PerlIO_printf( Perl_debug_log,
1980 "%*s%s\t(0x%p,0x%p,0x%p)\n", (int)depth * 2 + 2,
1981 " ", SvPV_nolen( mysv ), first, last, cur);
1986 PerlIO_printf( Perl_debug_log, "%*s%s\n",
1987 (int)depth * 2 + 2, "E:", "==END==" );
1989 make_trie( pRExC_state, startbranch, first, scan, tail, optype );
1994 else if ( code == BRANCHJ ) { /* single branch is optimized. */
1995 scan = NEXTOPER(NEXTOPER(scan));
1996 } else /* single branch is optimized. */
1997 scan = NEXTOPER(scan);
2000 else if (OP(scan) == EXACT) {
2001 I32 l = STR_LEN(scan);
2002 UV uc = *((U8*)STRING(scan));
2004 const U8 * const s = (U8*)STRING(scan);
2005 l = utf8_length(s, s + l);
2006 uc = utf8_to_uvchr(s, NULL);
2009 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2010 /* The code below prefers earlier match for fixed
2011 offset, later match for variable offset. */
2012 if (data->last_end == -1) { /* Update the start info. */
2013 data->last_start_min = data->pos_min;
2014 data->last_start_max = is_inf
2015 ? I32_MAX : data->pos_min + data->pos_delta;
2017 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2019 SV * sv = data->last_found;
2020 MAGIC *mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2021 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2022 if (mg && mg->mg_len >= 0)
2023 mg->mg_len += utf8_length((U8*)STRING(scan),
2024 (U8*)STRING(scan)+STR_LEN(scan));
2027 SvUTF8_on(data->last_found);
2028 data->last_end = data->pos_min + l;
2029 data->pos_min += l; /* As in the first entry. */
2030 data->flags &= ~SF_BEFORE_EOL;
2032 if (flags & SCF_DO_STCLASS_AND) {
2033 /* Check whether it is compatible with what we know already! */
2037 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2038 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2039 && (!(data->start_class->flags & ANYOF_FOLD)
2040 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2043 ANYOF_CLASS_ZERO(data->start_class);
2044 ANYOF_BITMAP_ZERO(data->start_class);
2046 ANYOF_BITMAP_SET(data->start_class, uc);
2047 data->start_class->flags &= ~ANYOF_EOS;
2049 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2051 else if (flags & SCF_DO_STCLASS_OR) {
2052 /* false positive possible if the class is case-folded */
2054 ANYOF_BITMAP_SET(data->start_class, uc);
2056 data->start_class->flags |= ANYOF_UNICODE_ALL;
2057 data->start_class->flags &= ~ANYOF_EOS;
2058 cl_and(data->start_class, &and_with);
2060 flags &= ~SCF_DO_STCLASS;
2062 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
2063 I32 l = STR_LEN(scan);
2064 UV uc = *((U8*)STRING(scan));
2066 /* Search for fixed substrings supports EXACT only. */
2067 if (flags & SCF_DO_SUBSTR)
2068 scan_commit(pRExC_state, data);
2070 U8 *s = (U8 *)STRING(scan);
2071 l = utf8_length(s, s + l);
2072 uc = utf8_to_uvchr(s, NULL);
2075 if (data && (flags & SCF_DO_SUBSTR))
2077 if (flags & SCF_DO_STCLASS_AND) {
2078 /* Check whether it is compatible with what we know already! */
2082 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2083 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2084 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2086 ANYOF_CLASS_ZERO(data->start_class);
2087 ANYOF_BITMAP_ZERO(data->start_class);
2089 ANYOF_BITMAP_SET(data->start_class, uc);
2090 data->start_class->flags &= ~ANYOF_EOS;
2091 data->start_class->flags |= ANYOF_FOLD;
2092 if (OP(scan) == EXACTFL)
2093 data->start_class->flags |= ANYOF_LOCALE;
2096 else if (flags & SCF_DO_STCLASS_OR) {
2097 if (data->start_class->flags & ANYOF_FOLD) {
2098 /* false positive possible if the class is case-folded.
2099 Assume that the locale settings are the same... */
2101 ANYOF_BITMAP_SET(data->start_class, uc);
2102 data->start_class->flags &= ~ANYOF_EOS;
2104 cl_and(data->start_class, &and_with);
2106 flags &= ~SCF_DO_STCLASS;
2108 else if (strchr((const char*)PL_varies,OP(scan))) {
2109 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2110 I32 f = flags, pos_before = 0;
2111 regnode *oscan = scan;
2112 struct regnode_charclass_class this_class;
2113 struct regnode_charclass_class *oclass = NULL;
2114 I32 next_is_eval = 0;
2116 switch (PL_regkind[(U8)OP(scan)]) {
2117 case WHILEM: /* End of (?:...)* . */
2118 scan = NEXTOPER(scan);
2121 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2122 next = NEXTOPER(scan);
2123 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2125 maxcount = REG_INFTY;
2126 next = regnext(scan);
2127 scan = NEXTOPER(scan);
2131 if (flags & SCF_DO_SUBSTR)
2136 if (flags & SCF_DO_STCLASS) {
2138 maxcount = REG_INFTY;
2139 next = regnext(scan);
2140 scan = NEXTOPER(scan);
2143 is_inf = is_inf_internal = 1;
2144 scan = regnext(scan);
2145 if (flags & SCF_DO_SUBSTR) {
2146 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
2147 data->longest = &(data->longest_float);
2149 goto optimize_curly_tail;
2151 mincount = ARG1(scan);
2152 maxcount = ARG2(scan);
2153 next = regnext(scan);
2154 if (OP(scan) == CURLYX) {
2155 I32 lp = (data ? *(data->last_closep) : 0);
2156 scan->flags = ((lp <= U8_MAX) ? (U8)lp : U8_MAX);
2158 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2159 next_is_eval = (OP(scan) == EVAL);
2161 if (flags & SCF_DO_SUBSTR) {
2162 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
2163 pos_before = data->pos_min;
2167 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2169 data->flags |= SF_IS_INF;
2171 if (flags & SCF_DO_STCLASS) {
2172 cl_init(pRExC_state, &this_class);
2173 oclass = data->start_class;
2174 data->start_class = &this_class;
2175 f |= SCF_DO_STCLASS_AND;
2176 f &= ~SCF_DO_STCLASS_OR;
2178 /* These are the cases when once a subexpression
2179 fails at a particular position, it cannot succeed
2180 even after backtracking at the enclosing scope.
2182 XXXX what if minimal match and we are at the
2183 initial run of {n,m}? */
2184 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2185 f &= ~SCF_WHILEM_VISITED_POS;
2187 /* This will finish on WHILEM, setting scan, or on NULL: */
2188 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
2190 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
2192 if (flags & SCF_DO_STCLASS)
2193 data->start_class = oclass;
2194 if (mincount == 0 || minnext == 0) {
2195 if (flags & SCF_DO_STCLASS_OR) {
2196 cl_or(pRExC_state, data->start_class, &this_class);
2198 else if (flags & SCF_DO_STCLASS_AND) {
2199 /* Switch to OR mode: cache the old value of
2200 * data->start_class */
2201 StructCopy(data->start_class, &and_with,
2202 struct regnode_charclass_class);
2203 flags &= ~SCF_DO_STCLASS_AND;
2204 StructCopy(&this_class, data->start_class,
2205 struct regnode_charclass_class);
2206 flags |= SCF_DO_STCLASS_OR;
2207 data->start_class->flags |= ANYOF_EOS;
2209 } else { /* Non-zero len */
2210 if (flags & SCF_DO_STCLASS_OR) {
2211 cl_or(pRExC_state, data->start_class, &this_class);
2212 cl_and(data->start_class, &and_with);
2214 else if (flags & SCF_DO_STCLASS_AND)
2215 cl_and(data->start_class, &this_class);
2216 flags &= ~SCF_DO_STCLASS;
2218 if (!scan) /* It was not CURLYX, but CURLY. */
2220 if (ckWARN(WARN_REGEXP)
2221 /* ? quantifier ok, except for (?{ ... }) */
2222 && (next_is_eval || !(mincount == 0 && maxcount == 1))
2223 && (minnext == 0) && (deltanext == 0)
2224 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
2225 && maxcount <= REG_INFTY/3) /* Complement check for big count */
2228 "Quantifier unexpected on zero-length expression");
2231 min += minnext * mincount;
2232 is_inf_internal |= ((maxcount == REG_INFTY
2233 && (minnext + deltanext) > 0)
2234 || deltanext == I32_MAX);
2235 is_inf |= is_inf_internal;
2236 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2238 /* Try powerful optimization CURLYX => CURLYN. */
2239 if ( OP(oscan) == CURLYX && data
2240 && data->flags & SF_IN_PAR
2241 && !(data->flags & SF_HAS_EVAL)
2242 && !deltanext && minnext == 1 ) {
2243 /* Try to optimize to CURLYN. */
2244 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
2245 regnode *nxt1 = nxt;
2252 if (!strchr((const char*)PL_simple,OP(nxt))
2253 && !(PL_regkind[(U8)OP(nxt)] == EXACT
2254 && STR_LEN(nxt) == 1))
2260 if (OP(nxt) != CLOSE)
2262 /* Now we know that nxt2 is the only contents: */
2263 oscan->flags = (U8)ARG(nxt);
2265 OP(nxt1) = NOTHING; /* was OPEN. */
2267 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2268 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2269 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2270 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2271 OP(nxt + 1) = OPTIMIZED; /* was count. */
2272 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
2277 /* Try optimization CURLYX => CURLYM. */
2278 if ( OP(oscan) == CURLYX && data
2279 && !(data->flags & SF_HAS_PAR)
2280 && !(data->flags & SF_HAS_EVAL)
2281 && !deltanext /* atom is fixed width */
2282 && minnext != 0 /* CURLYM can't handle zero width */
2284 /* XXXX How to optimize if data == 0? */
2285 /* Optimize to a simpler form. */
2286 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2290 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
2291 && (OP(nxt2) != WHILEM))
2293 OP(nxt2) = SUCCEED; /* Whas WHILEM */
2294 /* Need to optimize away parenths. */
2295 if (data->flags & SF_IN_PAR) {
2296 /* Set the parenth number. */
2297 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2299 if (OP(nxt) != CLOSE)
2300 FAIL("Panic opt close");
2301 oscan->flags = (U8)ARG(nxt);
2302 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2303 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2305 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2306 OP(nxt + 1) = OPTIMIZED; /* was count. */
2307 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2308 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
2311 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2312 regnode *nnxt = regnext(nxt1);
2315 if (reg_off_by_arg[OP(nxt1)])
2316 ARG_SET(nxt1, nxt2 - nxt1);
2317 else if (nxt2 - nxt1 < U16_MAX)
2318 NEXT_OFF(nxt1) = nxt2 - nxt1;
2320 OP(nxt) = NOTHING; /* Cannot beautify */
2325 /* Optimize again: */
2326 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
2332 else if ((OP(oscan) == CURLYX)
2333 && (flags & SCF_WHILEM_VISITED_POS)
2334 /* See the comment on a similar expression above.
2335 However, this time it not a subexpression
2336 we care about, but the expression itself. */
2337 && (maxcount == REG_INFTY)
2338 && data && ++data->whilem_c < 16) {
2339 /* This stays as CURLYX, we can put the count/of pair. */
2340 /* Find WHILEM (as in regexec.c) */
2341 regnode *nxt = oscan + NEXT_OFF(oscan);
2343 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2345 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2346 | (RExC_whilem_seen << 4)); /* On WHILEM */
2348 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
2350 if (flags & SCF_DO_SUBSTR) {
2351 SV *last_str = Nullsv;
2352 int counted = mincount != 0;
2354 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
2355 #if defined(SPARC64_GCC_WORKAROUND)
2361 if (pos_before >= data->last_start_min)
2364 b = data->last_start_min;
2367 s = SvPV(data->last_found, l);
2368 old = b - data->last_start_min;
2371 I32 b = pos_before >= data->last_start_min
2372 ? pos_before : data->last_start_min;
2374 char *s = SvPV(data->last_found, l);
2375 I32 old = b - data->last_start_min;
2379 old = utf8_hop((U8*)s, old) - (U8*)s;
2382 /* Get the added string: */
2383 last_str = newSVpvn(s + old, l);
2385 SvUTF8_on(last_str);
2386 if (deltanext == 0 && pos_before == b) {
2387 /* What was added is a constant string */
2389 SvGROW(last_str, (mincount * l) + 1);
2390 repeatcpy(SvPVX(last_str) + l,
2391 SvPVX(last_str), l, mincount - 1);
2392 SvCUR_set(last_str, SvCUR(last_str) * mincount);
2393 /* Add additional parts. */
2394 SvCUR_set(data->last_found,
2395 SvCUR(data->last_found) - l);
2396 sv_catsv(data->last_found, last_str);
2398 SV * sv = data->last_found;
2400 SvUTF8(sv) && SvMAGICAL(sv) ?
2401 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2402 if (mg && mg->mg_len >= 0)
2403 mg->mg_len += CHR_SVLEN(last_str);
2405 data->last_end += l * (mincount - 1);
2408 /* start offset must point into the last copy */
2409 data->last_start_min += minnext * (mincount - 1);
2410 data->last_start_max += is_inf ? I32_MAX
2411 : (maxcount - 1) * (minnext + data->pos_delta);
2414 /* It is counted once already... */
2415 data->pos_min += minnext * (mincount - counted);
2416 data->pos_delta += - counted * deltanext +
2417 (minnext + deltanext) * maxcount - minnext * mincount;
2418 if (mincount != maxcount) {
2419 /* Cannot extend fixed substrings found inside
2421 scan_commit(pRExC_state,data);
2422 if (mincount && last_str) {
2423 sv_setsv(data->last_found, last_str);
2424 data->last_end = data->pos_min;
2425 data->last_start_min =
2426 data->pos_min - CHR_SVLEN(last_str);
2427 data->last_start_max = is_inf
2429 : data->pos_min + data->pos_delta
2430 - CHR_SVLEN(last_str);
2432 data->longest = &(data->longest_float);
2434 SvREFCNT_dec(last_str);
2436 if (data && (fl & SF_HAS_EVAL))
2437 data->flags |= SF_HAS_EVAL;
2438 optimize_curly_tail:
2439 if (OP(oscan) != CURLYX) {
2440 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
2442 NEXT_OFF(oscan) += NEXT_OFF(next);
2445 default: /* REF and CLUMP only? */
2446 if (flags & SCF_DO_SUBSTR) {
2447 scan_commit(pRExC_state,data); /* Cannot expect anything... */
2448 data->longest = &(data->longest_float);
2450 is_inf = is_inf_internal = 1;
2451 if (flags & SCF_DO_STCLASS_OR)
2452 cl_anything(pRExC_state, data->start_class);
2453 flags &= ~SCF_DO_STCLASS;
2457 else if (strchr((const char*)PL_simple,OP(scan))) {
2460 if (flags & SCF_DO_SUBSTR) {
2461 scan_commit(pRExC_state,data);
2465 if (flags & SCF_DO_STCLASS) {
2466 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
2468 /* Some of the logic below assumes that switching
2469 locale on will only add false positives. */
2470 switch (PL_regkind[(U8)OP(scan)]) {
2474 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
2475 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2476 cl_anything(pRExC_state, data->start_class);
2479 if (OP(scan) == SANY)
2481 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
2482 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
2483 || (data->start_class->flags & ANYOF_CLASS));
2484 cl_anything(pRExC_state, data->start_class);
2486 if (flags & SCF_DO_STCLASS_AND || !value)
2487 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
2490 if (flags & SCF_DO_STCLASS_AND)
2491 cl_and(data->start_class,
2492 (struct regnode_charclass_class*)scan);
2494 cl_or(pRExC_state, data->start_class,
2495 (struct regnode_charclass_class*)scan);
2498 if (flags & SCF_DO_STCLASS_AND) {
2499 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2500 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2501 for (value = 0; value < 256; value++)
2502 if (!isALNUM(value))
2503 ANYOF_BITMAP_CLEAR(data->start_class, value);
2507 if (data->start_class->flags & ANYOF_LOCALE)
2508 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2510 for (value = 0; value < 256; value++)
2512 ANYOF_BITMAP_SET(data->start_class, value);
2517 if (flags & SCF_DO_STCLASS_AND) {
2518 if (data->start_class->flags & ANYOF_LOCALE)
2519 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2522 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2523 data->start_class->flags |= ANYOF_LOCALE;
2527 if (flags & SCF_DO_STCLASS_AND) {
2528 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2529 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2530 for (value = 0; value < 256; value++)
2532 ANYOF_BITMAP_CLEAR(data->start_class, value);
2536 if (data->start_class->flags & ANYOF_LOCALE)
2537 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2539 for (value = 0; value < 256; value++)
2540 if (!isALNUM(value))
2541 ANYOF_BITMAP_SET(data->start_class, value);
2546 if (flags & SCF_DO_STCLASS_AND) {
2547 if (data->start_class->flags & ANYOF_LOCALE)
2548 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2551 data->start_class->flags |= ANYOF_LOCALE;
2552 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2556 if (flags & SCF_DO_STCLASS_AND) {
2557 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2558 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2559 for (value = 0; value < 256; value++)
2560 if (!isSPACE(value))
2561 ANYOF_BITMAP_CLEAR(data->start_class, value);
2565 if (data->start_class->flags & ANYOF_LOCALE)
2566 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2568 for (value = 0; value < 256; value++)
2570 ANYOF_BITMAP_SET(data->start_class, value);
2575 if (flags & SCF_DO_STCLASS_AND) {
2576 if (data->start_class->flags & ANYOF_LOCALE)
2577 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2580 data->start_class->flags |= ANYOF_LOCALE;
2581 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2585 if (flags & SCF_DO_STCLASS_AND) {
2586 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2587 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2588 for (value = 0; value < 256; value++)
2590 ANYOF_BITMAP_CLEAR(data->start_class, value);
2594 if (data->start_class->flags & ANYOF_LOCALE)
2595 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2597 for (value = 0; value < 256; value++)
2598 if (!isSPACE(value))
2599 ANYOF_BITMAP_SET(data->start_class, value);
2604 if (flags & SCF_DO_STCLASS_AND) {
2605 if (data->start_class->flags & ANYOF_LOCALE) {
2606 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2607 for (value = 0; value < 256; value++)
2608 if (!isSPACE(value))
2609 ANYOF_BITMAP_CLEAR(data->start_class, value);
2613 data->start_class->flags |= ANYOF_LOCALE;
2614 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2618 if (flags & SCF_DO_STCLASS_AND) {
2619 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
2620 for (value = 0; value < 256; value++)
2621 if (!isDIGIT(value))
2622 ANYOF_BITMAP_CLEAR(data->start_class, value);
2625 if (data->start_class->flags & ANYOF_LOCALE)
2626 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
2628 for (value = 0; value < 256; value++)
2630 ANYOF_BITMAP_SET(data->start_class, value);
2635 if (flags & SCF_DO_STCLASS_AND) {
2636 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
2637 for (value = 0; value < 256; value++)
2639 ANYOF_BITMAP_CLEAR(data->start_class, value);
2642 if (data->start_class->flags & ANYOF_LOCALE)
2643 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
2645 for (value = 0; value < 256; value++)
2646 if (!isDIGIT(value))
2647 ANYOF_BITMAP_SET(data->start_class, value);
2652 if (flags & SCF_DO_STCLASS_OR)
2653 cl_and(data->start_class, &and_with);
2654 flags &= ~SCF_DO_STCLASS;
2657 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
2658 data->flags |= (OP(scan) == MEOL
2662 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
2663 /* Lookbehind, or need to calculate parens/evals/stclass: */
2664 && (scan->flags || data || (flags & SCF_DO_STCLASS))
2665 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
2666 /* Lookahead/lookbehind */
2667 I32 deltanext, minnext, fake = 0;
2669 struct regnode_charclass_class intrnl;
2672 data_fake.flags = 0;
2674 data_fake.whilem_c = data->whilem_c;
2675 data_fake.last_closep = data->last_closep;
2678 data_fake.last_closep = &fake;
2679 if ( flags & SCF_DO_STCLASS && !scan->flags
2680 && OP(scan) == IFMATCH ) { /* Lookahead */
2681 cl_init(pRExC_state, &intrnl);
2682 data_fake.start_class = &intrnl;
2683 f |= SCF_DO_STCLASS_AND;
2685 if (flags & SCF_WHILEM_VISITED_POS)
2686 f |= SCF_WHILEM_VISITED_POS;
2687 next = regnext(scan);
2688 nscan = NEXTOPER(NEXTOPER(scan));
2689 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f,depth+1);
2692 vFAIL("Variable length lookbehind not implemented");
2694 else if (minnext > U8_MAX) {
2695 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
2697 scan->flags = (U8)minnext;
2699 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2701 if (data && (data_fake.flags & SF_HAS_EVAL))
2702 data->flags |= SF_HAS_EVAL;
2704 data->whilem_c = data_fake.whilem_c;
2705 if (f & SCF_DO_STCLASS_AND) {
2706 int was = (data->start_class->flags & ANYOF_EOS);
2708 cl_and(data->start_class, &intrnl);
2710 data->start_class->flags |= ANYOF_EOS;
2713 else if (OP(scan) == OPEN) {
2716 else if (OP(scan) == CLOSE) {
2717 if ((I32)ARG(scan) == is_par) {
2718 next = regnext(scan);
2720 if ( next && (OP(next) != WHILEM) && next < last)
2721 is_par = 0; /* Disable optimization */
2724 *(data->last_closep) = ARG(scan);
2726 else if (OP(scan) == EVAL) {
2728 data->flags |= SF_HAS_EVAL;
2730 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
2731 if (flags & SCF_DO_SUBSTR) {
2732 scan_commit(pRExC_state,data);
2733 data->longest = &(data->longest_float);
2735 is_inf = is_inf_internal = 1;
2736 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2737 cl_anything(pRExC_state, data->start_class);
2738 flags &= ~SCF_DO_STCLASS;
2740 /* Else: zero-length, ignore. */
2741 scan = regnext(scan);
2746 *deltap = is_inf_internal ? I32_MAX : delta;
2747 if (flags & SCF_DO_SUBSTR && is_inf)
2748 data->pos_delta = I32_MAX - data->pos_min;
2749 if (is_par > U8_MAX)
2751 if (is_par && pars==1 && data) {
2752 data->flags |= SF_IN_PAR;
2753 data->flags &= ~SF_HAS_PAR;
2755 else if (pars && data) {
2756 data->flags |= SF_HAS_PAR;
2757 data->flags &= ~SF_IN_PAR;
2759 if (flags & SCF_DO_STCLASS_OR)
2760 cl_and(data->start_class, &and_with);
2765 S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, const char *s)
2767 if (RExC_rx->data) {
2768 Renewc(RExC_rx->data,
2769 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
2770 char, struct reg_data);
2771 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
2772 RExC_rx->data->count += n;
2775 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
2776 char, struct reg_data);
2777 New(1208, RExC_rx->data->what, n, U8);
2778 RExC_rx->data->count = n;
2780 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
2781 return RExC_rx->data->count - n;
2785 Perl_reginitcolors(pTHX)
2788 char *s = PerlEnv_getenv("PERL_RE_COLORS");
2791 PL_colors[0] = s = savepv(s);
2793 s = strchr(s, '\t');
2799 PL_colors[i] = s = (char *)"";
2803 PL_colors[i++] = (char *)"";
2810 - pregcomp - compile a regular expression into internal code
2812 * We can't allocate space until we know how big the compiled form will be,
2813 * but we can't compile it (and thus know how big it is) until we've got a
2814 * place to put the code. So we cheat: we compile it twice, once with code
2815 * generation turned off and size counting turned on, and once "for real".
2816 * This also means that we don't allocate space until we are sure that the
2817 * thing really will compile successfully, and we never have to move the
2818 * code and thus invalidate pointers into it. (Note that it has to be in
2819 * one piece because free() must be able to free it all.) [NB: not true in perl]
2821 * Beware that the optimization-preparation code in here knows about some
2822 * of the structure of the compiled regexp. [I'll say.]
2825 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
2835 RExC_state_t RExC_state;
2836 RExC_state_t *pRExC_state = &RExC_state;
2838 GET_RE_DEBUG_FLAGS_DECL;
2841 FAIL("NULL regexp argument");
2843 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
2846 DEBUG_r(if (!PL_colorset) reginitcolors());
2848 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s \"%s%*s%s\"\n",
2849 PL_colors[4],PL_colors[5],PL_colors[0],
2850 (int)(xend - exp), RExC_precomp, PL_colors[1]);
2852 RExC_flags = pm->op_pmflags;
2856 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
2857 RExC_seen_evals = 0;
2860 /* First pass: determine size, legality. */
2867 RExC_emit = &PL_regdummy;
2868 RExC_whilem_seen = 0;
2869 #if 0 /* REGC() is (currently) a NOP at the first pass.
2870 * Clever compilers notice this and complain. --jhi */
2871 REGC((U8)REG_MAGIC, (char*)RExC_emit);
2873 if (reg(pRExC_state, 0, &flags) == NULL) {
2874 RExC_precomp = Nullch;
2877 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
2879 /* Small enough for pointer-storage convention?
2880 If extralen==0, this means that we will not need long jumps. */
2881 if (RExC_size >= 0x10000L && RExC_extralen)
2882 RExC_size += RExC_extralen;
2885 if (RExC_whilem_seen > 15)
2886 RExC_whilem_seen = 15;
2888 /* Allocate space and initialize. */
2889 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
2892 FAIL("Regexp out of space");
2895 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
2896 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
2899 r->prelen = xend - exp;
2900 r->precomp = savepvn(RExC_precomp, r->prelen);
2902 #ifdef PERL_COPY_ON_WRITE
2903 r->saved_copy = Nullsv;
2905 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
2906 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
2908 r->substrs = 0; /* Useful during FAIL. */
2909 r->startp = 0; /* Useful during FAIL. */
2910 r->endp = 0; /* Useful during FAIL. */
2912 Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
2914 r->offsets[0] = RExC_size;
2916 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2917 "%s %"UVuf" bytes for offset annotations.\n",
2918 r->offsets ? "Got" : "Couldn't get",
2919 (UV)((2*RExC_size+1) * sizeof(U32))));
2923 /* Second pass: emit code. */
2924 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
2929 RExC_emit_start = r->program;
2930 RExC_emit = r->program;
2931 /* Store the count of eval-groups for security checks: */
2932 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
2933 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
2935 if (reg(pRExC_state, 0, &flags) == NULL)
2939 /* Dig out information for optimizations. */
2940 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
2941 pm->op_pmflags = RExC_flags;
2943 r->reganch |= ROPT_UTF8; /* Unicode in it? */
2944 r->regstclass = NULL;
2945 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
2946 r->reganch |= ROPT_NAUGHTY;
2947 scan = r->program + 1; /* First BRANCH. */
2949 /* XXXX To minimize changes to RE engine we always allocate
2950 3-units-long substrs field. */
2951 Newz(1004, r->substrs, 1, struct reg_substr_data);
2953 StructCopy(&zero_scan_data, &data, scan_data_t);
2954 /* XXXX Should not we check for something else? Usually it is OPEN1... */
2955 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
2957 STRLEN longest_float_length, longest_fixed_length;
2958 struct regnode_charclass_class ch_class;
2963 /* Skip introductions and multiplicators >= 1. */
2964 while ((OP(first) == OPEN && (sawopen = 1)) ||
2965 /* An OR of *one* alternative - should not happen now. */
2966 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
2967 (OP(first) == PLUS) ||
2968 (OP(first) == MINMOD) ||
2969 /* An {n,m} with n>0 */
2970 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
2971 if (OP(first) == PLUS)
2974 first += regarglen[(U8)OP(first)];
2975 first = NEXTOPER(first);
2978 /* Starting-point info. */
2980 if (PL_regkind[(U8)OP(first)] == EXACT) {
2981 if (OP(first) == EXACT)
2982 ; /* Empty, get anchored substr later. */
2983 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
2984 r->regstclass = first;
2986 else if (strchr((const char*)PL_simple,OP(first)))
2987 r->regstclass = first;
2988 else if (PL_regkind[(U8)OP(first)] == BOUND ||
2989 PL_regkind[(U8)OP(first)] == NBOUND)
2990 r->regstclass = first;
2991 else if (PL_regkind[(U8)OP(first)] == BOL) {
2992 r->reganch |= (OP(first) == MBOL
2994 : (OP(first) == SBOL
2997 first = NEXTOPER(first);
3000 else if (OP(first) == GPOS) {
3001 r->reganch |= ROPT_ANCH_GPOS;
3002 first = NEXTOPER(first);
3005 else if (!sawopen && (OP(first) == STAR &&
3006 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
3007 !(r->reganch & ROPT_ANCH) )
3009 /* turn .* into ^.* with an implied $*=1 */
3010 int type = OP(NEXTOPER(first));
3012 if (type == REG_ANY)
3013 type = ROPT_ANCH_MBOL;
3015 type = ROPT_ANCH_SBOL;
3017 r->reganch |= type | ROPT_IMPLICIT;
3018 first = NEXTOPER(first);
3021 if (sawplus && (!sawopen || !RExC_sawback)
3022 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
3023 /* x+ must match at the 1st pos of run of x's */
3024 r->reganch |= ROPT_SKIP;
3026 /* Scan is after the zeroth branch, first is atomic matcher. */
3027 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
3028 (IV)(first - scan + 1)));
3030 * If there's something expensive in the r.e., find the
3031 * longest literal string that must appear and make it the
3032 * regmust. Resolve ties in favor of later strings, since
3033 * the regstart check works with the beginning of the r.e.
3034 * and avoiding duplication strengthens checking. Not a
3035 * strong reason, but sufficient in the absence of others.
3036 * [Now we resolve ties in favor of the earlier string if
3037 * it happens that c_offset_min has been invalidated, since the
3038 * earlier string may buy us something the later one won't.]
3042 data.longest_fixed = newSVpvn("",0);
3043 data.longest_float = newSVpvn("",0);
3044 data.last_found = newSVpvn("",0);
3045 data.longest = &(data.longest_fixed);
3047 if (!r->regstclass) {
3048 cl_init(pRExC_state, &ch_class);
3049 data.start_class = &ch_class;
3050 stclass_flag = SCF_DO_STCLASS_AND;
3051 } else /* XXXX Check for BOUND? */
3053 data.last_closep = &last_close;
3055 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
3056 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
3057 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
3058 && data.last_start_min == 0 && data.last_end > 0
3059 && !RExC_seen_zerolen
3060 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
3061 r->reganch |= ROPT_CHECK_ALL;
3062 scan_commit(pRExC_state, &data);
3063 SvREFCNT_dec(data.last_found);
3065 longest_float_length = CHR_SVLEN(data.longest_float);
3066 if (longest_float_length
3067 || (data.flags & SF_FL_BEFORE_EOL
3068 && (!(data.flags & SF_FL_BEFORE_MEOL)
3069 || (RExC_flags & PMf_MULTILINE)))) {
3072 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
3073 && data.offset_fixed == data.offset_float_min
3074 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
3075 goto remove_float; /* As in (a)+. */
3077 if (SvUTF8(data.longest_float)) {
3078 r->float_utf8 = data.longest_float;
3079 r->float_substr = Nullsv;
3081 r->float_substr = data.longest_float;
3082 r->float_utf8 = Nullsv;
3084 r->float_min_offset = data.offset_float_min;
3085 r->float_max_offset = data.offset_float_max;
3086 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
3087 && (!(data.flags & SF_FL_BEFORE_MEOL)
3088 || (RExC_flags & PMf_MULTILINE)));
3089 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
3093 r->float_substr = r->float_utf8 = Nullsv;
3094 SvREFCNT_dec(data.longest_float);
3095 longest_float_length = 0;
3098 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
3099 if (longest_fixed_length
3100 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
3101 && (!(data.flags & SF_FIX_BEFORE_MEOL)
3102 || (RExC_flags & PMf_MULTILINE)))) {
3105 if (SvUTF8(data.longest_fixed)) {
3106 r->anchored_utf8 = data.longest_fixed;
3107 r->anchored_substr = Nullsv;
3109 r->anchored_substr = data.longest_fixed;
3110 r->anchored_utf8 = Nullsv;
3112 r->anchored_offset = data.offset_fixed;
3113 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
3114 && (!(data.flags & SF_FIX_BEFORE_MEOL)
3115 || (RExC_flags & PMf_MULTILINE)));
3116 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
3119 r->anchored_substr = r->anchored_utf8 = Nullsv;
3120 SvREFCNT_dec(data.longest_fixed);
3121 longest_fixed_length = 0;
3124 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
3125 r->regstclass = NULL;
3126 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
3128 && !(data.start_class->flags & ANYOF_EOS)
3129 && !cl_is_anything(data.start_class))
3131 I32 n = add_data(pRExC_state, 1, "f");
3133 New(1006, RExC_rx->data->data[n], 1,
3134 struct regnode_charclass_class);
3135 StructCopy(data.start_class,
3136 (struct regnode_charclass_class*)RExC_rx->data->data[n],
3137 struct regnode_charclass_class);
3138 r->regstclass = (regnode*)RExC_rx->data->data[n];
3139 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
3140 PL_regdata = r->data; /* for regprop() */
3141 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
3142 regprop(sv, (regnode*)data.start_class);
3143 PerlIO_printf(Perl_debug_log,
3144 "synthetic stclass `%s'.\n",
3148 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
3149 if (longest_fixed_length > longest_float_length) {
3150 r->check_substr = r->anchored_substr;
3151 r->check_utf8 = r->anchored_utf8;
3152 r->check_offset_min = r->check_offset_max = r->anchored_offset;
3153 if (r->reganch & ROPT_ANCH_SINGLE)
3154 r->reganch |= ROPT_NOSCAN;
3157 r->check_substr = r->float_substr;
3158 r->check_utf8 = r->float_utf8;
3159 r->check_offset_min = data.offset_float_min;
3160 r->check_offset_max = data.offset_float_max;
3162 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
3163 This should be changed ASAP! */
3164 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
3165 r->reganch |= RE_USE_INTUIT;
3166 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
3167 r->reganch |= RE_INTUIT_TAIL;
3171 /* Several toplevels. Best we can is to set minlen. */
3173 struct regnode_charclass_class ch_class;
3176 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
3177 scan = r->program + 1;
3178 cl_init(pRExC_state, &ch_class);
3179 data.start_class = &ch_class;
3180 data.last_closep = &last_close;
3181 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
3182 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
3183 = r->float_substr = r->float_utf8 = Nullsv;
3184 if (!(data.start_class->flags & ANYOF_EOS)
3185 && !cl_is_anything(data.start_class))
3187 I32 n = add_data(pRExC_state, 1, "f");
3189 New(1006, RExC_rx->data->data[n], 1,
3190 struct regnode_charclass_class);
3191 StructCopy(data.start_class,
3192 (struct regnode_charclass_class*)RExC_rx->data->data[n],
3193 struct regnode_charclass_class);
3194 r->regstclass = (regnode*)RExC_rx->data->data[n];
3195 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
3196 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
3197 regprop(sv, (regnode*)data.start_class);
3198 PerlIO_printf(Perl_debug_log,
3199 "synthetic stclass `%s'.\n",
3205 if (RExC_seen & REG_SEEN_GPOS)
3206 r->reganch |= ROPT_GPOS_SEEN;
3207 if (RExC_seen & REG_SEEN_LOOKBEHIND)
3208 r->reganch |= ROPT_LOOKBEHIND_SEEN;
3209 if (RExC_seen & REG_SEEN_EVAL)
3210 r->reganch |= ROPT_EVAL_SEEN;
3211 if (RExC_seen & REG_SEEN_CANY)
3212 r->reganch |= ROPT_CANY_SEEN;
3213 Newz(1002, r->startp, RExC_npar, I32);
3214 Newz(1002, r->endp, RExC_npar, I32);
3215 PL_regdata = r->data; /* for regprop() */
3216 DEBUG_COMPILE_r(regdump(r));
3221 - reg - regular expression, i.e. main body or parenthesized thing
3223 * Caller must absorb opening parenthesis.
3225 * Combining parenthesis handling with the base level of regular expression
3226 * is a trifle forced, but the need to tie the tails of the branches to what
3227 * follows makes it hard to avoid.
3230 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
3231 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
3234 register regnode *ret; /* Will be the head of the group. */
3235 register regnode *br;
3236 register regnode *lastbr;
3237 register regnode *ender = 0;
3238 register I32 parno = 0;
3239 I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0;
3241 /* for (?g), (?gc), and (?o) warnings; warning
3242 about (?c) will warn about (?g) -- japhy */
3244 I32 wastedflags = 0x00,
3247 wasted_gc = 0x02 | 0x04,
3250 char * parse_start = RExC_parse; /* MJD */
3251 char *oregcomp_parse = RExC_parse;
3254 *flagp = 0; /* Tentatively. */
3257 /* Make an OPEN node, if parenthesized. */
3259 if (*RExC_parse == '?') { /* (?...) */
3260 U32 posflags = 0, negflags = 0;
3261 U32 *flagsp = &posflags;
3263 char *seqstart = RExC_parse;
3266 paren = *RExC_parse++;
3267 ret = NULL; /* For look-ahead/behind. */
3269 case '<': /* (?<...) */
3270 RExC_seen |= REG_SEEN_LOOKBEHIND;
3271 if (*RExC_parse == '!')
3273 if (*RExC_parse != '=' && *RExC_parse != '!')
3276 case '=': /* (?=...) */
3277 case '!': /* (?!...) */
3278 RExC_seen_zerolen++;
3279 case ':': /* (?:...) */
3280 case '>': /* (?>...) */
3282 case '$': /* (?$...) */
3283 case '@': /* (?@...) */
3284 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
3286 case '#': /* (?#...) */
3287 while (*RExC_parse && *RExC_parse != ')')
3289 if (*RExC_parse != ')')
3290 FAIL("Sequence (?#... not terminated");
3291 nextchar(pRExC_state);
3294 case 'p': /* (?p...) */
3295 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
3296 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
3298 case '?': /* (??...) */
3300 if (*RExC_parse != '{')
3302 paren = *RExC_parse++;
3304 case '{': /* (?{...}) */
3306 I32 count = 1, n = 0;
3308 char *s = RExC_parse;
3310 OP_4tree *sop, *rop;
3312 RExC_seen_zerolen++;
3313 RExC_seen |= REG_SEEN_EVAL;
3314 while (count && (c = *RExC_parse)) {
3315 if (c == '\\' && RExC_parse[1])
3323 if (*RExC_parse != ')')
3326 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
3331 if (RExC_parse - 1 - s)
3332 sv = newSVpvn(s, RExC_parse - 1 - s);
3334 sv = newSVpvn("", 0);
3337 Perl_save_re_context(aTHX);
3338 rop = sv_compile_2op(sv, &sop, "re", &pad);
3339 sop->op_private |= OPpREFCOUNTED;
3340 /* re_dup will OpREFCNT_inc */
3341 OpREFCNT_set(sop, 1);
3344 n = add_data(pRExC_state, 3, "nop");
3345 RExC_rx->data->data[n] = (void*)rop;
3346 RExC_rx->data->data[n+1] = (void*)sop;
3347 RExC_rx->data->data[n+2] = (void*)pad;
3350 else { /* First pass */
3351 if (PL_reginterp_cnt < ++RExC_seen_evals
3353 /* No compiled RE interpolated, has runtime
3354 components ===> unsafe. */
3355 FAIL("Eval-group not allowed at runtime, use re 'eval'");
3356 if (PL_tainting && PL_tainted)
3357 FAIL("Eval-group in insecure regular expression");
3358 if (IN_PERL_COMPILETIME)
3362 nextchar(pRExC_state);
3364 ret = reg_node(pRExC_state, LOGICAL);
3367 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
3368 /* deal with the length of this later - MJD */
3371 ret = reganode(pRExC_state, EVAL, n);
3372 Set_Node_Length(ret, RExC_parse - parse_start + 1);
3373 Set_Node_Offset(ret, parse_start);
3376 case '(': /* (?(?{...})...) and (?(?=...)...) */
3378 if (RExC_parse[0] == '?') { /* (?(?...)) */
3379 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
3380 || RExC_parse[1] == '<'
3381 || RExC_parse[1] == '{') { /* Lookahead or eval. */
3384 ret = reg_node(pRExC_state, LOGICAL);
3387 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
3391 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
3393 parno = atoi(RExC_parse++);
3395 while (isDIGIT(*RExC_parse))
3397 ret = reganode(pRExC_state, GROUPP, parno);
3399 if ((c = *nextchar(pRExC_state)) != ')')
3400 vFAIL("Switch condition not recognized");
3402 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
3403 br = regbranch(pRExC_state, &flags, 1);
3405 br = reganode(pRExC_state, LONGJMP, 0);
3407 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
3408 c = *nextchar(pRExC_state);
3412 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3413 regbranch(pRExC_state, &flags, 1);
3414 regtail(pRExC_state, ret, lastbr);
3417 c = *nextchar(pRExC_state);
3422 vFAIL("Switch (?(condition)... contains too many branches");
3423 ender = reg_node(pRExC_state, TAIL);
3424 regtail(pRExC_state, br, ender);
3426 regtail(pRExC_state, lastbr, ender);
3427 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
3430 regtail(pRExC_state, ret, ender);
3434 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
3438 RExC_parse--; /* for vFAIL to print correctly */
3439 vFAIL("Sequence (? incomplete");
3443 parse_flags: /* (?i) */
3444 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
3445 /* (?g), (?gc) and (?o) are useless here
3446 and must be globally applied -- japhy */
3448 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
3449 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3450 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
3451 if (! (wastedflags & wflagbit) ) {
3452 wastedflags |= wflagbit;
3455 "Useless (%s%c) - %suse /%c modifier",
3456 flagsp == &negflags ? "?-" : "?",
3458 flagsp == &negflags ? "don't " : "",
3464 else if (*RExC_parse == 'c') {
3465 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3466 if (! (wastedflags & wasted_c) ) {
3467 wastedflags |= wasted_gc;
3470 "Useless (%sc) - %suse /gc modifier",
3471 flagsp == &negflags ? "?-" : "?",
3472 flagsp == &negflags ? "don't " : ""
3477 else { pmflag(flagsp, *RExC_parse); }
3481 if (*RExC_parse == '-') {
3483 wastedflags = 0; /* reset so (?g-c) warns twice */
3487 RExC_flags |= posflags;
3488 RExC_flags &= ~negflags;
3489 if (*RExC_parse == ':') {
3495 if (*RExC_parse != ')') {
3497 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
3499 nextchar(pRExC_state);
3507 ret = reganode(pRExC_state, OPEN, parno);
3508 Set_Node_Length(ret, 1); /* MJD */
3509 Set_Node_Offset(ret, RExC_parse); /* MJD */
3516 /* Pick up the branches, linking them together. */
3517 parse_start = RExC_parse; /* MJD */
3518 br = regbranch(pRExC_state, &flags, 1);
3519 /* branch_len = (paren != 0); */
3523 if (*RExC_parse == '|') {
3524 if (!SIZE_ONLY && RExC_extralen) {
3525 reginsert(pRExC_state, BRANCHJ, br);
3528 reginsert(pRExC_state, BRANCH, br);
3529 Set_Node_Length(br, paren != 0);
3530 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
3534 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
3536 else if (paren == ':') {
3537 *flagp |= flags&SIMPLE;
3539 if (open) { /* Starts with OPEN. */
3540 regtail(pRExC_state, ret, br); /* OPEN -> first. */
3542 else if (paren != '?') /* Not Conditional */
3544 *flagp |= flags & (SPSTART | HASWIDTH);
3546 while (*RExC_parse == '|') {
3547 if (!SIZE_ONLY && RExC_extralen) {
3548 ender = reganode(pRExC_state, LONGJMP,0);
3549 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
3552 RExC_extralen += 2; /* Account for LONGJMP. */
3553 nextchar(pRExC_state);
3554 br = regbranch(pRExC_state, &flags, 0);
3558 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
3562 *flagp |= flags&SPSTART;
3565 if (have_branch || paren != ':') {
3566 /* Make a closing node, and hook it on the end. */
3569 ender = reg_node(pRExC_state, TAIL);
3572 ender = reganode(pRExC_state, CLOSE, parno);
3573 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
3574 Set_Node_Length(ender,1); /* MJD */
3580 *flagp &= ~HASWIDTH;
3583 ender = reg_node(pRExC_state, SUCCEED);
3586 ender = reg_node(pRExC_state, END);
3589 regtail(pRExC_state, lastbr, ender);
3592 /* Hook the tails of the branches to the closing node. */
3593 for (br = ret; br != NULL; br = regnext(br)) {
3594 regoptail(pRExC_state, br, ender);
3601 static const char parens[] = "=!<,>";
3603 if (paren && (p = strchr(parens, paren))) {
3604 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
3605 int flag = (p - parens) > 1;
3608 node = SUSPEND, flag = 0;
3609 reginsert(pRExC_state, node,ret);
3610 Set_Node_Cur_Length(ret);
3611 Set_Node_Offset(ret, parse_start + 1);
3613 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
3617 /* Check for proper termination. */
3619 RExC_flags = oregflags;
3620 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
3621 RExC_parse = oregcomp_parse;
3622 vFAIL("Unmatched (");
3625 else if (!paren && RExC_parse < RExC_end) {
3626 if (*RExC_parse == ')') {
3628 vFAIL("Unmatched )");
3631 FAIL("Junk on end of regexp"); /* "Can't happen". */
3639 - regbranch - one alternative of an | operator
3641 * Implements the concatenation operator.
3644 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
3646 register regnode *ret;
3647 register regnode *chain = NULL;
3648 register regnode *latest;
3649 I32 flags = 0, c = 0;
3654 if (!SIZE_ONLY && RExC_extralen)
3655 ret = reganode(pRExC_state, BRANCHJ,0);
3657 ret = reg_node(pRExC_state, BRANCH);
3658 Set_Node_Length(ret, 1);
3662 if (!first && SIZE_ONLY)
3663 RExC_extralen += 1; /* BRANCHJ */
3665 *flagp = WORST; /* Tentatively. */
3668 nextchar(pRExC_state);
3669 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
3671 latest = regpiece(pRExC_state, &flags);
3672 if (latest == NULL) {
3673 if (flags & TRYAGAIN)
3677 else if (ret == NULL)
3679 *flagp |= flags&HASWIDTH;
3680 if (chain == NULL) /* First piece. */
3681 *flagp |= flags&SPSTART;
3684 regtail(pRExC_state, chain, latest);
3689 if (chain == NULL) { /* Loop ran zero times. */
3690 chain = reg_node(pRExC_state, NOTHING);
3695 *flagp |= flags&SIMPLE;
3702 - regpiece - something followed by possible [*+?]
3704 * Note that the branching code sequences used for ? and the general cases
3705 * of * and + are somewhat optimized: they use the same NOTHING node as
3706 * both the endmarker for their branch list and the body of the last branch.
3707 * It might seem that this node could be dispensed with entirely, but the
3708 * endmarker role is not redundant.
3711 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
3713 register regnode *ret;
3715 register char *next;
3717 char *origparse = RExC_parse;
3720 I32 max = REG_INFTY;
3723 ret = regatom(pRExC_state, &flags);
3725 if (flags & TRYAGAIN)
3732 if (op == '{' && regcurly(RExC_parse)) {
3733 parse_start = RExC_parse; /* MJD */
3734 next = RExC_parse + 1;
3736 while (isDIGIT(*next) || *next == ',') {
3745 if (*next == '}') { /* got one */
3749 min = atoi(RExC_parse);
3753 maxpos = RExC_parse;
3755 if (!max && *maxpos != '0')
3756 max = REG_INFTY; /* meaning "infinity" */
3757 else if (max >= REG_INFTY)
3758 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
3760 nextchar(pRExC_state);
3763 if ((flags&SIMPLE)) {
3764 RExC_naughty += 2 + RExC_naughty / 2;
3765 reginsert(pRExC_state, CURLY, ret);
3766 Set_Node_Offset(ret, parse_start+1); /* MJD */
3767 Set_Node_Cur_Length(ret);
3770 regnode *w = reg_node(pRExC_state, WHILEM);
3773 regtail(pRExC_state, ret, w);
3774 if (!SIZE_ONLY && RExC_extralen) {
3775 reginsert(pRExC_state, LONGJMP,ret);
3776 reginsert(pRExC_state, NOTHING,ret);
3777 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
3779 reginsert(pRExC_state, CURLYX,ret);
3781 Set_Node_Offset(ret, parse_start+1);
3782 Set_Node_Length(ret,
3783 op == '{' ? (RExC_parse - parse_start) : 1);
3785 if (!SIZE_ONLY && RExC_extralen)
3786 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3787 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
3789 RExC_whilem_seen++, RExC_extralen += 3;
3790 RExC_naughty += 4 + RExC_naughty; /* compound interest */
3798 if (max && max < min)
3799 vFAIL("Can't do {n,m} with n > m");
3801 ARG1_SET(ret, (U16)min);
3802 ARG2_SET(ret, (U16)max);
3814 #if 0 /* Now runtime fix should be reliable. */
3816 /* if this is reinstated, don't forget to put this back into perldiag:
3818 =item Regexp *+ operand could be empty at {#} in regex m/%s/
3820 (F) The part of the regexp subject to either the * or + quantifier
3821 could match an empty string. The {#} shows in the regular
3822 expression about where the problem was discovered.
3826 if (!(flags&HASWIDTH) && op != '?')
3827 vFAIL("Regexp *+ operand could be empty");
3830 parse_start = RExC_parse;
3831 nextchar(pRExC_state);
3833 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
3835 if (op == '*' && (flags&SIMPLE)) {
3836 reginsert(pRExC_state, STAR, ret);
3840 else if (op == '*') {
3844 else if (op == '+' && (flags&SIMPLE)) {
3845 reginsert(pRExC_state, PLUS, ret);
3849 else if (op == '+') {
3853 else if (op == '?') {
3858 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
3860 "%.*s matches null string many times",
3861 RExC_parse - origparse,
3865 if (*RExC_parse == '?') {
3866 nextchar(pRExC_state);
3867 reginsert(pRExC_state, MINMOD, ret);
3868 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
3870 if (ISMULT2(RExC_parse)) {
3872 vFAIL("Nested quantifiers");
3879 - regatom - the lowest level
3881 * Optimization: gobbles an entire sequence of ordinary characters so that
3882 * it can turn them into a single node, which is smaller to store and
3883 * faster to run. Backslashed characters are exceptions, each becoming a
3884 * separate node; the code is simpler that way and it's not worth fixing.
3886 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
3888 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
3890 register regnode *ret = 0;
3892 char *parse_start = RExC_parse;
3894 *flagp = WORST; /* Tentatively. */
3897 switch (*RExC_parse) {
3899 RExC_seen_zerolen++;
3900 nextchar(pRExC_state);
3901 if (RExC_flags & PMf_MULTILINE)
3902 ret = reg_node(pRExC_state, MBOL);
3903 else if (RExC_flags & PMf_SINGLELINE)
3904 ret = reg_node(pRExC_state, SBOL);
3906 ret = reg_node(pRExC_state, BOL);
3907 Set_Node_Length(ret, 1); /* MJD */
3910 nextchar(pRExC_state);
3912 RExC_seen_zerolen++;
3913 if (RExC_flags & PMf_MULTILINE)
3914 ret = reg_node(pRExC_state, MEOL);
3915 else if (RExC_flags & PMf_SINGLELINE)
3916 ret = reg_node(pRExC_state, SEOL);
3918 ret = reg_node(pRExC_state, EOL);
3919 Set_Node_Length(ret, 1); /* MJD */
3922 nextchar(pRExC_state);
3923 if (RExC_flags & PMf_SINGLELINE)
3924 ret = reg_node(pRExC_state, SANY);
3926 ret = reg_node(pRExC_state, REG_ANY);
3927 *flagp |= HASWIDTH|SIMPLE;
3929 Set_Node_Length(ret, 1); /* MJD */
3933 char *oregcomp_parse = ++RExC_parse;
3934 ret = regclass(pRExC_state);
3935 if (*RExC_parse != ']') {
3936 RExC_parse = oregcomp_parse;
3937 vFAIL("Unmatched [");
3939 nextchar(pRExC_state);
3940 *flagp |= HASWIDTH|SIMPLE;
3941 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
3945 nextchar(pRExC_state);
3946 ret = reg(pRExC_state, 1, &flags);
3948 if (flags & TRYAGAIN) {
3949 if (RExC_parse == RExC_end) {
3950 /* Make parent create an empty node if needed. */
3958 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
3962 if (flags & TRYAGAIN) {
3966 vFAIL("Internal urp");
3967 /* Supposed to be caught earlier. */
3970 if (!regcurly(RExC_parse)) {
3979 vFAIL("Quantifier follows nothing");
3982 switch (*++RExC_parse) {
3984 RExC_seen_zerolen++;
3985 ret = reg_node(pRExC_state, SBOL);
3987 nextchar(pRExC_state);
3988 Set_Node_Length(ret, 2); /* MJD */
3991 ret = reg_node(pRExC_state, GPOS);
3992 RExC_seen |= REG_SEEN_GPOS;
3994 nextchar(pRExC_state);
3995 Set_Node_Length(ret, 2); /* MJD */
3998 ret = reg_node(pRExC_state, SEOL);
4000 RExC_seen_zerolen++; /* Do not optimize RE away */
4001 nextchar(pRExC_state);
4004 ret = reg_node(pRExC_state, EOS);
4006 RExC_seen_zerolen++; /* Do not optimize RE away */
4007 nextchar(pRExC_state);
4008 Set_Node_Length(ret, 2); /* MJD */
4011 ret = reg_node(pRExC_state, CANY);
4012 RExC_seen |= REG_SEEN_CANY;
4013 *flagp |= HASWIDTH|SIMPLE;
4014 nextchar(pRExC_state);
4015 Set_Node_Length(ret, 2); /* MJD */
4018 ret = reg_node(pRExC_state, CLUMP);
4020 nextchar(pRExC_state);
4021 Set_Node_Length(ret, 2); /* MJD */
4024 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
4025 *flagp |= HASWIDTH|SIMPLE;
4026 nextchar(pRExC_state);
4027 Set_Node_Length(ret, 2); /* MJD */
4030 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
4031 *flagp |= HASWIDTH|SIMPLE;
4032 nextchar(pRExC_state);
4033 Set_Node_Length(ret, 2); /* MJD */
4036 RExC_seen_zerolen++;
4037 RExC_seen |= REG_SEEN_LOOKBEHIND;
4038 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
4040 nextchar(pRExC_state);
4041 Set_Node_Length(ret, 2); /* MJD */
4044 RExC_seen_zerolen++;
4045 RExC_seen |= REG_SEEN_LOOKBEHIND;
4046 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
4048 nextchar(pRExC_state);
4049 Set_Node_Length(ret, 2); /* MJD */
4052 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
4053 *flagp |= HASWIDTH|SIMPLE;
4054 nextchar(pRExC_state);
4055 Set_Node_Length(ret, 2); /* MJD */
4058 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
4059 *flagp |= HASWIDTH|SIMPLE;
4060 nextchar(pRExC_state);
4061 Set_Node_Length(ret, 2); /* MJD */
4064 ret = reg_node(pRExC_state, DIGIT);
4065 *flagp |= HASWIDTH|SIMPLE;
4066 nextchar(pRExC_state);
4067 Set_Node_Length(ret, 2); /* MJD */
4070 ret = reg_node(pRExC_state, NDIGIT);
4071 *flagp |= HASWIDTH|SIMPLE;
4072 nextchar(pRExC_state);
4073 Set_Node_Length(ret, 2); /* MJD */
4078 char* oldregxend = RExC_end;
4079 char* parse_start = RExC_parse - 2;
4081 if (RExC_parse[1] == '{') {
4082 /* a lovely hack--pretend we saw [\pX] instead */
4083 RExC_end = strchr(RExC_parse, '}');
4085 U8 c = (U8)*RExC_parse;
4087 RExC_end = oldregxend;
4088 vFAIL2("Missing right brace on \\%c{}", c);
4093 RExC_end = RExC_parse + 2;
4094 if (RExC_end > oldregxend)
4095 RExC_end = oldregxend;
4099 ret = regclass(pRExC_state);
4101 RExC_end = oldregxend;
4104 Set_Node_Offset(ret, parse_start + 2);
4105 Set_Node_Cur_Length(ret);
4106 nextchar(pRExC_state);
4107 *flagp |= HASWIDTH|SIMPLE;
4120 case '1': case '2': case '3': case '4':
4121 case '5': case '6': case '7': case '8': case '9':
4123 I32 num = atoi(RExC_parse);
4125 if (num > 9 && num >= RExC_npar)
4128 char * parse_start = RExC_parse - 1; /* MJD */
4129 while (isDIGIT(*RExC_parse))
4132 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
4133 vFAIL("Reference to nonexistent group");
4135 ret = reganode(pRExC_state,
4136 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
4140 /* override incorrect value set in reganode MJD */
4141 Set_Node_Offset(ret, parse_start+1);
4142 Set_Node_Cur_Length(ret); /* MJD */
4144 nextchar(pRExC_state);
4149 if (RExC_parse >= RExC_end)
4150 FAIL("Trailing \\");
4153 /* Do not generate `unrecognized' warnings here, we fall
4154 back into the quick-grab loop below */
4161 if (RExC_flags & PMf_EXTENDED) {
4162 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
4163 if (RExC_parse < RExC_end)
4169 register STRLEN len;
4175 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
4177 parse_start = RExC_parse - 1;
4183 ret = reg_node(pRExC_state,
4184 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
4186 for (len = 0, p = RExC_parse - 1;
4187 len < 127 && p < RExC_end;
4192 if (RExC_flags & PMf_EXTENDED)
4193 p = regwhite(p, RExC_end);
4240 ender = ASCII_TO_NATIVE('\033');
4244 ender = ASCII_TO_NATIVE('\007');
4249 char* e = strchr(p, '}');
4253 vFAIL("Missing right brace on \\x{}");
4256 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4257 | PERL_SCAN_DISALLOW_PREFIX;
4259 ender = grok_hex(p + 1, &numlen, &flags, NULL);
4266 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
4268 ender = grok_hex(p, &numlen, &flags, NULL);
4274 ender = UCHARAT(p++);
4275 ender = toCTRL(ender);
4277 case '0': case '1': case '2': case '3':case '4':
4278 case '5': case '6': case '7': case '8':case '9':
4280 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
4283 ender = grok_oct(p, &numlen, &flags, NULL);
4293 FAIL("Trailing \\");
4296 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
4297 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
4298 goto normal_default;
4303 if (UTF8_IS_START(*p) && UTF) {
4304 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
4312 if (RExC_flags & PMf_EXTENDED)
4313 p = regwhite(p, RExC_end);
4315 /* Prime the casefolded buffer. */
4316 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
4318 if (ISMULT2(p)) { /* Back off on ?+*. */
4325 /* Emit all the Unicode characters. */
4326 for (foldbuf = tmpbuf;
4328 foldlen -= numlen) {
4329 ender = utf8_to_uvchr(foldbuf, &numlen);
4331 reguni(pRExC_state, ender, s, &unilen);
4334 /* In EBCDIC the numlen
4335 * and unilen can differ. */
4337 if (numlen >= foldlen)
4341 break; /* "Can't happen." */
4345 reguni(pRExC_state, ender, s, &unilen);
4354 REGC((char)ender, s++);
4362 /* Emit all the Unicode characters. */
4363 for (foldbuf = tmpbuf;
4365 foldlen -= numlen) {
4366 ender = utf8_to_uvchr(foldbuf, &numlen);
4368 reguni(pRExC_state, ender, s, &unilen);
4371 /* In EBCDIC the numlen
4372 * and unilen can differ. */
4374 if (numlen >= foldlen)
4382 reguni(pRExC_state, ender, s, &unilen);
4391 REGC((char)ender, s++);
4395 Set_Node_Cur_Length(ret); /* MJD */
4396 nextchar(pRExC_state);
4398 /* len is STRLEN which is unsigned, need to copy to signed */
4401 vFAIL("Internal disaster");
4405 if (len == 1 && UNI_IS_INVARIANT(ender))
4410 RExC_size += STR_SZ(len);
4412 RExC_emit += STR_SZ(len);
4417 /* If the encoding pragma is in effect recode the text of
4418 * any EXACT-kind nodes. */
4419 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
4420 STRLEN oldlen = STR_LEN(ret);
4421 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
4425 if (sv_utf8_downgrade(sv, TRUE)) {
4426 char *s = sv_recode_to_utf8(sv, PL_encoding);
4427 STRLEN newlen = SvCUR(sv);
4432 GET_RE_DEBUG_FLAGS_DECL;
4433 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
4434 (int)oldlen, STRING(ret),
4436 Copy(s, STRING(ret), newlen, char);
4437 STR_LEN(ret) += newlen - oldlen;
4438 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
4440 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
4448 S_regwhite(pTHX_ char *p, const char *e)
4453 else if (*p == '#') {
4456 } while (p < e && *p != '\n');
4464 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
4465 Character classes ([:foo:]) can also be negated ([:^foo:]).
4466 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
4467 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
4468 but trigger failures because they are currently unimplemented. */
4470 #define POSIXCC_DONE(c) ((c) == ':')
4471 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
4472 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
4475 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
4478 I32 namedclass = OOB_NAMEDCLASS;
4480 if (value == '[' && RExC_parse + 1 < RExC_end &&
4481 /* I smell either [: or [= or [. -- POSIX has been here, right? */
4482 POSIXCC(UCHARAT(RExC_parse))) {
4483 char c = UCHARAT(RExC_parse);
4484 char* s = RExC_parse++;
4486 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
4488 if (RExC_parse == RExC_end)
4489 /* Grandfather lone [:, [=, [. */
4492 char* t = RExC_parse++; /* skip over the c */
4496 if (UCHARAT(RExC_parse) == ']') {
4497 RExC_parse++; /* skip over the ending ] */
4500 I32 complement = *posixcc == '^' ? *posixcc++ : 0;
4501 I32 skip = t - posixcc;
4503 /* Initially switch on the length of the name. */
4506 if (memEQ(posixcc, "word", 4)) {
4507 /* this is not POSIX, this is the Perl \w */;
4509 = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
4513 /* Names all of length 5. */
4514 /* alnum alpha ascii blank cntrl digit graph lower
4515 print punct space upper */
4516 /* Offset 4 gives the best switch position. */
4517 switch (posixcc[4]) {
4519 if (memEQ(posixcc, "alph", 4)) {
4522 = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
4526 if (memEQ(posixcc, "spac", 4)) {
4529 = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
4533 if (memEQ(posixcc, "grap", 4)) {
4536 = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
4540 if (memEQ(posixcc, "asci", 4)) {
4543 = complement ? ANYOF_NASCII : ANYOF_ASCII;
4547 if (memEQ(posixcc, "blan", 4)) {
4550 = complement ? ANYOF_NBLANK : ANYOF_BLANK;
4554 if (memEQ(posixcc, "cntr", 4)) {
4557 = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
4561 if (memEQ(posixcc, "alnu", 4)) {
4564 = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
4568 if (memEQ(posixcc, "lowe", 4)) {
4571 = complement ? ANYOF_NLOWER : ANYOF_LOWER;
4573 if (memEQ(posixcc, "uppe", 4)) {
4576 = complement ? ANYOF_NUPPER : ANYOF_UPPER;
4580 if (memEQ(posixcc, "digi", 4)) {
4583 = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
4585 if (memEQ(posixcc, "prin", 4)) {
4588 = complement ? ANYOF_NPRINT : ANYOF_PRINT;
4590 if (memEQ(posixcc, "punc", 4)) {
4593 = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
4599 if (memEQ(posixcc, "xdigit", 6)) {
4601 = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
4606 if (namedclass == OOB_NAMEDCLASS)
4608 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
4611 assert (posixcc[skip] == ':');
4612 assert (posixcc[skip+1] == ']');
4613 } else if (!SIZE_ONLY) {
4614 /* [[=foo=]] and [[.foo.]] are still future. */
4616 /* adjust RExC_parse so the warning shows after
4618 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
4620 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
4623 /* Maternal grandfather:
4624 * "[:" ending in ":" but not in ":]" */
4634 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
4636 if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
4637 char *s = RExC_parse;
4640 while(*s && isALNUM(*s))
4642 if (*s && c == *s && s[1] == ']') {
4643 if (ckWARN(WARN_REGEXP))
4645 "POSIX syntax [%c %c] belongs inside character classes",
4648 /* [[=foo=]] and [[.foo.]] are still future. */
4649 if (POSIXCC_NOTYET(c)) {
4650 /* adjust RExC_parse so the error shows after
4652 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
4654 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
4661 S_regclass(pTHX_ RExC_state_t *pRExC_state)
4664 register UV nextvalue;
4665 register IV prevvalue = OOB_UNICODE;
4666 register IV range = 0;
4667 register regnode *ret;
4670 char *rangebegin = 0;
4671 bool need_class = 0;
4672 SV *listsv = Nullsv;
4675 bool optimize_invert = TRUE;
4676 AV* unicode_alternate = 0;
4678 UV literal_endpoint = 0;
4681 ret = reganode(pRExC_state, ANYOF, 0);
4684 ANYOF_FLAGS(ret) = 0;
4686 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
4690 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
4694 RExC_size += ANYOF_SKIP;
4696 RExC_emit += ANYOF_SKIP;
4698 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
4700 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
4701 ANYOF_BITMAP_ZERO(ret);
4702 listsv = newSVpvn("# comment\n", 10);
4705 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
4707 if (!SIZE_ONLY && POSIXCC(nextvalue))
4708 checkposixcc(pRExC_state);
4710 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
4711 if (UCHARAT(RExC_parse) == ']')
4714 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
4718 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
4721 rangebegin = RExC_parse;
4723 value = utf8n_to_uvchr((U8*)RExC_parse,
4724 RExC_end - RExC_parse,
4726 RExC_parse += numlen;
4729 value = UCHARAT(RExC_parse++);
4730 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
4731 if (value == '[' && POSIXCC(nextvalue))
4732 namedclass = regpposixcc(pRExC_state, value);
4733 else if (value == '\\') {
4735 value = utf8n_to_uvchr((U8*)RExC_parse,
4736 RExC_end - RExC_parse,
4738 RExC_parse += numlen;
4741 value = UCHARAT(RExC_parse++);
4742 /* Some compilers cannot handle switching on 64-bit integer
4743 * values, therefore value cannot be an UV. Yes, this will
4744 * be a problem later if we want switch on Unicode.
4745 * A similar issue a little bit later when switching on
4746 * namedclass. --jhi */
4747 switch ((I32)value) {
4748 case 'w': namedclass = ANYOF_ALNUM; break;
4749 case 'W': namedclass = ANYOF_NALNUM; break;
4750 case 's': namedclass = ANYOF_SPACE; break;
4751 case 'S': namedclass = ANYOF_NSPACE; break;
4752 case 'd': namedclass = ANYOF_DIGIT; break;
4753 case 'D': namedclass = ANYOF_NDIGIT; break;
4756 if (RExC_parse >= RExC_end)
4757 vFAIL2("Empty \\%c{}", (U8)value);
4758 if (*RExC_parse == '{') {
4760 e = strchr(RExC_parse++, '}');
4762 vFAIL2("Missing right brace on \\%c{}", c);
4763 while (isSPACE(UCHARAT(RExC_parse)))
4765 if (e == RExC_parse)
4766 vFAIL2("Empty \\%c{}", c);
4768 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
4776 if (UCHARAT(RExC_parse) == '^') {
4779 value = value == 'p' ? 'P' : 'p'; /* toggle */
4780 while (isSPACE(UCHARAT(RExC_parse))) {
4786 Perl_sv_catpvf(aTHX_ listsv,
4787 "+utf8::%.*s\n", (int)n, RExC_parse);
4789 Perl_sv_catpvf(aTHX_ listsv,
4790 "!utf8::%.*s\n", (int)n, RExC_parse);
4793 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4794 namedclass = ANYOF_MAX; /* no official name, but it's named */
4796 case 'n': value = '\n'; break;
4797 case 'r': value = '\r'; break;
4798 case 't': value = '\t'; break;
4799 case 'f': value = '\f'; break;
4800 case 'b': value = '\b'; break;
4801 case 'e': value = ASCII_TO_NATIVE('\033');break;
4802 case 'a': value = ASCII_TO_NATIVE('\007');break;
4804 if (*RExC_parse == '{') {
4805 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4806 | PERL_SCAN_DISALLOW_PREFIX;
4807 e = strchr(RExC_parse++, '}');
4809 vFAIL("Missing right brace on \\x{}");
4811 numlen = e - RExC_parse;
4812 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
4816 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
4818 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
4819 RExC_parse += numlen;
4823 value = UCHARAT(RExC_parse++);
4824 value = toCTRL(value);
4826 case '0': case '1': case '2': case '3': case '4':
4827 case '5': case '6': case '7': case '8': case '9':
4831 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
4832 RExC_parse += numlen;
4836 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
4838 "Unrecognized escape \\%c in character class passed through",
4842 } /* end of \blah */
4848 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
4850 if (!SIZE_ONLY && !need_class)
4851 ANYOF_CLASS_ZERO(ret);
4855 /* a bad range like a-\d, a-[:digit:] ? */
4858 if (ckWARN(WARN_REGEXP))
4860 "False [] range \"%*.*s\"",
4861 RExC_parse - rangebegin,
4862 RExC_parse - rangebegin,
4864 if (prevvalue < 256) {
4865 ANYOF_BITMAP_SET(ret, prevvalue);
4866 ANYOF_BITMAP_SET(ret, '-');
4869 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4870 Perl_sv_catpvf(aTHX_ listsv,
4871 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
4875 range = 0; /* this was not a true range */
4879 const char *what = NULL;
4882 if (namedclass > OOB_NAMEDCLASS)
4883 optimize_invert = FALSE;
4884 /* Possible truncation here but in some 64-bit environments
4885 * the compiler gets heartburn about switch on 64-bit values.
4886 * A similar issue a little earlier when switching on value.
4888 switch ((I32)namedclass) {
4891 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
4893 for (value = 0; value < 256; value++)
4895 ANYOF_BITMAP_SET(ret, value);
4902 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
4904 for (value = 0; value < 256; value++)
4905 if (!isALNUM(value))
4906 ANYOF_BITMAP_SET(ret, value);
4913 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
4915 for (value = 0; value < 256; value++)
4916 if (isALNUMC(value))
4917 ANYOF_BITMAP_SET(ret, value);
4924 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
4926 for (value = 0; value < 256; value++)
4927 if (!isALNUMC(value))
4928 ANYOF_BITMAP_SET(ret, value);
4935 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
4937 for (value = 0; value < 256; value++)
4939 ANYOF_BITMAP_SET(ret, value);
4946 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
4948 for (value = 0; value < 256; value++)
4949 if (!isALPHA(value))
4950 ANYOF_BITMAP_SET(ret, value);
4957 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
4960 for (value = 0; value < 128; value++)
4961 ANYOF_BITMAP_SET(ret, value);
4963 for (value = 0; value < 256; value++) {
4965 ANYOF_BITMAP_SET(ret, value);
4974 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
4977 for (value = 128; value < 256; value++)
4978 ANYOF_BITMAP_SET(ret, value);
4980 for (value = 0; value < 256; value++) {
4981 if (!isASCII(value))
4982 ANYOF_BITMAP_SET(ret, value);
4991 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
4993 for (value = 0; value < 256; value++)
4995 ANYOF_BITMAP_SET(ret, value);
5002 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
5004 for (value = 0; value < 256; value++)
5005 if (!isBLANK(value))
5006 ANYOF_BITMAP_SET(ret, value);
5013 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
5015 for (value = 0; value < 256; value++)
5017 ANYOF_BITMAP_SET(ret, value);
5024 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
5026 for (value = 0; value < 256; value++)
5027 if (!isCNTRL(value))
5028 ANYOF_BITMAP_SET(ret, value);
5035 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
5037 /* consecutive digits assumed */
5038 for (value = '0'; value <= '9'; value++)
5039 ANYOF_BITMAP_SET(ret, value);
5046 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
5048 /* consecutive digits assumed */
5049 for (value = 0; value < '0'; value++)
5050 ANYOF_BITMAP_SET(ret, value);
5051 for (value = '9' + 1; value < 256; value++)
5052 ANYOF_BITMAP_SET(ret, value);
5059 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
5061 for (value = 0; value < 256; value++)
5063 ANYOF_BITMAP_SET(ret, value);
5070 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
5072 for (value = 0; value < 256; value++)
5073 if (!isGRAPH(value))
5074 ANYOF_BITMAP_SET(ret, value);
5081 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
5083 for (value = 0; value < 256; value++)
5085 ANYOF_BITMAP_SET(ret, value);
5092 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
5094 for (value = 0; value < 256; value++)
5095 if (!isLOWER(value))
5096 ANYOF_BITMAP_SET(ret, value);
5103 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
5105 for (value = 0; value < 256; value++)
5107 ANYOF_BITMAP_SET(ret, value);
5114 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
5116 for (value = 0; value < 256; value++)
5117 if (!isPRINT(value))
5118 ANYOF_BITMAP_SET(ret, value);
5125 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
5127 for (value = 0; value < 256; value++)
5128 if (isPSXSPC(value))
5129 ANYOF_BITMAP_SET(ret, value);
5136 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
5138 for (value = 0; value < 256; value++)
5139 if (!isPSXSPC(value))
5140 ANYOF_BITMAP_SET(ret, value);
5147 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
5149 for (value = 0; value < 256; value++)
5151 ANYOF_BITMAP_SET(ret, value);
5158 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
5160 for (value = 0; value < 256; value++)
5161 if (!isPUNCT(value))
5162 ANYOF_BITMAP_SET(ret, value);
5169 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
5171 for (value = 0; value < 256; value++)
5173 ANYOF_BITMAP_SET(ret, value);
5180 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
5182 for (value = 0; value < 256; value++)
5183 if (!isSPACE(value))
5184 ANYOF_BITMAP_SET(ret, value);
5191 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
5193 for (value = 0; value < 256; value++)
5195 ANYOF_BITMAP_SET(ret, value);
5202 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
5204 for (value = 0; value < 256; value++)
5205 if (!isUPPER(value))
5206 ANYOF_BITMAP_SET(ret, value);
5213 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
5215 for (value = 0; value < 256; value++)
5216 if (isXDIGIT(value))
5217 ANYOF_BITMAP_SET(ret, value);
5224 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
5226 for (value = 0; value < 256; value++)
5227 if (!isXDIGIT(value))
5228 ANYOF_BITMAP_SET(ret, value);
5234 /* this is to handle \p and \P */
5237 vFAIL("Invalid [::] class");
5241 /* Strings such as "+utf8::isWord\n" */
5242 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
5245 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
5248 } /* end of namedclass \blah */
5251 if (prevvalue > (IV)value) /* b-a */ {
5252 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
5253 RExC_parse - rangebegin,
5254 RExC_parse - rangebegin,
5256 range = 0; /* not a valid range */
5260 prevvalue = value; /* save the beginning of the range */
5261 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
5262 RExC_parse[1] != ']') {
5265 /* a bad range like \w-, [:word:]- ? */
5266 if (namedclass > OOB_NAMEDCLASS) {
5267 if (ckWARN(WARN_REGEXP))
5269 "False [] range \"%*.*s\"",
5270 RExC_parse - rangebegin,
5271 RExC_parse - rangebegin,
5274 ANYOF_BITMAP_SET(ret, '-');
5276 range = 1; /* yeah, it's a range! */
5277 continue; /* but do it the next time */
5281 /* now is the next time */
5285 if (prevvalue < 256) {
5286 IV ceilvalue = value < 256 ? value : 255;
5289 /* In EBCDIC [\x89-\x91] should include
5290 * the \x8e but [i-j] should not. */
5291 if (literal_endpoint == 2 &&
5292 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
5293 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
5295 if (isLOWER(prevvalue)) {
5296 for (i = prevvalue; i <= ceilvalue; i++)
5298 ANYOF_BITMAP_SET(ret, i);
5300 for (i = prevvalue; i <= ceilvalue; i++)
5302 ANYOF_BITMAP_SET(ret, i);
5307 for (i = prevvalue; i <= ceilvalue; i++)
5308 ANYOF_BITMAP_SET(ret, i);
5310 if (value > 255 || UTF) {
5311 UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
5312 UV natvalue = NATIVE_TO_UNI(value);
5314 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
5315 if (prevnatvalue < natvalue) { /* what about > ? */
5316 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
5317 prevnatvalue, natvalue);
5319 else if (prevnatvalue == natvalue) {
5320 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
5322 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
5324 UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
5326 /* If folding and foldable and a single
5327 * character, insert also the folded version
5328 * to the charclass. */
5330 if (foldlen == (STRLEN)UNISKIP(f))
5331 Perl_sv_catpvf(aTHX_ listsv,
5334 /* Any multicharacter foldings
5335 * require the following transform:
5336 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
5337 * where E folds into "pq" and F folds
5338 * into "rst", all other characters
5339 * fold to single characters. We save
5340 * away these multicharacter foldings,
5341 * to be later saved as part of the
5342 * additional "s" data. */
5345 if (!unicode_alternate)
5346 unicode_alternate = newAV();
5347 sv = newSVpvn((char*)foldbuf, foldlen);
5349 av_push(unicode_alternate, sv);
5353 /* If folding and the value is one of the Greek
5354 * sigmas insert a few more sigmas to make the
5355 * folding rules of the sigmas to work right.
5356 * Note that not all the possible combinations
5357 * are handled here: some of them are handled
5358 * by the standard folding rules, and some of
5359 * them (literal or EXACTF cases) are handled
5360 * during runtime in regexec.c:S_find_byclass(). */
5361 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
5362 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5363 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
5364 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5365 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
5367 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
5368 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5369 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
5374 literal_endpoint = 0;
5378 range = 0; /* this range (if it was one) is done now */
5382 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
5384 RExC_size += ANYOF_CLASS_ADD_SKIP;
5386 RExC_emit += ANYOF_CLASS_ADD_SKIP;
5389 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
5391 /* If the only flag is folding (plus possibly inversion). */
5392 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
5394 for (value = 0; value < 256; ++value) {
5395 if (ANYOF_BITMAP_TEST(ret, value)) {
5396 UV fold = PL_fold[value];
5399 ANYOF_BITMAP_SET(ret, fold);
5402 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
5405 /* optimize inverted simple patterns (e.g. [^a-z]) */
5406 if (!SIZE_ONLY && optimize_invert &&
5407 /* If the only flag is inversion. */
5408 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
5409 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
5410 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
5411 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
5418 /* The 0th element stores the character class description
5419 * in its textual form: used later (regexec.c:Perl_regclass_swash())
5420 * to initialize the appropriate swash (which gets stored in
5421 * the 1st element), and also useful for dumping the regnode.
5422 * The 2nd element stores the multicharacter foldings,
5423 * used later (regexec.c:S_reginclass()). */
5424 av_store(av, 0, listsv);
5425 av_store(av, 1, NULL);
5426 av_store(av, 2, (SV*)unicode_alternate);
5427 rv = newRV_noinc((SV*)av);
5428 n = add_data(pRExC_state, 1, "s");
5429 RExC_rx->data->data[n] = (void*)rv;
5437 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
5439 char* retval = RExC_parse++;
5442 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
5443 RExC_parse[2] == '#') {
5444 while (*RExC_parse != ')') {
5445 if (RExC_parse == RExC_end)
5446 FAIL("Sequence (?#... not terminated");
5452 if (RExC_flags & PMf_EXTENDED) {
5453 if (isSPACE(*RExC_parse)) {
5457 else if (*RExC_parse == '#') {
5458 while (RExC_parse < RExC_end)
5459 if (*RExC_parse++ == '\n') break;
5468 - reg_node - emit a node
5470 STATIC regnode * /* Location. */
5471 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
5473 register regnode *ptr;
5474 regnode * const ret = RExC_emit;
5477 SIZE_ALIGN(RExC_size);
5482 NODE_ALIGN_FILL(ret);
5484 FILL_ADVANCE_NODE(ptr, op);
5485 if (RExC_offsets) { /* MJD */
5486 MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
5487 "reg_node", __LINE__,
5489 RExC_emit - RExC_emit_start > RExC_offsets[0]
5490 ? "Overwriting end of array!\n" : "OK",
5491 RExC_emit - RExC_emit_start,
5492 RExC_parse - RExC_start,
5494 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
5503 - reganode - emit a node with an argument
5505 STATIC regnode * /* Location. */
5506 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
5508 register regnode *ptr;
5509 regnode * const ret = RExC_emit;
5512 SIZE_ALIGN(RExC_size);
5517 NODE_ALIGN_FILL(ret);
5519 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
5520 if (RExC_offsets) { /* MJD */
5521 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
5525 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
5526 "Overwriting end of array!\n" : "OK",
5527 RExC_emit - RExC_emit_start,
5528 RExC_parse - RExC_start,
5530 Set_Cur_Node_Offset;
5539 - reguni - emit (if appropriate) a Unicode character
5542 S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
5544 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
5548 - reginsert - insert an operator in front of already-emitted operand
5550 * Means relocating the operand.
5553 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
5555 register regnode *src;
5556 register regnode *dst;
5557 register regnode *place;
5558 const int offset = regarglen[(U8)op];
5560 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
5563 RExC_size += NODE_STEP_REGNODE + offset;
5568 RExC_emit += NODE_STEP_REGNODE + offset;
5570 while (src > opnd) {
5571 StructCopy(--src, --dst, regnode);
5572 if (RExC_offsets) { /* MJD 20010112 */
5573 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
5577 dst - RExC_emit_start > RExC_offsets[0]
5578 ? "Overwriting end of array!\n" : "OK",
5579 src - RExC_emit_start,
5580 dst - RExC_emit_start,
5582 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
5583 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
5588 place = opnd; /* Op node, where operand used to be. */
5589 if (RExC_offsets) { /* MJD */
5590 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
5594 place - RExC_emit_start > RExC_offsets[0]
5595 ? "Overwriting end of array!\n" : "OK",
5596 place - RExC_emit_start,
5597 RExC_parse - RExC_start,
5599 Set_Node_Offset(place, RExC_parse);
5600 Set_Node_Length(place, 1);
5602 src = NEXTOPER(place);
5603 FILL_ADVANCE_NODE(place, op);
5604 Zero(src, offset, regnode);
5608 - regtail - set the next-pointer at the end of a node chain of p to val.
5611 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
5613 register regnode *scan;
5618 /* Find last node. */
5621 regnode * const temp = regnext(scan);
5627 if (reg_off_by_arg[OP(scan)]) {
5628 ARG_SET(scan, val - scan);
5631 NEXT_OFF(scan) = val - scan;
5636 - regoptail - regtail on operand of first argument; nop if operandless
5639 S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
5641 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
5642 if (p == NULL || SIZE_ONLY)
5644 if (PL_regkind[(U8)OP(p)] == BRANCH) {
5645 regtail(pRExC_state, NEXTOPER(p), val);
5647 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
5648 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
5655 - regcurly - a little FSA that accepts {\d+,?\d*}
5658 S_regcurly(pTHX_ register const char *s)
5679 S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
5681 register U8 op = EXACT; /* Arbitrary non-END op. */
5682 register regnode *next;
5684 while (op != END && (!last || node < last)) {
5685 /* While that wasn't END last time... */
5691 next = regnext(node);
5693 if (OP(node) == OPTIMIZED)
5696 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
5697 (int)(2*l + 1), "", SvPVX(sv));
5698 if (next == NULL) /* Next ptr. */
5699 PerlIO_printf(Perl_debug_log, "(0)");
5701 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
5702 (void)PerlIO_putc(Perl_debug_log, '\n');
5704 if (PL_regkind[(U8)op] == BRANCHJ) {
5705 register regnode *nnode = (OP(next) == LONGJMP
5708 if (last && nnode > last)
5710 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
5712 else if (PL_regkind[(U8)op] == BRANCH) {
5713 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
5715 else if ( PL_regkind[(U8)op] == TRIE ) {
5716 const I32 n = ARG(node);
5717 const reg_trie_data *trie = (reg_trie_data*)PL_regdata->data[n];
5718 const I32 arry_len = av_len(trie->words)+1;
5720 PerlIO_printf(Perl_debug_log,
5721 "%*s[Words:%d Chars Stored:%d Unique Chars:%d States:%"IVdf"%s]\n",
5725 (int)trie->charcount,
5726 trie->uniquecharcount,
5727 (IV)trie->laststate-1,
5728 node->flags ? " EVAL mode" : "");
5730 for (word_idx=0; word_idx < arry_len; word_idx++) {
5731 SV **elem_ptr=av_fetch(trie->words,word_idx,0);
5733 PerlIO_printf(Perl_debug_log, "%*s<%s%s%s>\n",
5736 SvPV_nolen(*elem_ptr),
5741 PerlIO_printf(Perl_debug_log, "(0)\n");
5743 PerlIO_printf(Perl_debug_log, "(%"IVdf")\n", (IV)(next - start));
5749 node = NEXTOPER(node);
5750 node += regarglen[(U8)op];
5753 else if ( op == CURLY) { /* `next' might be very big: optimizer */
5754 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
5755 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
5757 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
5758 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
5761 else if ( op == PLUS || op == STAR) {
5762 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
5764 else if (op == ANYOF) {
5765 /* arglen 1 + class block */
5766 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
5767 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
5768 node = NEXTOPER(node);
5770 else if (PL_regkind[(U8)op] == EXACT) {
5771 /* Literal string, where present. */
5772 node += NODE_SZ_STR(node) - 1;
5773 node = NEXTOPER(node);
5776 node = NEXTOPER(node);
5777 node += regarglen[(U8)op];
5779 if (op == CURLYX || op == OPEN)
5781 else if (op == WHILEM)
5787 #endif /* DEBUGGING */
5790 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
5793 Perl_regdump(pTHX_ regexp *r)
5796 SV *sv = sv_newmortal();
5798 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
5800 /* Header fields of interest. */
5801 if (r->anchored_substr)
5802 PerlIO_printf(Perl_debug_log,
5803 "anchored `%s%.*s%s'%s at %"IVdf" ",
5805 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
5806 SvPVX(r->anchored_substr),
5808 SvTAIL(r->anchored_substr) ? "$" : "",
5809 (IV)r->anchored_offset);
5810 else if (r->anchored_utf8)
5811 PerlIO_printf(Perl_debug_log,
5812 "anchored utf8 `%s%.*s%s'%s at %"IVdf" ",
5814 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
5815 SvPVX(r->anchored_utf8),
5817 SvTAIL(r->anchored_utf8) ? "$" : "",
5818 (IV)r->anchored_offset);
5819 if (r->float_substr)
5820 PerlIO_printf(Perl_debug_log,
5821 "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
5823 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
5824 SvPVX(r->float_substr),
5826 SvTAIL(r->float_substr) ? "$" : "",
5827 (IV)r->float_min_offset, (UV)r->float_max_offset);
5828 else if (r->float_utf8)
5829 PerlIO_printf(Perl_debug_log,
5830 "floating utf8 `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
5832 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
5833 SvPVX(r->float_utf8),
5835 SvTAIL(r->float_utf8) ? "$" : "",
5836 (IV)r->float_min_offset, (UV)r->float_max_offset);
5837 if (r->check_substr || r->check_utf8)
5838 PerlIO_printf(Perl_debug_log,
5839 r->check_substr == r->float_substr
5840 && r->check_utf8 == r->float_utf8
5841 ? "(checking floating" : "(checking anchored");
5842 if (r->reganch & ROPT_NOSCAN)
5843 PerlIO_printf(Perl_debug_log, " noscan");
5844 if (r->reganch & ROPT_CHECK_ALL)
5845 PerlIO_printf(Perl_debug_log, " isall");
5846 if (r->check_substr || r->check_utf8)
5847 PerlIO_printf(Perl_debug_log, ") ");
5849 if (r->regstclass) {
5850 regprop(sv, r->regstclass);
5851 PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
5853 if (r->reganch & ROPT_ANCH) {
5854 PerlIO_printf(Perl_debug_log, "anchored");
5855 if (r->reganch & ROPT_ANCH_BOL)
5856 PerlIO_printf(Perl_debug_log, "(BOL)");
5857 if (r->reganch & ROPT_ANCH_MBOL)
5858 PerlIO_printf(Perl_debug_log, "(MBOL)");
5859 if (r->reganch & ROPT_ANCH_SBOL)
5860 PerlIO_printf(Perl_debug_log, "(SBOL)");
5861 if (r->reganch & ROPT_ANCH_GPOS)
5862 PerlIO_printf(Perl_debug_log, "(GPOS)");
5863 PerlIO_putc(Perl_debug_log, ' ');
5865 if (r->reganch & ROPT_GPOS_SEEN)
5866 PerlIO_printf(Perl_debug_log, "GPOS ");
5867 if (r->reganch & ROPT_SKIP)
5868 PerlIO_printf(Perl_debug_log, "plus ");
5869 if (r->reganch & ROPT_IMPLICIT)
5870 PerlIO_printf(Perl_debug_log, "implicit ");
5871 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
5872 if (r->reganch & ROPT_EVAL_SEEN)
5873 PerlIO_printf(Perl_debug_log, "with eval ");
5874 PerlIO_printf(Perl_debug_log, "\n");
5877 const U32 len = r->offsets[0];
5878 GET_RE_DEBUG_FLAGS_DECL;
5880 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
5881 for (i = 1; i <= len; i++)
5882 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
5883 (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
5884 PerlIO_printf(Perl_debug_log, "\n");
5887 #endif /* DEBUGGING */
5893 S_put_byte(pTHX_ SV *sv, int c)
5895 if (isCNTRL(c) || c == 255 || !isPRINT(c))
5896 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
5897 else if (c == '-' || c == ']' || c == '\\' || c == '^')
5898 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
5900 Perl_sv_catpvf(aTHX_ sv, "%c", c);
5903 #endif /* DEBUGGING */
5907 - regprop - printable representation of opcode
5910 Perl_regprop(pTHX_ SV *sv, const regnode *o)
5915 sv_setpvn(sv, "", 0);
5916 if (OP(o) >= reg_num) /* regnode.type is unsigned */
5917 /* It would be nice to FAIL() here, but this may be called from
5918 regexec.c, and it would be hard to supply pRExC_state. */
5919 Perl_croak(aTHX_ "Corrupted regexp opcode");
5920 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
5922 k = PL_regkind[(U8)OP(o)];
5925 SV *dsv = sv_2mortal(newSVpvn("", 0));
5926 /* Using is_utf8_string() is a crude hack but it may
5927 * be the best for now since we have no flag "this EXACTish
5928 * node was UTF-8" --jhi */
5929 bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
5931 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
5932 UNI_DISPLAY_REGEX) :
5934 const int len = do_utf8 ?
5937 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
5941 } else if (k == TRIE) {/*
5942 this isn't always safe, as Pl_regdata may not be for this regex yet
5943 (depending on where its called from) so its being moved to dumpuntil
5945 reg_trie_data *trie=(reg_trie_data*)PL_regdata->data[n];
5946 Perl_sv_catpvf(aTHX_ sv, " (W:%d L:%d C:%d S:%d)",
5949 trie->uniquecharcount,
5952 } else if (k == CURLY) {
5953 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
5954 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
5955 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
5957 else if (k == WHILEM && o->flags) /* Ordinal/of */
5958 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
5959 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
5960 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
5961 else if (k == LOGICAL)
5962 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
5963 else if (k == ANYOF) {
5964 int i, rangestart = -1;
5965 U8 flags = ANYOF_FLAGS(o);
5966 const char * const anyofs[] = { /* Should be synchronized with
5967 * ANYOF_ #xdefines in regcomp.h */
6000 if (flags & ANYOF_LOCALE)
6001 sv_catpv(sv, "{loc}");
6002 if (flags & ANYOF_FOLD)
6003 sv_catpv(sv, "{i}");
6004 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
6005 if (flags & ANYOF_INVERT)
6007 for (i = 0; i <= 256; i++) {
6008 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
6009 if (rangestart == -1)
6011 } else if (rangestart != -1) {
6012 if (i <= rangestart + 3)
6013 for (; rangestart < i; rangestart++)
6014 put_byte(sv, rangestart);
6016 put_byte(sv, rangestart);
6018 put_byte(sv, i - 1);
6024 if (o->flags & ANYOF_CLASS)
6025 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
6026 if (ANYOF_CLASS_TEST(o,i))
6027 sv_catpv(sv, anyofs[i]);
6029 if (flags & ANYOF_UNICODE)
6030 sv_catpv(sv, "{unicode}");
6031 else if (flags & ANYOF_UNICODE_ALL)
6032 sv_catpv(sv, "{unicode_all}");
6036 SV *sw = regclass_swash(o, FALSE, &lv, 0);
6040 U8 s[UTF8_MAXBYTES_CASE+1];
6042 for (i = 0; i <= 256; i++) { /* just the first 256 */
6043 U8 *e = uvchr_to_utf8(s, i);
6045 if (i < 256 && swash_fetch(sw, s, TRUE)) {
6046 if (rangestart == -1)
6048 } else if (rangestart != -1) {
6051 if (i <= rangestart + 3)
6052 for (; rangestart < i; rangestart++) {
6053 for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
6057 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
6060 for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
6067 sv_catpv(sv, "..."); /* et cetera */
6071 char *s = savesvpv(lv);
6074 while(*s && *s != '\n') s++;
6095 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
6097 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
6098 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
6099 #endif /* DEBUGGING */
6103 Perl_re_intuit_string(pTHX_ regexp *prog)
6104 { /* Assume that RE_INTUIT is set */
6105 GET_RE_DEBUG_FLAGS_DECL;
6108 const char *s = SvPV(prog->check_substr
6109 ? prog->check_substr : prog->check_utf8, n_a);
6111 if (!PL_colorset) reginitcolors();
6112 PerlIO_printf(Perl_debug_log,
6113 "%sUsing REx %ssubstr:%s `%s%.60s%s%s'\n",
6115 prog->check_substr ? "" : "utf8 ",
6116 PL_colors[5],PL_colors[0],
6119 (strlen(s) > 60 ? "..." : ""));
6122 return prog->check_substr ? prog->check_substr : prog->check_utf8;
6126 Perl_pregfree(pTHX_ struct regexp *r)
6130 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
6131 SV *re_debug_flags=get_sv(RE_DEBUG_FLAGS,0);
6135 if (!r || (--r->refcnt > 0))
6137 DEBUG_r(if (re_debug_flags && (SvIV(re_debug_flags) & RE_DEBUG_COMPILE)) {
6138 const char *s = (r->reganch & ROPT_UTF8)
6139 ? pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60, UNI_DISPLAY_REGEX)
6140 : pv_display(dsv, r->precomp, r->prelen, 0, 60);
6141 const int len = SvCUR(dsv);
6144 PerlIO_printf(Perl_debug_log,
6145 "%sFreeing REx:%s %s%*.*s%s%s\n",
6146 PL_colors[4],PL_colors[5],PL_colors[0],
6149 len > 60 ? "..." : "");
6153 Safefree(r->precomp);
6154 if (r->offsets) /* 20010421 MJD */
6155 Safefree(r->offsets);
6156 RX_MATCH_COPY_FREE(r);
6157 #ifdef PERL_COPY_ON_WRITE
6159 SvREFCNT_dec(r->saved_copy);
6162 if (r->anchored_substr)
6163 SvREFCNT_dec(r->anchored_substr);
6164 if (r->anchored_utf8)
6165 SvREFCNT_dec(r->anchored_utf8);
6166 if (r->float_substr)
6167 SvREFCNT_dec(r->float_substr);
6169 SvREFCNT_dec(r->float_utf8);
6170 Safefree(r->substrs);
6173 int n = r->data->count;
6174 PAD* new_comppad = NULL;
6179 /* If you add a ->what type here, update the comment in regcomp.h */
6180 switch (r->data->what[n]) {
6182 SvREFCNT_dec((SV*)r->data->data[n]);
6185 Safefree(r->data->data[n]);
6188 new_comppad = (AV*)r->data->data[n];
6191 if (new_comppad == NULL)
6192 Perl_croak(aTHX_ "panic: pregfree comppad");
6193 PAD_SAVE_LOCAL(old_comppad,
6194 /* Watch out for global destruction's random ordering. */
6195 (SvTYPE(new_comppad) == SVt_PVAV) ?
6196 new_comppad : Null(PAD *)
6199 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
6202 op_free((OP_4tree*)r->data->data[n]);
6204 PAD_RESTORE_LOCAL(old_comppad);
6205 SvREFCNT_dec((SV*)new_comppad);
6212 reg_trie_data *trie=(reg_trie_data*)r->data->data[n];
6215 refcount = trie->refcount--;
6219 Safefree(trie->charmap);
6220 if (trie->widecharmap)
6221 SvREFCNT_dec((SV*)trie->widecharmap);
6223 Safefree(trie->states);
6225 Safefree(trie->trans);
6228 SvREFCNT_dec((SV*)trie->words);
6229 if (trie->revcharmap)
6230 SvREFCNT_dec((SV*)trie->revcharmap);
6232 Safefree(r->data->data[n]); /* do this last!!!! */
6237 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
6240 Safefree(r->data->what);
6243 Safefree(r->startp);
6249 - regnext - dig the "next" pointer out of a node
6252 Perl_regnext(pTHX_ register regnode *p)
6254 register I32 offset;
6256 if (p == &PL_regdummy)
6259 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
6267 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
6270 STRLEN l1 = strlen(pat1);
6271 STRLEN l2 = strlen(pat2);
6274 const char *message;
6280 Copy(pat1, buf, l1 , char);
6281 Copy(pat2, buf + l1, l2 , char);
6282 buf[l1 + l2] = '\n';
6283 buf[l1 + l2 + 1] = '\0';
6285 /* ANSI variant takes additional second argument */
6286 va_start(args, pat2);
6290 msv = vmess(buf, &args);
6292 message = SvPV(msv,l1);
6295 Copy(message, buf, l1 , char);
6296 buf[l1-1] = '\0'; /* Overwrite \n */
6297 Perl_croak(aTHX_ "%s", buf);
6300 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
6303 Perl_save_re_context(pTHX)
6305 SAVEI32(PL_reg_flags); /* from regexec.c */
6307 SAVEPPTR(PL_reginput); /* String-input pointer. */
6308 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
6309 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
6310 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
6311 SAVEVPTR(PL_regendp); /* Ditto for endp. */
6312 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
6313 SAVEVPTR(PL_reglastcloseparen); /* Similarly for lastcloseparen. */
6314 SAVEPPTR(PL_regtill); /* How far we are required to go. */
6315 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
6316 PL_reg_start_tmp = 0;
6317 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
6318 PL_reg_start_tmpl = 0;
6319 SAVEVPTR(PL_regdata);
6320 SAVEI32(PL_reg_eval_set); /* from regexec.c */
6321 SAVEI32(PL_regnarrate); /* from regexec.c */
6322 SAVEVPTR(PL_regprogram); /* from regexec.c */
6323 SAVEINT(PL_regindent); /* from regexec.c */
6324 SAVEVPTR(PL_regcc); /* from regexec.c */
6325 SAVEVPTR(PL_curcop);
6326 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
6327 SAVEVPTR(PL_reg_re); /* from regexec.c */
6328 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
6329 SAVESPTR(PL_reg_sv); /* from regexec.c */
6330 SAVEBOOL(PL_reg_match_utf8); /* from regexec.c */
6331 SAVEVPTR(PL_reg_magic); /* from regexec.c */
6332 SAVEI32(PL_reg_oldpos); /* from regexec.c */
6333 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
6334 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
6335 SAVEPPTR(PL_reg_oldsaved); /* old saved substr during match */
6336 PL_reg_oldsaved = Nullch;
6337 SAVEI32(PL_reg_oldsavedlen); /* old length of saved substr during match */
6338 PL_reg_oldsavedlen = 0;
6339 #ifdef PERL_COPY_ON_WRITE
6343 SAVEI32(PL_reg_maxiter); /* max wait until caching pos */
6345 SAVEI32(PL_reg_leftiter); /* wait until caching pos */
6346 PL_reg_leftiter = 0;
6347 SAVEGENERICPV(PL_reg_poscache); /* cache of pos of WHILEM */
6348 PL_reg_poscache = Nullch;
6349 SAVEI32(PL_reg_poscache_size); /* size of pos cache of WHILEM */
6350 PL_reg_poscache_size = 0;
6351 SAVEPPTR(PL_regprecomp); /* uncompiled string. */
6352 SAVEI32(PL_regnpar); /* () count. */
6353 SAVEI32(PL_regsize); /* from regexec.c */
6356 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
6360 char digits[TYPE_CHARS(long)];
6362 if (PL_curpm && (rx = PM_GETRE(PL_curpm))) {
6363 for (i = 1; i <= rx->nparens; i++) {
6364 sprintf(digits, "%lu", (long)i);
6365 if ((mgv = gv_fetchpv(digits, FALSE, SVt_PV)))
6372 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
6377 clear_re(pTHX_ void *r)
6379 ReREFCNT_dec((regexp *)r);
6384 * c-indentation-style: bsd
6386 * indent-tabs-mode: t
6389 * ex: set ts=8 sts=4 sw=4 noet: