5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
8 /* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
33 /* need to replace pregcomp et al, so enable that */
34 # ifndef PERL_IN_XSUB_RE
35 # define PERL_IN_XSUB_RE
37 /* need access to debugger hooks */
38 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
43 #ifdef PERL_IN_XSUB_RE
44 /* We *really* need to overwrite these symbols: */
45 # define Perl_pregcomp my_regcomp
46 # define Perl_regdump my_regdump
47 # define Perl_regprop my_regprop
48 # define Perl_pregfree my_regfree
49 # define Perl_re_intuit_string my_re_intuit_string
50 /* *These* symbols are masked to allow static link. */
51 # define Perl_regnext my_regnext
52 # define Perl_save_re_context my_save_re_context
53 # define Perl_reginitcolors my_reginitcolors
55 # define PERL_NO_GET_CONTEXT
60 * pregcomp and pregexec -- regsub and regerror are not used in perl
62 * Copyright (c) 1986 by University of Toronto.
63 * Written by Henry Spencer. Not derived from licensed software.
65 * Permission is granted to anyone to use this software for any
66 * purpose on any computer system, and to redistribute it freely,
67 * subject to the following restrictions:
69 * 1. The author is not responsible for the consequences of use of
70 * this software, no matter how awful, even if they arise
73 * 2. The origin of this software must not be misrepresented, either
74 * by explicit claim or by omission.
76 * 3. Altered versions must be plainly marked as such, and must not
77 * be misrepresented as being the original software.
80 **** Alterations to Henry's code are...
82 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
83 **** 2000, 2001, 2002, 2003, 2004, 2005, by Larry Wall and others
85 **** You may distribute under the terms of either the GNU General Public
86 **** License or the Artistic License, as specified in the README file.
89 * Beware that some of this code is subtly aware of the way operator
90 * precedence is structured in regular expressions. Serious changes in
91 * regular-expression syntax might require a total rethink.
94 #define PERL_IN_REGCOMP_C
97 #ifndef PERL_IN_XSUB_RE
109 # if defined(BUGGY_MSC6)
110 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
111 # pragma optimize("a",off)
112 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
113 # pragma optimize("w",on )
114 # endif /* BUGGY_MSC6 */
118 #define STATIC static
121 typedef struct RExC_state_t {
122 U32 flags; /* are we folding, multilining? */
123 char *precomp; /* uncompiled string. */
125 char *start; /* Start of input for compile */
126 char *end; /* End of input for compile */
127 char *parse; /* Input-scan pointer. */
128 I32 whilem_seen; /* number of WHILEM in this expr */
129 regnode *emit_start; /* Start of emitted-code area */
130 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
131 I32 naughty; /* How bad is this pattern? */
132 I32 sawback; /* Did we see \1, ...? */
134 I32 size; /* Code size. */
135 I32 npar; /* () count. */
141 char *starttry; /* -Dr: where regtry was called. */
142 #define RExC_starttry (pRExC_state->starttry)
146 #define RExC_flags (pRExC_state->flags)
147 #define RExC_precomp (pRExC_state->precomp)
148 #define RExC_rx (pRExC_state->rx)
149 #define RExC_start (pRExC_state->start)
150 #define RExC_end (pRExC_state->end)
151 #define RExC_parse (pRExC_state->parse)
152 #define RExC_whilem_seen (pRExC_state->whilem_seen)
153 #define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
154 #define RExC_emit (pRExC_state->emit)
155 #define RExC_emit_start (pRExC_state->emit_start)
156 #define RExC_naughty (pRExC_state->naughty)
157 #define RExC_sawback (pRExC_state->sawback)
158 #define RExC_seen (pRExC_state->seen)
159 #define RExC_size (pRExC_state->size)
160 #define RExC_npar (pRExC_state->npar)
161 #define RExC_extralen (pRExC_state->extralen)
162 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
163 #define RExC_seen_evals (pRExC_state->seen_evals)
164 #define RExC_utf8 (pRExC_state->utf8)
166 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
167 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
168 ((*s) == '{' && regcurly(s)))
171 #undef SPSTART /* dratted cpp namespace... */
174 * Flags to be passed up and down.
176 #define WORST 0 /* Worst case. */
177 #define HASWIDTH 0x1 /* Known to match non-null strings. */
178 #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
179 #define SPSTART 0x4 /* Starts with * or +. */
180 #define TRYAGAIN 0x8 /* Weeded out a declaration. */
182 /* Length of a variant. */
184 typedef struct scan_data_t {
190 I32 last_end; /* min value, <0 unless valid. */
193 SV **longest; /* Either &l_fixed, or &l_float. */
197 I32 offset_float_min;
198 I32 offset_float_max;
202 struct regnode_charclass_class *start_class;
206 * Forward declarations for pregcomp()'s friends.
209 static const scan_data_t zero_scan_data =
210 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
212 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
213 #define SF_BEFORE_SEOL 0x1
214 #define SF_BEFORE_MEOL 0x2
215 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
216 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
219 # define SF_FIX_SHIFT_EOL (0+2)
220 # define SF_FL_SHIFT_EOL (0+4)
222 # define SF_FIX_SHIFT_EOL (+2)
223 # define SF_FL_SHIFT_EOL (+4)
226 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
227 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
229 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
230 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
231 #define SF_IS_INF 0x40
232 #define SF_HAS_PAR 0x80
233 #define SF_IN_PAR 0x100
234 #define SF_HAS_EVAL 0x200
235 #define SCF_DO_SUBSTR 0x400
236 #define SCF_DO_STCLASS_AND 0x0800
237 #define SCF_DO_STCLASS_OR 0x1000
238 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
239 #define SCF_WHILEM_VISITED_POS 0x2000
241 #define UTF (RExC_utf8 != 0)
242 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
243 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
245 #define OOB_UNICODE 12345678
246 #define OOB_NAMEDCLASS -1
248 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
249 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
252 /* length of regex to show in messages that don't mark a position within */
253 #define RegexLengthToShowInErrorMessages 127
256 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
257 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
258 * op/pragma/warn/regcomp.
260 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
261 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
263 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
266 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
267 * arg. Show regex, up to a maximum length. If it's too long, chop and add
270 #define FAIL(msg) STMT_START { \
271 const char *ellipses = ""; \
272 IV len = RExC_end - RExC_precomp; \
275 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
276 if (len > RegexLengthToShowInErrorMessages) { \
277 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
278 len = RegexLengthToShowInErrorMessages - 10; \
281 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
282 msg, (int)len, RExC_precomp, ellipses); \
286 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
287 * args. Show regex, up to a maximum length. If it's too long, chop and add
290 #define FAIL2(pat,msg) STMT_START { \
291 const char *ellipses = ""; \
292 IV len = RExC_end - RExC_precomp; \
295 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
296 if (len > RegexLengthToShowInErrorMessages) { \
297 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
298 len = RegexLengthToShowInErrorMessages - 10; \
301 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
302 msg, (int)len, RExC_precomp, ellipses); \
307 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
309 #define Simple_vFAIL(m) STMT_START { \
310 IV offset = RExC_parse - RExC_precomp; \
311 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
312 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
316 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
318 #define vFAIL(m) STMT_START { \
320 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
325 * Like Simple_vFAIL(), but accepts two arguments.
327 #define Simple_vFAIL2(m,a1) STMT_START { \
328 IV offset = RExC_parse - RExC_precomp; \
329 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
330 (int)offset, RExC_precomp, RExC_precomp + offset); \
334 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
336 #define vFAIL2(m,a1) STMT_START { \
338 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
339 Simple_vFAIL2(m, a1); \
344 * Like Simple_vFAIL(), but accepts three arguments.
346 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
347 IV offset = RExC_parse - RExC_precomp; \
348 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
349 (int)offset, RExC_precomp, RExC_precomp + offset); \
353 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
355 #define vFAIL3(m,a1,a2) STMT_START { \
357 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
358 Simple_vFAIL3(m, a1, a2); \
362 * Like Simple_vFAIL(), but accepts four arguments.
364 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
365 IV offset = RExC_parse - RExC_precomp; \
366 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
367 (int)offset, RExC_precomp, RExC_precomp + offset); \
371 * Like Simple_vFAIL(), but accepts five arguments.
373 #define Simple_vFAIL5(m, a1, a2, a3, a4) STMT_START { \
374 IV offset = RExC_parse - RExC_precomp; \
375 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4, \
376 (int)offset, RExC_precomp, RExC_precomp + offset); \
380 #define vWARN(loc,m) STMT_START { \
381 IV offset = loc - RExC_precomp; \
382 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
383 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
386 #define vWARNdep(loc,m) STMT_START { \
387 IV offset = loc - RExC_precomp; \
388 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
389 "%s" REPORT_LOCATION, \
390 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
394 #define vWARN2(loc, m, a1) STMT_START { \
395 IV offset = loc - RExC_precomp; \
396 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
397 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
400 #define vWARN3(loc, m, a1, a2) STMT_START { \
401 IV offset = loc - RExC_precomp; \
402 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
403 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
406 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
407 IV offset = loc - RExC_precomp; \
408 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
409 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
412 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
413 IV offset = loc - RExC_precomp; \
414 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
415 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
419 /* Allow for side effects in s */
420 #define REGC(c,s) STMT_START { \
421 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
424 /* Macros for recording node offsets. 20001227 mjd@plover.com
425 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
426 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
427 * Element 0 holds the number n.
430 #define MJD_OFFSET_DEBUG(x)
431 /* #define MJD_OFFSET_DEBUG(x) DEBUG_r(Perl_warn_nocontext x) */
434 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
436 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
437 __LINE__, (node), (byte))); \
439 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
441 RExC_offsets[2*(node)-1] = (byte); \
446 #define Set_Node_Offset(node,byte) \
447 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
448 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
450 #define Set_Node_Length_To_R(node,len) STMT_START { \
452 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
453 __LINE__, (node), (len))); \
455 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
457 RExC_offsets[2*(node)] = (len); \
462 #define Set_Node_Length(node,len) \
463 Set_Node_Length_To_R((node)-RExC_emit_start, len)
464 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
465 #define Set_Node_Cur_Length(node) \
466 Set_Node_Length(node, RExC_parse - parse_start)
468 /* Get offsets and lengths */
469 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
470 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
472 static void clear_re(pTHX_ void *r);
474 /* Mark that we cannot extend a found fixed substring at this point.
475 Updata the longest found anchored substring and the longest found
476 floating substrings if needed. */
479 S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
481 const STRLEN l = CHR_SVLEN(data->last_found);
482 const STRLEN old_l = CHR_SVLEN(*data->longest);
484 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
485 SvSetMagicSV(*data->longest, data->last_found);
486 if (*data->longest == data->longest_fixed) {
487 data->offset_fixed = l ? data->last_start_min : data->pos_min;
488 if (data->flags & SF_BEFORE_EOL)
490 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
492 data->flags &= ~SF_FIX_BEFORE_EOL;
495 data->offset_float_min = l ? data->last_start_min : data->pos_min;
496 data->offset_float_max = (l
497 ? data->last_start_max
498 : data->pos_min + data->pos_delta);
499 if ((U32)data->offset_float_max > (U32)I32_MAX)
500 data->offset_float_max = I32_MAX;
501 if (data->flags & SF_BEFORE_EOL)
503 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
505 data->flags &= ~SF_FL_BEFORE_EOL;
508 SvCUR_set(data->last_found, 0);
510 SV * sv = data->last_found;
512 SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL;
513 if (mg && mg->mg_len > 0)
517 data->flags &= ~SF_BEFORE_EOL;
520 /* Can match anything (initialization) */
522 S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
524 ANYOF_CLASS_ZERO(cl);
525 ANYOF_BITMAP_SETALL(cl);
526 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
528 cl->flags |= ANYOF_LOCALE;
531 /* Can match anything (initialization) */
533 S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
537 for (value = 0; value <= ANYOF_MAX; value += 2)
538 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
540 if (!(cl->flags & ANYOF_UNICODE_ALL))
542 if (!ANYOF_BITMAP_TESTALLSET(cl))
547 /* Can match anything (initialization) */
549 S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
551 Zero(cl, 1, struct regnode_charclass_class);
553 cl_anything(pRExC_state, cl);
557 S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
559 Zero(cl, 1, struct regnode_charclass_class);
561 cl_anything(pRExC_state, cl);
563 cl->flags |= ANYOF_LOCALE;
566 /* 'And' a given class with another one. Can create false positives */
567 /* We assume that cl is not inverted */
569 S_cl_and(pTHX_ struct regnode_charclass_class *cl,
570 struct regnode_charclass_class *and_with)
572 if (!(and_with->flags & ANYOF_CLASS)
573 && !(cl->flags & ANYOF_CLASS)
574 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
575 && !(and_with->flags & ANYOF_FOLD)
576 && !(cl->flags & ANYOF_FOLD)) {
579 if (and_with->flags & ANYOF_INVERT)
580 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
581 cl->bitmap[i] &= ~and_with->bitmap[i];
583 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
584 cl->bitmap[i] &= and_with->bitmap[i];
585 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
586 if (!(and_with->flags & ANYOF_EOS))
587 cl->flags &= ~ANYOF_EOS;
589 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
590 !(and_with->flags & ANYOF_INVERT)) {
591 cl->flags &= ~ANYOF_UNICODE_ALL;
592 cl->flags |= ANYOF_UNICODE;
593 ARG_SET(cl, ARG(and_with));
595 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
596 !(and_with->flags & ANYOF_INVERT))
597 cl->flags &= ~ANYOF_UNICODE_ALL;
598 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
599 !(and_with->flags & ANYOF_INVERT))
600 cl->flags &= ~ANYOF_UNICODE;
603 /* 'OR' a given class with another one. Can create false positives */
604 /* We assume that cl is not inverted */
606 S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
608 if (or_with->flags & ANYOF_INVERT) {
610 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
611 * <= (B1 | !B2) | (CL1 | !CL2)
612 * which is wasteful if CL2 is small, but we ignore CL2:
613 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
614 * XXXX Can we handle case-fold? Unclear:
615 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
616 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
618 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
619 && !(or_with->flags & ANYOF_FOLD)
620 && !(cl->flags & ANYOF_FOLD) ) {
623 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
624 cl->bitmap[i] |= ~or_with->bitmap[i];
625 } /* XXXX: logic is complicated otherwise */
627 cl_anything(pRExC_state, cl);
630 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
631 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
632 && (!(or_with->flags & ANYOF_FOLD)
633 || (cl->flags & ANYOF_FOLD)) ) {
636 /* OR char bitmap and class bitmap separately */
637 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
638 cl->bitmap[i] |= or_with->bitmap[i];
639 if (or_with->flags & ANYOF_CLASS) {
640 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
641 cl->classflags[i] |= or_with->classflags[i];
642 cl->flags |= ANYOF_CLASS;
645 else { /* XXXX: logic is complicated, leave it along for a moment. */
646 cl_anything(pRExC_state, cl);
649 if (or_with->flags & ANYOF_EOS)
650 cl->flags |= ANYOF_EOS;
652 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
653 ARG(cl) != ARG(or_with)) {
654 cl->flags |= ANYOF_UNICODE_ALL;
655 cl->flags &= ~ANYOF_UNICODE;
657 if (or_with->flags & ANYOF_UNICODE_ALL) {
658 cl->flags |= ANYOF_UNICODE_ALL;
659 cl->flags &= ~ANYOF_UNICODE;
665 make_trie(startbranch,first,last,tail,flags)
666 startbranch: the first branch in the whole branch sequence
667 first : start branch of sequence of branch-exact nodes.
668 May be the same as startbranch
669 last : Thing following the last branch.
670 May be the same as tail.
671 tail : item following the branch sequence
672 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
674 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
676 A trie is an N'ary tree where the branches are determined by digital
677 decomposition of the key. IE, at the root node you look up the 1st character and
678 follow that branch repeat until you find the end of the branches. Nodes can be
679 marked as "accepting" meaning they represent a complete word. Eg:
683 would convert into the following structure. Numbers represent states, letters
684 following numbers represent valid transitions on the letter from that state, if
685 the number is in square brackets it represents an accepting state, otherwise it
686 will be in parenthesis.
688 +-h->+-e->[3]-+-r->(8)-+-s->[9]
692 (1) +-i->(6)-+-s->[7]
694 +-s->(3)-+-h->(4)-+-e->[5]
696 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
698 This shows that when matching against the string 'hers' we will begin at state 1
699 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
700 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
701 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
702 single traverse. We store a mapping from accepting to state to which word was
703 matched, and then when we have multiple possibilities we try to complete the
704 rest of the regex in the order in which they occured in the alternation.
706 The only prior NFA like behaviour that would be changed by the TRIE support is
707 the silent ignoring of duplicate alternations which are of the form:
709 / (DUPE|DUPE) X? (?{ ... }) Y /x
711 Thus EVAL blocks follwing a trie may be called a different number of times with
712 and without the optimisation. With the optimisations dupes will be silently
713 ignored. This inconsistant behaviour of EVAL type nodes is well established as
714 the following demonstrates:
716 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
718 which prints out 'word' three times, but
720 'words'=~/(word|word|word)(?{ print $1 })S/
722 which doesnt print it out at all. This is due to other optimisations kicking in.
724 Example of what happens on a structural level:
726 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
728 1: CURLYM[1] {1,32767}(18)
739 This would be optimizable with startbranch=5, first=5, last=16, tail=16
740 and should turn into:
742 1: CURLYM[1] {1,32767}(18)
744 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
752 Cases where tail != last would be like /(?foo|bar)baz/:
762 which would be optimizable with startbranch=1, first=1, last=7, tail=8
763 and would end up looking like:
766 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
775 #define TRIE_DEBUG_CHAR \
776 DEBUG_TRIE_COMPILE_r({ \
779 tmp = newSVpv( "", 0 ); \
780 pv_uni_display( tmp, uc, len, 60, UNI_DISPLAY_REGEX ); \
782 tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc ); \
784 av_push( trie->revcharmap, tmp ); \
787 #define TRIE_READ_CHAR STMT_START { \
790 if ( foldlen > 0 ) { \
791 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
796 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
797 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
798 foldlen -= UNISKIP( uvc ); \
799 scan = foldbuf + UNISKIP( uvc ); \
802 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
811 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
812 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
813 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
814 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
816 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
817 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
818 TRIE_LIST_LEN( state ) *= 2; \
819 Renew( trie->states[ state ].trans.list, \
820 TRIE_LIST_LEN( state ), reg_trie_trans_le ); \
822 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
823 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
824 TRIE_LIST_CUR( state )++; \
827 #define TRIE_LIST_NEW(state) STMT_START { \
828 Newz( 1023, trie->states[ state ].trans.list, \
829 4, reg_trie_trans_le ); \
830 TRIE_LIST_CUR( state ) = 1; \
831 TRIE_LIST_LEN( state ) = 4; \
835 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 flags)
838 /* first pass, loop through and scan words */
841 const U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
846 /* we just use folder as a flag in utf8 */
847 const U8 * const folder = ( flags == EXACTF
855 const U32 data_slot = add_data( pRExC_state, 1, "t" );
858 GET_RE_DEBUG_FLAGS_DECL;
860 Newz( 848200, trie, 1, reg_trie_data );
862 RExC_rx->data->data[ data_slot ] = (void*)trie;
863 Newz( 848201, trie->charmap, 256, U16 );
865 trie->words = newAV();
866 trie->revcharmap = newAV();
870 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
871 if (!SvIOK(re_trie_maxbuff)) {
872 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
875 /* -- First loop and Setup --
877 We first traverse the branches and scan each word to determine if it
878 contains widechars, and how many unique chars there are, this is
879 important as we have to build a table with at least as many columns as we
882 We use an array of integers to represent the character codes 0..255
883 (trie->charmap) and we use a an HV* to store unicode characters. We use the
884 native representation of the character value as the key and IV's for the
887 *TODO* If we keep track of how many times each character is used we can
888 remap the columns so that the table compression later on is more
889 efficient in terms of memory by ensuring most common value is in the
890 middle and the least common are on the outside. IMO this would be better
891 than a most to least common mapping as theres a decent chance the most
892 common letter will share a node with the least common, meaning the node
893 will not be compressable. With a middle is most common approach the worst
894 case is when we have the least common nodes twice.
899 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
900 regnode *noper = NEXTOPER( cur );
901 const U8 *uc = (U8*)STRING( noper );
902 const U8 *e = uc + STR_LEN( noper );
904 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
905 const U8 *scan = (U8*)NULL;
907 for ( ; uc < e ; uc += len ) {
911 if ( !trie->charmap[ uvc ] ) {
912 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
914 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
919 if ( !trie->widecharmap )
920 trie->widecharmap = newHV();
922 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
925 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
927 if ( !SvTRUE( *svpp ) ) {
928 sv_setiv( *svpp, ++trie->uniquecharcount );
934 } /* end first pass */
935 DEBUG_TRIE_COMPILE_r(
936 PerlIO_printf( Perl_debug_log, "TRIE(%s): W:%d C:%d Uq:%d \n",
937 ( trie->widecharmap ? "UTF8" : "NATIVE" ), trie->wordcount,
938 trie->charcount, trie->uniquecharcount )
943 We now know what we are dealing with in terms of unique chars and
944 string sizes so we can calculate how much memory a naive
945 representation using a flat table will take. If it's over a reasonable
946 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
947 conservative but potentially much slower representation using an array
950 At the end we convert both representations into the same compressed
951 form that will be used in regexec.c for matching with. The latter
952 is a form that cannot be used to construct with but has memory
953 properties similar to the list form and access properties similar
954 to the table form making it both suitable for fast searches and
955 small enough that its feasable to store for the duration of a program.
957 See the comment in the code where the compressed table is produced
958 inplace from the flat tabe representation for an explanation of how
959 the compression works.
964 if ( (IV)( ( trie->charcount + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
966 Second Pass -- Array Of Lists Representation
968 Each state will be represented by a list of charid:state records
969 (reg_trie_trans_le) the first such element holds the CUR and LEN
970 points of the allocated array. (See defines above).
972 We build the initial structure using the lists, and then convert
973 it into the compressed table form which allows faster lookups
974 (but cant be modified once converted).
980 STRLEN transcount = 1;
982 Newz( 848204, trie->states, trie->charcount + 2, reg_trie_state );
986 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
988 regnode *noper = NEXTOPER( cur );
989 U8 *uc = (U8*)STRING( noper );
990 U8 *e = uc + STR_LEN( noper );
991 U32 state = 1; /* required init */
992 U16 charid = 0; /* sanity init */
993 U8 *scan = (U8*)NULL; /* sanity init */
994 STRLEN foldlen = 0; /* required init */
995 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
998 for ( ; uc < e ; uc += len ) {
1003 charid = trie->charmap[ uvc ];
1005 SV** svpp=(SV**)NULL;
1006 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1010 charid=(U16)SvIV( *svpp );
1019 if ( !trie->states[ state ].trans.list ) {
1020 TRIE_LIST_NEW( state );
1022 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1023 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1024 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1029 newstate = next_alloc++;
1030 TRIE_LIST_PUSH( state, charid, newstate );
1036 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1038 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1041 if ( !trie->states[ state ].wordnum ) {
1042 /* we havent inserted this word into the structure yet. */
1043 trie->states[ state ].wordnum = ++curword;
1046 /* store the word for dumping */
1047 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
1048 if ( UTF ) SvUTF8_on( tmp );
1049 av_push( trie->words, tmp );
1053 /* Its a dupe. So ignore it. */
1056 } /* end second pass */
1058 trie->laststate = next_alloc;
1059 Renew( trie->states, next_alloc, reg_trie_state );
1061 DEBUG_TRIE_COMPILE_MORE_r({
1066 print out the table precompression.
1069 PerlIO_printf( Perl_debug_log, "\nState :Word | Transition Data\n" );
1070 PerlIO_printf( Perl_debug_log, "------:-----+-----------------" );
1072 for( state=1 ; state < next_alloc ; state ++ ) {
1074 PerlIO_printf( Perl_debug_log, "\n %04"UVXf" :", (UV)state );
1075 if ( ! trie->states[ state ].wordnum ) {
1076 PerlIO_printf( Perl_debug_log, "%5s| ","");
1078 PerlIO_printf( Perl_debug_log, "W%04x| ",
1079 trie->states[ state ].wordnum
1082 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1083 SV **tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1084 PerlIO_printf( Perl_debug_log, "%s:%3X=%04"UVXf" | ",
1086 TRIE_LIST_ITEM(state,charid).forid,
1087 (UV)TRIE_LIST_ITEM(state,charid).newstate
1092 PerlIO_printf( Perl_debug_log, "\n\n" );
1095 Newz( 848203, trie->trans, transcount ,reg_trie_trans );
1103 for( state=1 ; state < next_alloc ; state ++ ) {
1107 DEBUG_TRIE_COMPILE_MORE_r(
1108 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1112 if (trie->states[state].trans.list) {
1113 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1117 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1118 if ( TRIE_LIST_ITEM( state, idx).forid < minid ) {
1119 minid=TRIE_LIST_ITEM( state, idx).forid;
1120 } else if ( TRIE_LIST_ITEM( state, idx).forid > maxid ) {
1121 maxid=TRIE_LIST_ITEM( state, idx).forid;
1124 if ( transcount < tp + maxid - minid + 1) {
1126 Renew( trie->trans, transcount, reg_trie_trans );
1127 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1129 base = trie->uniquecharcount + tp - minid;
1130 if ( maxid == minid ) {
1132 for ( ; zp < tp ; zp++ ) {
1133 if ( ! trie->trans[ zp ].next ) {
1134 base = trie->uniquecharcount + zp - minid;
1135 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1136 trie->trans[ zp ].check = state;
1142 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1143 trie->trans[ tp ].check = state;
1148 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1149 U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1150 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1151 trie->trans[ tid ].check = state;
1153 tp += ( maxid - minid + 1 );
1155 Safefree(trie->states[ state ].trans.list);
1158 DEBUG_TRIE_COMPILE_MORE_r(
1159 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1162 trie->states[ state ].trans.base=base;
1164 trie->lasttrans = tp + 1;
1168 Second Pass -- Flat Table Representation.
1170 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1171 We know that we will need Charcount+1 trans at most to store the data
1172 (one row per char at worst case) So we preallocate both structures
1173 assuming worst case.
1175 We then construct the trie using only the .next slots of the entry
1178 We use the .check field of the first entry of the node temporarily to
1179 make compression both faster and easier by keeping track of how many non
1180 zero fields are in the node.
1182 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1185 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1186 number representing the first entry of the node, and state as a
1187 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1188 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1189 are 2 entrys per node. eg:
1197 The table is internally in the right hand, idx form. However as we also
1198 have to deal with the states array which is indexed by nodenum we have to
1199 use TRIE_NODENUM() to convert.
1203 Newz( 848203, trie->trans, ( trie->charcount + 1 ) * trie->uniquecharcount + 1,
1205 Newz( 848204, trie->states, trie->charcount + 2, reg_trie_state );
1206 next_alloc = trie->uniquecharcount + 1;
1208 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1210 regnode *noper = NEXTOPER( cur );
1211 U8 *uc = (U8*)STRING( noper );
1212 U8 *e = uc + STR_LEN( noper );
1214 U32 state = 1; /* required init */
1216 U16 charid = 0; /* sanity init */
1217 U32 accept_state = 0; /* sanity init */
1218 U8 *scan = (U8*)NULL; /* sanity init */
1220 STRLEN foldlen = 0; /* required init */
1221 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1224 for ( ; uc < e ; uc += len ) {
1229 charid = trie->charmap[ uvc ];
1231 SV** svpp=(SV**)NULL;
1232 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1236 charid=(U16)SvIV( *svpp );
1241 if ( !trie->trans[ state + charid ].next ) {
1242 trie->trans[ state + charid ].next = next_alloc;
1243 trie->trans[ state ].check++;
1244 next_alloc += trie->uniquecharcount;
1246 state = trie->trans[ state + charid ].next;
1248 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1250 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1253 accept_state = TRIE_NODENUM( state );
1254 if ( !trie->states[ accept_state ].wordnum ) {
1255 /* we havent inserted this word into the structure yet. */
1256 trie->states[ accept_state ].wordnum = ++curword;
1259 /* store the word for dumping */
1260 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
1261 if ( UTF ) SvUTF8_on( tmp );
1262 av_push( trie->words, tmp );
1266 /* Its a dupe. So ignore it. */
1269 } /* end second pass */
1271 DEBUG_TRIE_COMPILE_MORE_r({
1273 print out the table precompression so that we can do a visual check
1274 that they are identical.
1278 PerlIO_printf( Perl_debug_log, "\nChar : " );
1280 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1281 SV **tmp = av_fetch( trie->revcharmap, charid, 0);
1283 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen( *tmp ) );
1287 PerlIO_printf( Perl_debug_log, "\nState+-" );
1289 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1290 PerlIO_printf( Perl_debug_log, "%4s-", "----" );
1293 PerlIO_printf( Perl_debug_log, "\n" );
1295 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1297 PerlIO_printf( Perl_debug_log, "%04"UVXf" : ", (UV)TRIE_NODENUM( state ) );
1299 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1300 PerlIO_printf( Perl_debug_log, "%04"UVXf" ",
1301 (UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ) );
1303 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1304 PerlIO_printf( Perl_debug_log, " (%04"UVXf")\n", (UV)trie->trans[ state ].check );
1306 PerlIO_printf( Perl_debug_log, " (%04"UVXf") W%04X\n", (UV)trie->trans[ state ].check,
1307 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1310 PerlIO_printf( Perl_debug_log, "\n\n" );
1314 * Inplace compress the table.*
1316 For sparse data sets the table constructed by the trie algorithm will
1317 be mostly 0/FAIL transitions or to put it another way mostly empty.
1318 (Note that leaf nodes will not contain any transitions.)
1320 This algorithm compresses the tables by eliminating most such
1321 transitions, at the cost of a modest bit of extra work during lookup:
1323 - Each states[] entry contains a .base field which indicates the
1324 index in the state[] array wheres its transition data is stored.
1326 - If .base is 0 there are no valid transitions from that node.
1328 - If .base is nonzero then charid is added to it to find an entry in
1331 -If trans[states[state].base+charid].check!=state then the
1332 transition is taken to be a 0/Fail transition. Thus if there are fail
1333 transitions at the front of the node then the .base offset will point
1334 somewhere inside the previous nodes data (or maybe even into a node
1335 even earlier), but the .check field determines if the transition is
1338 The following process inplace converts the table to the compressed
1339 table: We first do not compress the root node 1,and mark its all its
1340 .check pointers as 1 and set its .base pointer as 1 as well. This
1341 allows to do a DFA construction from the compressed table later, and
1342 ensures that any .base pointers we calculate later are greater than
1345 - We set 'pos' to indicate the first entry of the second node.
1347 - We then iterate over the columns of the node, finding the first and
1348 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1349 and set the .check pointers accordingly, and advance pos
1350 appropriately and repreat for the next node. Note that when we copy
1351 the next pointers we have to convert them from the original
1352 NODEIDX form to NODENUM form as the former is not valid post
1355 - If a node has no transitions used we mark its base as 0 and do not
1356 advance the pos pointer.
1358 - If a node only has one transition we use a second pointer into the
1359 structure to fill in allocated fail transitions from other states.
1360 This pointer is independent of the main pointer and scans forward
1361 looking for null transitions that are allocated to a state. When it
1362 finds one it writes the single transition into the "hole". If the
1363 pointer doesnt find one the single transition is appeneded as normal.
1365 - Once compressed we can Renew/realloc the structures to release the
1368 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1369 specifically Fig 3.47 and the associated pseudocode.
1373 const U32 laststate = TRIE_NODENUM( next_alloc );
1374 U32 used , state, charid;
1376 trie->laststate = laststate;
1378 for ( state = 1 ; state < laststate ; state++ ) {
1380 U32 stateidx = TRIE_NODEIDX( state );
1381 U32 o_used=trie->trans[ stateidx ].check;
1382 used = trie->trans[ stateidx ].check;
1383 trie->trans[ stateidx ].check = 0;
1385 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1386 if ( flag || trie->trans[ stateidx + charid ].next ) {
1387 if ( trie->trans[ stateidx + charid ].next ) {
1389 for ( ; zp < pos ; zp++ ) {
1390 if ( ! trie->trans[ zp ].next ) {
1394 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1395 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1396 trie->trans[ zp ].check = state;
1397 if ( ++zp > pos ) pos = zp;
1404 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1406 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1407 trie->trans[ pos ].check = state;
1412 trie->lasttrans = pos + 1;
1413 Renew( trie->states, laststate + 1, reg_trie_state);
1414 DEBUG_TRIE_COMPILE_MORE_r(
1415 PerlIO_printf( Perl_debug_log,
1416 " Alloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1417 ( ( trie->charcount + 1 ) * trie->uniquecharcount + 1 ), (IV)next_alloc, (IV)pos,
1418 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1421 } /* end table compress */
1423 /* resize the trans array to remove unused space */
1424 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
1426 DEBUG_TRIE_COMPILE_r({
1429 Now we print it out again, in a slightly different form as there is additional
1430 info we want to be able to see when its compressed. They are close enough for
1431 visual comparison though.
1433 PerlIO_printf( Perl_debug_log, "\nChar : %-6s%-6s%-4s ","Match","Base","Ofs" );
1435 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1436 SV **tmp = av_fetch( trie->revcharmap, state, 0);
1438 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen( *tmp ) );
1441 PerlIO_printf( Perl_debug_log, "\n-----:-----------------------");
1443 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1444 PerlIO_printf( Perl_debug_log, "-----");
1445 PerlIO_printf( Perl_debug_log, "\n");
1447 for( state = 1 ; state < trie->laststate ; state++ ) {
1448 U32 base = trie->states[ state ].trans.base;
1450 PerlIO_printf( Perl_debug_log, "#%04"UVXf" ", (UV)state);
1452 if ( trie->states[ state ].wordnum ) {
1453 PerlIO_printf( Perl_debug_log, " W%04X", trie->states[ state ].wordnum );
1455 PerlIO_printf( Perl_debug_log, "%6s", "" );
1458 PerlIO_printf( Perl_debug_log, " @%04"UVXf" ", (UV)base );
1463 while( ( base + ofs < trie->uniquecharcount ) ||
1464 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1465 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1468 PerlIO_printf( Perl_debug_log, "+%02"UVXf"[ ", (UV)ofs);
1470 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1471 if ( ( base + ofs >= trie->uniquecharcount ) &&
1472 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1473 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1475 PerlIO_printf( Perl_debug_log, "%04"UVXf" ",
1476 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1478 PerlIO_printf( Perl_debug_log, "%4s "," 0" );
1482 PerlIO_printf( Perl_debug_log, "]");
1485 PerlIO_printf( Perl_debug_log, "\n" );
1490 /* now finally we "stitch in" the new TRIE node
1491 This means we convert either the first branch or the first Exact,
1492 depending on whether the thing following (in 'last') is a branch
1493 or not and whther first is the startbranch (ie is it a sub part of
1494 the alternation or is it the whole thing.)
1495 Assuming its a sub part we conver the EXACT otherwise we convert
1496 the whole branch sequence, including the first.
1503 if ( first == startbranch && OP( last ) != BRANCH ) {
1506 convert = NEXTOPER( first );
1507 NEXT_OFF( first ) = (U16)(last - first);
1510 OP( convert ) = TRIE + (U8)( flags - EXACT );
1511 NEXT_OFF( convert ) = (U16)(tail - convert);
1512 ARG_SET( convert, data_slot );
1514 /* tells us if we need to handle accept buffers specially */
1515 convert->flags = ( RExC_seen_evals ? 1 : 0 );
1518 /* needed for dumping*/
1520 regnode *optimize = convert + NODE_STEP_REGNODE + regarglen[ TRIE ];
1521 /* We now need to mark all of the space originally used by the
1522 branches as optimized away. This keeps the dumpuntil from
1523 throwing a wobbly as it doesnt use regnext() to traverse the
1526 while( optimize < last ) {
1527 OP( optimize ) = OPTIMIZED;
1531 } /* end node insert */
1538 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
1539 * These need to be revisited when a newer toolchain becomes available.
1541 #if defined(__sparc64__) && defined(__GNUC__)
1542 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
1543 # undef SPARC64_GCC_WORKAROUND
1544 # define SPARC64_GCC_WORKAROUND 1
1548 /* REx optimizer. Converts nodes into quickier variants "in place".
1549 Finds fixed substrings. */
1551 /* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
1552 to the position after last scanned or to NULL. */
1556 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags, U32 depth)
1557 /* scanp: Start here (read-write). */
1558 /* deltap: Write maxlen-minlen here. */
1559 /* last: Stop before this one. */
1561 I32 min = 0, pars = 0, code;
1562 regnode *scan = *scanp, *next;
1564 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
1565 int is_inf_internal = 0; /* The studied chunk is infinite */
1566 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
1567 scan_data_t data_fake;
1568 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
1569 SV *re_trie_maxbuff = NULL;
1571 GET_RE_DEBUG_FLAGS_DECL;
1573 while (scan && OP(scan) != END && scan < last) {
1574 /* Peephole optimizer: */
1576 SV *mysv=sv_newmortal();
1577 regprop( mysv, scan);
1578 PerlIO_printf(Perl_debug_log, "%*speep: %s (0x%08"UVXf")\n",
1579 (int)depth*2, "", SvPV_nolen(mysv), PTR2UV(scan));
1582 if (PL_regkind[(U8)OP(scan)] == EXACT) {
1583 /* Merge several consecutive EXACTish nodes into one. */
1584 regnode *n = regnext(scan);
1587 regnode *stop = scan;
1590 next = scan + NODE_SZ_STR(scan);
1591 /* Skip NOTHING, merge EXACT*. */
1593 ( PL_regkind[(U8)OP(n)] == NOTHING ||
1594 (stringok && (OP(n) == OP(scan))))
1596 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
1597 if (OP(n) == TAIL || n > next)
1599 if (PL_regkind[(U8)OP(n)] == NOTHING) {
1600 NEXT_OFF(scan) += NEXT_OFF(n);
1601 next = n + NODE_STEP_REGNODE;
1608 else if (stringok) {
1609 const int oldl = STR_LEN(scan);
1610 regnode *nnext = regnext(n);
1612 if (oldl + STR_LEN(n) > U8_MAX)
1614 NEXT_OFF(scan) += NEXT_OFF(n);
1615 STR_LEN(scan) += STR_LEN(n);
1616 next = n + NODE_SZ_STR(n);
1617 /* Now we can overwrite *n : */
1618 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
1626 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
1628 Two problematic code points in Unicode casefolding of EXACT nodes:
1630 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
1631 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
1637 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
1638 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
1640 This means that in case-insensitive matching (or "loose matching",
1641 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
1642 length of the above casefolded versions) can match a target string
1643 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
1644 This would rather mess up the minimum length computation.
1646 What we'll do is to look for the tail four bytes, and then peek
1647 at the preceding two bytes to see whether we need to decrease
1648 the minimum length by four (six minus two).
1650 Thanks to the design of UTF-8, there cannot be false matches:
1651 A sequence of valid UTF-8 bytes cannot be a subsequence of
1652 another valid sequence of UTF-8 bytes.
1655 char *s0 = STRING(scan), *s, *t;
1656 char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
1657 const char *t0 = "\xcc\x88\xcc\x81";
1658 const char *t1 = t0 + 3;
1661 s < s2 && (t = ninstr(s, s1, t0, t1));
1663 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
1664 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
1671 n = scan + NODE_SZ_STR(scan);
1673 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
1684 /* Follow the next-chain of the current node and optimize
1685 away all the NOTHINGs from it. */
1686 if (OP(scan) != CURLYX) {
1687 const int max = (reg_off_by_arg[OP(scan)]
1689 /* I32 may be smaller than U16 on CRAYs! */
1690 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
1691 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
1695 /* Skip NOTHING and LONGJMP. */
1696 while ((n = regnext(n))
1697 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
1698 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
1699 && off + noff < max)
1701 if (reg_off_by_arg[OP(scan)])
1704 NEXT_OFF(scan) = off;
1707 /* The principal pseudo-switch. Cannot be a switch, since we
1708 look into several different things. */
1709 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
1710 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
1711 next = regnext(scan);
1713 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
1715 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
1716 I32 max1 = 0, min1 = I32_MAX, num = 0;
1717 struct regnode_charclass_class accum;
1718 regnode *startbranch=scan;
1720 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
1721 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
1722 if (flags & SCF_DO_STCLASS)
1723 cl_init_zero(pRExC_state, &accum);
1725 while (OP(scan) == code) {
1726 I32 deltanext, minnext, f = 0, fake;
1727 struct regnode_charclass_class this_class;
1730 data_fake.flags = 0;
1732 data_fake.whilem_c = data->whilem_c;
1733 data_fake.last_closep = data->last_closep;
1736 data_fake.last_closep = &fake;
1737 next = regnext(scan);
1738 scan = NEXTOPER(scan);
1740 scan = NEXTOPER(scan);
1741 if (flags & SCF_DO_STCLASS) {
1742 cl_init(pRExC_state, &this_class);
1743 data_fake.start_class = &this_class;
1744 f = SCF_DO_STCLASS_AND;
1746 if (flags & SCF_WHILEM_VISITED_POS)
1747 f |= SCF_WHILEM_VISITED_POS;
1749 /* we suppose the run is continuous, last=next...*/
1750 minnext = study_chunk(pRExC_state, &scan, &deltanext,
1751 next, &data_fake, f,depth+1);
1754 if (max1 < minnext + deltanext)
1755 max1 = minnext + deltanext;
1756 if (deltanext == I32_MAX)
1757 is_inf = is_inf_internal = 1;
1759 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1761 if (data && (data_fake.flags & SF_HAS_EVAL))
1762 data->flags |= SF_HAS_EVAL;
1764 data->whilem_c = data_fake.whilem_c;
1765 if (flags & SCF_DO_STCLASS)
1766 cl_or(pRExC_state, &accum, &this_class);
1767 if (code == SUSPEND)
1770 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
1772 if (flags & SCF_DO_SUBSTR) {
1773 data->pos_min += min1;
1774 data->pos_delta += max1 - min1;
1775 if (max1 != min1 || is_inf)
1776 data->longest = &(data->longest_float);
1779 delta += max1 - min1;
1780 if (flags & SCF_DO_STCLASS_OR) {
1781 cl_or(pRExC_state, data->start_class, &accum);
1783 cl_and(data->start_class, &and_with);
1784 flags &= ~SCF_DO_STCLASS;
1787 else if (flags & SCF_DO_STCLASS_AND) {
1789 cl_and(data->start_class, &accum);
1790 flags &= ~SCF_DO_STCLASS;
1793 /* Switch to OR mode: cache the old value of
1794 * data->start_class */
1795 StructCopy(data->start_class, &and_with,
1796 struct regnode_charclass_class);
1797 flags &= ~SCF_DO_STCLASS_AND;
1798 StructCopy(&accum, data->start_class,
1799 struct regnode_charclass_class);
1800 flags |= SCF_DO_STCLASS_OR;
1801 data->start_class->flags |= ANYOF_EOS;
1807 Assuming this was/is a branch we are dealing with: 'scan' now
1808 points at the item that follows the branch sequence, whatever
1809 it is. We now start at the beginning of the sequence and look
1815 which would be constructed from a pattern like /A|LIST|OF|WORDS/
1817 If we can find such a subseqence we need to turn the first
1818 element into a trie and then add the subsequent branch exact
1819 strings to the trie.
1823 1. patterns where the whole set of branch can be converted to a trie,
1825 2. patterns where only a subset of the alternations can be
1826 converted to a trie.
1828 In case 1 we can replace the whole set with a single regop
1829 for the trie. In case 2 we need to keep the start and end
1832 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
1833 becomes BRANCH TRIE; BRANCH X;
1835 Hypthetically when we know the regex isnt anchored we can
1836 turn a case 1 into a DFA and let it rip... Every time it finds a match
1837 it would just call its tail, no WHILEM/CURLY needed.
1841 if (!re_trie_maxbuff) {
1842 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1843 if (!SvIOK(re_trie_maxbuff))
1844 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1846 if ( SvIV(re_trie_maxbuff)>=0 && OP( startbranch )==BRANCH ) {
1848 regnode *first = (regnode *)NULL;
1849 regnode *last = (regnode *)NULL;
1850 regnode *tail = scan;
1855 SV *mysv = sv_newmortal(); /* for dumping */
1857 /* var tail is used because there may be a TAIL
1858 regop in the way. Ie, the exacts will point to the
1859 thing following the TAIL, but the last branch will
1860 point at the TAIL. So we advance tail. If we
1861 have nested (?:) we may have to move through several
1865 while ( OP( tail ) == TAIL ) {
1866 /* this is the TAIL generated by (?:) */
1867 tail = regnext( tail );
1871 regprop( mysv, tail );
1872 PerlIO_printf( Perl_debug_log, "%*s%s%s%s\n",
1873 (int)depth * 2 + 2, "", "Tail node is:", SvPV_nolen( mysv ),
1874 (RExC_seen_evals) ? "[EVAL]" : ""
1879 step through the branches, cur represents each
1880 branch, noper is the first thing to be matched
1881 as part of that branch and noper_next is the
1882 regnext() of that node. if noper is an EXACT
1883 and noper_next is the same as scan (our current
1884 position in the regex) then the EXACT branch is
1885 a possible optimization target. Once we have
1886 two or more consequetive such branches we can
1887 create a trie of the EXACT's contents and stich
1888 it in place. If the sequence represents all of
1889 the branches we eliminate the whole thing and
1890 replace it with a single TRIE. If it is a
1891 subsequence then we need to stitch it in. This
1892 means the first branch has to remain, and needs
1893 to be repointed at the item on the branch chain
1894 following the last branch optimized. This could
1895 be either a BRANCH, in which case the
1896 subsequence is internal, or it could be the
1897 item following the branch sequence in which
1898 case the subsequence is at the end.
1902 /* dont use tail as the end marker for this traverse */
1903 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
1904 regnode *noper = NEXTOPER( cur );
1905 regnode *noper_next = regnext( noper );
1908 regprop( mysv, cur);
1909 PerlIO_printf( Perl_debug_log, "%*s%s",
1910 (int)depth * 2 + 2," ", SvPV_nolen( mysv ) );
1912 regprop( mysv, noper);
1913 PerlIO_printf( Perl_debug_log, " -> %s",
1917 regprop( mysv, noper_next );
1918 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
1921 PerlIO_printf( Perl_debug_log, "0x%p,0x%p,0x%p)\n",
1924 if ( ( first ? OP( noper ) == optype
1925 : PL_regkind[ (U8)OP( noper ) ] == EXACT )
1926 && noper_next == tail && count<U16_MAX)
1931 optype = OP( noper );
1935 regprop( mysv, first);
1936 PerlIO_printf( Perl_debug_log, "%*s%s",
1937 (int)depth * 2 + 2, "F:", SvPV_nolen( mysv ) );
1938 regprop( mysv, NEXTOPER(first) );
1939 PerlIO_printf( Perl_debug_log, " -> %s\n",
1940 SvPV_nolen( mysv ) );
1945 regprop( mysv, cur);
1946 PerlIO_printf( Perl_debug_log, "%*s%s",
1947 (int)depth * 2 + 2, "N:", SvPV_nolen( mysv ) );
1948 regprop( mysv, noper );
1949 PerlIO_printf( Perl_debug_log, " -> %s\n",
1950 SvPV_nolen( mysv ) );
1956 PerlIO_printf( Perl_debug_log, "%*s%s\n",
1957 (int)depth * 2 + 2, "E:", "**END**" );
1959 make_trie( pRExC_state, startbranch, first, cur, tail, optype );
1961 if ( PL_regkind[ (U8)OP( noper ) ] == EXACT
1962 && noper_next == tail )
1966 optype = OP( noper );
1976 regprop( mysv, cur);
1977 PerlIO_printf( Perl_debug_log,
1978 "%*s%s\t(0x%p,0x%p,0x%p)\n", (int)depth * 2 + 2,
1979 " ", SvPV_nolen( mysv ), first, last, cur);
1984 PerlIO_printf( Perl_debug_log, "%*s%s\n",
1985 (int)depth * 2 + 2, "E:", "==END==" );
1987 make_trie( pRExC_state, startbranch, first, scan, tail, optype );
1992 else if ( code == BRANCHJ ) { /* single branch is optimized. */
1993 scan = NEXTOPER(NEXTOPER(scan));
1994 } else /* single branch is optimized. */
1995 scan = NEXTOPER(scan);
1998 else if (OP(scan) == EXACT) {
1999 I32 l = STR_LEN(scan);
2000 UV uc = *((U8*)STRING(scan));
2002 const U8 * const s = (U8*)STRING(scan);
2003 l = utf8_length(s, s + l);
2004 uc = utf8_to_uvchr(s, NULL);
2007 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2008 /* The code below prefers earlier match for fixed
2009 offset, later match for variable offset. */
2010 if (data->last_end == -1) { /* Update the start info. */
2011 data->last_start_min = data->pos_min;
2012 data->last_start_max = is_inf
2013 ? I32_MAX : data->pos_min + data->pos_delta;
2015 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2017 SV * sv = data->last_found;
2018 MAGIC *mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2019 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2020 if (mg && mg->mg_len >= 0)
2021 mg->mg_len += utf8_length((U8*)STRING(scan),
2022 (U8*)STRING(scan)+STR_LEN(scan));
2025 SvUTF8_on(data->last_found);
2026 data->last_end = data->pos_min + l;
2027 data->pos_min += l; /* As in the first entry. */
2028 data->flags &= ~SF_BEFORE_EOL;
2030 if (flags & SCF_DO_STCLASS_AND) {
2031 /* Check whether it is compatible with what we know already! */
2035 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2036 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2037 && (!(data->start_class->flags & ANYOF_FOLD)
2038 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2041 ANYOF_CLASS_ZERO(data->start_class);
2042 ANYOF_BITMAP_ZERO(data->start_class);
2044 ANYOF_BITMAP_SET(data->start_class, uc);
2045 data->start_class->flags &= ~ANYOF_EOS;
2047 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2049 else if (flags & SCF_DO_STCLASS_OR) {
2050 /* false positive possible if the class is case-folded */
2052 ANYOF_BITMAP_SET(data->start_class, uc);
2054 data->start_class->flags |= ANYOF_UNICODE_ALL;
2055 data->start_class->flags &= ~ANYOF_EOS;
2056 cl_and(data->start_class, &and_with);
2058 flags &= ~SCF_DO_STCLASS;
2060 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
2061 I32 l = STR_LEN(scan);
2062 UV uc = *((U8*)STRING(scan));
2064 /* Search for fixed substrings supports EXACT only. */
2065 if (flags & SCF_DO_SUBSTR)
2066 scan_commit(pRExC_state, data);
2068 U8 *s = (U8 *)STRING(scan);
2069 l = utf8_length(s, s + l);
2070 uc = utf8_to_uvchr(s, NULL);
2073 if (data && (flags & SCF_DO_SUBSTR))
2075 if (flags & SCF_DO_STCLASS_AND) {
2076 /* Check whether it is compatible with what we know already! */
2080 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2081 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2082 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2084 ANYOF_CLASS_ZERO(data->start_class);
2085 ANYOF_BITMAP_ZERO(data->start_class);
2087 ANYOF_BITMAP_SET(data->start_class, uc);
2088 data->start_class->flags &= ~ANYOF_EOS;
2089 data->start_class->flags |= ANYOF_FOLD;
2090 if (OP(scan) == EXACTFL)
2091 data->start_class->flags |= ANYOF_LOCALE;
2094 else if (flags & SCF_DO_STCLASS_OR) {
2095 if (data->start_class->flags & ANYOF_FOLD) {
2096 /* false positive possible if the class is case-folded.
2097 Assume that the locale settings are the same... */
2099 ANYOF_BITMAP_SET(data->start_class, uc);
2100 data->start_class->flags &= ~ANYOF_EOS;
2102 cl_and(data->start_class, &and_with);
2104 flags &= ~SCF_DO_STCLASS;
2106 else if (strchr((const char*)PL_varies,OP(scan))) {
2107 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2108 I32 f = flags, pos_before = 0;
2109 regnode *oscan = scan;
2110 struct regnode_charclass_class this_class;
2111 struct regnode_charclass_class *oclass = NULL;
2112 I32 next_is_eval = 0;
2114 switch (PL_regkind[(U8)OP(scan)]) {
2115 case WHILEM: /* End of (?:...)* . */
2116 scan = NEXTOPER(scan);
2119 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2120 next = NEXTOPER(scan);
2121 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2123 maxcount = REG_INFTY;
2124 next = regnext(scan);
2125 scan = NEXTOPER(scan);
2129 if (flags & SCF_DO_SUBSTR)
2134 if (flags & SCF_DO_STCLASS) {
2136 maxcount = REG_INFTY;
2137 next = regnext(scan);
2138 scan = NEXTOPER(scan);
2141 is_inf = is_inf_internal = 1;
2142 scan = regnext(scan);
2143 if (flags & SCF_DO_SUBSTR) {
2144 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
2145 data->longest = &(data->longest_float);
2147 goto optimize_curly_tail;
2149 mincount = ARG1(scan);
2150 maxcount = ARG2(scan);
2151 next = regnext(scan);
2152 if (OP(scan) == CURLYX) {
2153 I32 lp = (data ? *(data->last_closep) : 0);
2154 scan->flags = ((lp <= U8_MAX) ? (U8)lp : U8_MAX);
2156 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2157 next_is_eval = (OP(scan) == EVAL);
2159 if (flags & SCF_DO_SUBSTR) {
2160 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
2161 pos_before = data->pos_min;
2165 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2167 data->flags |= SF_IS_INF;
2169 if (flags & SCF_DO_STCLASS) {
2170 cl_init(pRExC_state, &this_class);
2171 oclass = data->start_class;
2172 data->start_class = &this_class;
2173 f |= SCF_DO_STCLASS_AND;
2174 f &= ~SCF_DO_STCLASS_OR;
2176 /* These are the cases when once a subexpression
2177 fails at a particular position, it cannot succeed
2178 even after backtracking at the enclosing scope.
2180 XXXX what if minimal match and we are at the
2181 initial run of {n,m}? */
2182 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2183 f &= ~SCF_WHILEM_VISITED_POS;
2185 /* This will finish on WHILEM, setting scan, or on NULL: */
2186 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
2188 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
2190 if (flags & SCF_DO_STCLASS)
2191 data->start_class = oclass;
2192 if (mincount == 0 || minnext == 0) {
2193 if (flags & SCF_DO_STCLASS_OR) {
2194 cl_or(pRExC_state, data->start_class, &this_class);
2196 else if (flags & SCF_DO_STCLASS_AND) {
2197 /* Switch to OR mode: cache the old value of
2198 * data->start_class */
2199 StructCopy(data->start_class, &and_with,
2200 struct regnode_charclass_class);
2201 flags &= ~SCF_DO_STCLASS_AND;
2202 StructCopy(&this_class, data->start_class,
2203 struct regnode_charclass_class);
2204 flags |= SCF_DO_STCLASS_OR;
2205 data->start_class->flags |= ANYOF_EOS;
2207 } else { /* Non-zero len */
2208 if (flags & SCF_DO_STCLASS_OR) {
2209 cl_or(pRExC_state, data->start_class, &this_class);
2210 cl_and(data->start_class, &and_with);
2212 else if (flags & SCF_DO_STCLASS_AND)
2213 cl_and(data->start_class, &this_class);
2214 flags &= ~SCF_DO_STCLASS;
2216 if (!scan) /* It was not CURLYX, but CURLY. */
2218 if (ckWARN(WARN_REGEXP)
2219 /* ? quantifier ok, except for (?{ ... }) */
2220 && (next_is_eval || !(mincount == 0 && maxcount == 1))
2221 && (minnext == 0) && (deltanext == 0)
2222 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
2223 && maxcount <= REG_INFTY/3) /* Complement check for big count */
2226 "Quantifier unexpected on zero-length expression");
2229 min += minnext * mincount;
2230 is_inf_internal |= ((maxcount == REG_INFTY
2231 && (minnext + deltanext) > 0)
2232 || deltanext == I32_MAX);
2233 is_inf |= is_inf_internal;
2234 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2236 /* Try powerful optimization CURLYX => CURLYN. */
2237 if ( OP(oscan) == CURLYX && data
2238 && data->flags & SF_IN_PAR
2239 && !(data->flags & SF_HAS_EVAL)
2240 && !deltanext && minnext == 1 ) {
2241 /* Try to optimize to CURLYN. */
2242 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
2243 regnode *nxt1 = nxt;
2250 if (!strchr((const char*)PL_simple,OP(nxt))
2251 && !(PL_regkind[(U8)OP(nxt)] == EXACT
2252 && STR_LEN(nxt) == 1))
2258 if (OP(nxt) != CLOSE)
2260 /* Now we know that nxt2 is the only contents: */
2261 oscan->flags = (U8)ARG(nxt);
2263 OP(nxt1) = NOTHING; /* was OPEN. */
2265 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2266 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2267 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2268 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2269 OP(nxt + 1) = OPTIMIZED; /* was count. */
2270 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
2275 /* Try optimization CURLYX => CURLYM. */
2276 if ( OP(oscan) == CURLYX && data
2277 && !(data->flags & SF_HAS_PAR)
2278 && !(data->flags & SF_HAS_EVAL)
2279 && !deltanext /* atom is fixed width */
2280 && minnext != 0 /* CURLYM can't handle zero width */
2282 /* XXXX How to optimize if data == 0? */
2283 /* Optimize to a simpler form. */
2284 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2288 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
2289 && (OP(nxt2) != WHILEM))
2291 OP(nxt2) = SUCCEED; /* Whas WHILEM */
2292 /* Need to optimize away parenths. */
2293 if (data->flags & SF_IN_PAR) {
2294 /* Set the parenth number. */
2295 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2297 if (OP(nxt) != CLOSE)
2298 FAIL("Panic opt close");
2299 oscan->flags = (U8)ARG(nxt);
2300 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2301 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2303 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2304 OP(nxt + 1) = OPTIMIZED; /* was count. */
2305 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2306 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
2309 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2310 regnode *nnxt = regnext(nxt1);
2313 if (reg_off_by_arg[OP(nxt1)])
2314 ARG_SET(nxt1, nxt2 - nxt1);
2315 else if (nxt2 - nxt1 < U16_MAX)
2316 NEXT_OFF(nxt1) = nxt2 - nxt1;
2318 OP(nxt) = NOTHING; /* Cannot beautify */
2323 /* Optimize again: */
2324 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
2330 else if ((OP(oscan) == CURLYX)
2331 && (flags & SCF_WHILEM_VISITED_POS)
2332 /* See the comment on a similar expression above.
2333 However, this time it not a subexpression
2334 we care about, but the expression itself. */
2335 && (maxcount == REG_INFTY)
2336 && data && ++data->whilem_c < 16) {
2337 /* This stays as CURLYX, we can put the count/of pair. */
2338 /* Find WHILEM (as in regexec.c) */
2339 regnode *nxt = oscan + NEXT_OFF(oscan);
2341 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2343 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2344 | (RExC_whilem_seen << 4)); /* On WHILEM */
2346 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
2348 if (flags & SCF_DO_SUBSTR) {
2349 SV *last_str = Nullsv;
2350 int counted = mincount != 0;
2352 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
2353 #if defined(SPARC64_GCC_WORKAROUND)
2359 if (pos_before >= data->last_start_min)
2362 b = data->last_start_min;
2365 s = SvPV(data->last_found, l);
2366 old = b - data->last_start_min;
2369 I32 b = pos_before >= data->last_start_min
2370 ? pos_before : data->last_start_min;
2372 char *s = SvPV(data->last_found, l);
2373 I32 old = b - data->last_start_min;
2377 old = utf8_hop((U8*)s, old) - (U8*)s;
2380 /* Get the added string: */
2381 last_str = newSVpvn(s + old, l);
2383 SvUTF8_on(last_str);
2384 if (deltanext == 0 && pos_before == b) {
2385 /* What was added is a constant string */
2387 SvGROW(last_str, (mincount * l) + 1);
2388 repeatcpy(SvPVX(last_str) + l,
2389 SvPVX(last_str), l, mincount - 1);
2390 SvCUR_set(last_str, SvCUR(last_str) * mincount);
2391 /* Add additional parts. */
2392 SvCUR_set(data->last_found,
2393 SvCUR(data->last_found) - l);
2394 sv_catsv(data->last_found, last_str);
2396 SV * sv = data->last_found;
2398 SvUTF8(sv) && SvMAGICAL(sv) ?
2399 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2400 if (mg && mg->mg_len >= 0)
2401 mg->mg_len += CHR_SVLEN(last_str);
2403 data->last_end += l * (mincount - 1);
2406 /* start offset must point into the last copy */
2407 data->last_start_min += minnext * (mincount - 1);
2408 data->last_start_max += is_inf ? I32_MAX
2409 : (maxcount - 1) * (minnext + data->pos_delta);
2412 /* It is counted once already... */
2413 data->pos_min += minnext * (mincount - counted);
2414 data->pos_delta += - counted * deltanext +
2415 (minnext + deltanext) * maxcount - minnext * mincount;
2416 if (mincount != maxcount) {
2417 /* Cannot extend fixed substrings found inside
2419 scan_commit(pRExC_state,data);
2420 if (mincount && last_str) {
2421 sv_setsv(data->last_found, last_str);
2422 data->last_end = data->pos_min;
2423 data->last_start_min =
2424 data->pos_min - CHR_SVLEN(last_str);
2425 data->last_start_max = is_inf
2427 : data->pos_min + data->pos_delta
2428 - CHR_SVLEN(last_str);
2430 data->longest = &(data->longest_float);
2432 SvREFCNT_dec(last_str);
2434 if (data && (fl & SF_HAS_EVAL))
2435 data->flags |= SF_HAS_EVAL;
2436 optimize_curly_tail:
2437 if (OP(oscan) != CURLYX) {
2438 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
2440 NEXT_OFF(oscan) += NEXT_OFF(next);
2443 default: /* REF and CLUMP only? */
2444 if (flags & SCF_DO_SUBSTR) {
2445 scan_commit(pRExC_state,data); /* Cannot expect anything... */
2446 data->longest = &(data->longest_float);
2448 is_inf = is_inf_internal = 1;
2449 if (flags & SCF_DO_STCLASS_OR)
2450 cl_anything(pRExC_state, data->start_class);
2451 flags &= ~SCF_DO_STCLASS;
2455 else if (strchr((const char*)PL_simple,OP(scan))) {
2458 if (flags & SCF_DO_SUBSTR) {
2459 scan_commit(pRExC_state,data);
2463 if (flags & SCF_DO_STCLASS) {
2464 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
2466 /* Some of the logic below assumes that switching
2467 locale on will only add false positives. */
2468 switch (PL_regkind[(U8)OP(scan)]) {
2472 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
2473 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2474 cl_anything(pRExC_state, data->start_class);
2477 if (OP(scan) == SANY)
2479 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
2480 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
2481 || (data->start_class->flags & ANYOF_CLASS));
2482 cl_anything(pRExC_state, data->start_class);
2484 if (flags & SCF_DO_STCLASS_AND || !value)
2485 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
2488 if (flags & SCF_DO_STCLASS_AND)
2489 cl_and(data->start_class,
2490 (struct regnode_charclass_class*)scan);
2492 cl_or(pRExC_state, data->start_class,
2493 (struct regnode_charclass_class*)scan);
2496 if (flags & SCF_DO_STCLASS_AND) {
2497 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2498 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2499 for (value = 0; value < 256; value++)
2500 if (!isALNUM(value))
2501 ANYOF_BITMAP_CLEAR(data->start_class, value);
2505 if (data->start_class->flags & ANYOF_LOCALE)
2506 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2508 for (value = 0; value < 256; value++)
2510 ANYOF_BITMAP_SET(data->start_class, value);
2515 if (flags & SCF_DO_STCLASS_AND) {
2516 if (data->start_class->flags & ANYOF_LOCALE)
2517 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2520 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2521 data->start_class->flags |= ANYOF_LOCALE;
2525 if (flags & SCF_DO_STCLASS_AND) {
2526 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2527 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2528 for (value = 0; value < 256; value++)
2530 ANYOF_BITMAP_CLEAR(data->start_class, value);
2534 if (data->start_class->flags & ANYOF_LOCALE)
2535 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2537 for (value = 0; value < 256; value++)
2538 if (!isALNUM(value))
2539 ANYOF_BITMAP_SET(data->start_class, value);
2544 if (flags & SCF_DO_STCLASS_AND) {
2545 if (data->start_class->flags & ANYOF_LOCALE)
2546 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2549 data->start_class->flags |= ANYOF_LOCALE;
2550 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2554 if (flags & SCF_DO_STCLASS_AND) {
2555 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2556 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2557 for (value = 0; value < 256; value++)
2558 if (!isSPACE(value))
2559 ANYOF_BITMAP_CLEAR(data->start_class, value);
2563 if (data->start_class->flags & ANYOF_LOCALE)
2564 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2566 for (value = 0; value < 256; value++)
2568 ANYOF_BITMAP_SET(data->start_class, value);
2573 if (flags & SCF_DO_STCLASS_AND) {
2574 if (data->start_class->flags & ANYOF_LOCALE)
2575 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2578 data->start_class->flags |= ANYOF_LOCALE;
2579 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2583 if (flags & SCF_DO_STCLASS_AND) {
2584 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2585 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2586 for (value = 0; value < 256; value++)
2588 ANYOF_BITMAP_CLEAR(data->start_class, value);
2592 if (data->start_class->flags & ANYOF_LOCALE)
2593 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2595 for (value = 0; value < 256; value++)
2596 if (!isSPACE(value))
2597 ANYOF_BITMAP_SET(data->start_class, value);
2602 if (flags & SCF_DO_STCLASS_AND) {
2603 if (data->start_class->flags & ANYOF_LOCALE) {
2604 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2605 for (value = 0; value < 256; value++)
2606 if (!isSPACE(value))
2607 ANYOF_BITMAP_CLEAR(data->start_class, value);
2611 data->start_class->flags |= ANYOF_LOCALE;
2612 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2616 if (flags & SCF_DO_STCLASS_AND) {
2617 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
2618 for (value = 0; value < 256; value++)
2619 if (!isDIGIT(value))
2620 ANYOF_BITMAP_CLEAR(data->start_class, value);
2623 if (data->start_class->flags & ANYOF_LOCALE)
2624 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
2626 for (value = 0; value < 256; value++)
2628 ANYOF_BITMAP_SET(data->start_class, value);
2633 if (flags & SCF_DO_STCLASS_AND) {
2634 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
2635 for (value = 0; value < 256; value++)
2637 ANYOF_BITMAP_CLEAR(data->start_class, value);
2640 if (data->start_class->flags & ANYOF_LOCALE)
2641 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
2643 for (value = 0; value < 256; value++)
2644 if (!isDIGIT(value))
2645 ANYOF_BITMAP_SET(data->start_class, value);
2650 if (flags & SCF_DO_STCLASS_OR)
2651 cl_and(data->start_class, &and_with);
2652 flags &= ~SCF_DO_STCLASS;
2655 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
2656 data->flags |= (OP(scan) == MEOL
2660 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
2661 /* Lookbehind, or need to calculate parens/evals/stclass: */
2662 && (scan->flags || data || (flags & SCF_DO_STCLASS))
2663 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
2664 /* Lookahead/lookbehind */
2665 I32 deltanext, minnext, fake = 0;
2667 struct regnode_charclass_class intrnl;
2670 data_fake.flags = 0;
2672 data_fake.whilem_c = data->whilem_c;
2673 data_fake.last_closep = data->last_closep;
2676 data_fake.last_closep = &fake;
2677 if ( flags & SCF_DO_STCLASS && !scan->flags
2678 && OP(scan) == IFMATCH ) { /* Lookahead */
2679 cl_init(pRExC_state, &intrnl);
2680 data_fake.start_class = &intrnl;
2681 f |= SCF_DO_STCLASS_AND;
2683 if (flags & SCF_WHILEM_VISITED_POS)
2684 f |= SCF_WHILEM_VISITED_POS;
2685 next = regnext(scan);
2686 nscan = NEXTOPER(NEXTOPER(scan));
2687 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f,depth+1);
2690 vFAIL("Variable length lookbehind not implemented");
2692 else if (minnext > U8_MAX) {
2693 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
2695 scan->flags = (U8)minnext;
2697 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2699 if (data && (data_fake.flags & SF_HAS_EVAL))
2700 data->flags |= SF_HAS_EVAL;
2702 data->whilem_c = data_fake.whilem_c;
2703 if (f & SCF_DO_STCLASS_AND) {
2704 int was = (data->start_class->flags & ANYOF_EOS);
2706 cl_and(data->start_class, &intrnl);
2708 data->start_class->flags |= ANYOF_EOS;
2711 else if (OP(scan) == OPEN) {
2714 else if (OP(scan) == CLOSE) {
2715 if ((I32)ARG(scan) == is_par) {
2716 next = regnext(scan);
2718 if ( next && (OP(next) != WHILEM) && next < last)
2719 is_par = 0; /* Disable optimization */
2722 *(data->last_closep) = ARG(scan);
2724 else if (OP(scan) == EVAL) {
2726 data->flags |= SF_HAS_EVAL;
2728 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
2729 if (flags & SCF_DO_SUBSTR) {
2730 scan_commit(pRExC_state,data);
2731 data->longest = &(data->longest_float);
2733 is_inf = is_inf_internal = 1;
2734 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2735 cl_anything(pRExC_state, data->start_class);
2736 flags &= ~SCF_DO_STCLASS;
2738 /* Else: zero-length, ignore. */
2739 scan = regnext(scan);
2744 *deltap = is_inf_internal ? I32_MAX : delta;
2745 if (flags & SCF_DO_SUBSTR && is_inf)
2746 data->pos_delta = I32_MAX - data->pos_min;
2747 if (is_par > U8_MAX)
2749 if (is_par && pars==1 && data) {
2750 data->flags |= SF_IN_PAR;
2751 data->flags &= ~SF_HAS_PAR;
2753 else if (pars && data) {
2754 data->flags |= SF_HAS_PAR;
2755 data->flags &= ~SF_IN_PAR;
2757 if (flags & SCF_DO_STCLASS_OR)
2758 cl_and(data->start_class, &and_with);
2763 S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, const char *s)
2765 if (RExC_rx->data) {
2766 Renewc(RExC_rx->data,
2767 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
2768 char, struct reg_data);
2769 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
2770 RExC_rx->data->count += n;
2773 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
2774 char, struct reg_data);
2775 New(1208, RExC_rx->data->what, n, U8);
2776 RExC_rx->data->count = n;
2778 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
2779 return RExC_rx->data->count - n;
2783 Perl_reginitcolors(pTHX)
2786 char *s = PerlEnv_getenv("PERL_RE_COLORS");
2789 PL_colors[0] = s = savepv(s);
2791 s = strchr(s, '\t');
2797 PL_colors[i] = s = (char *)"";
2801 PL_colors[i++] = (char *)"";
2808 - pregcomp - compile a regular expression into internal code
2810 * We can't allocate space until we know how big the compiled form will be,
2811 * but we can't compile it (and thus know how big it is) until we've got a
2812 * place to put the code. So we cheat: we compile it twice, once with code
2813 * generation turned off and size counting turned on, and once "for real".
2814 * This also means that we don't allocate space until we are sure that the
2815 * thing really will compile successfully, and we never have to move the
2816 * code and thus invalidate pointers into it. (Note that it has to be in
2817 * one piece because free() must be able to free it all.) [NB: not true in perl]
2819 * Beware that the optimization-preparation code in here knows about some
2820 * of the structure of the compiled regexp. [I'll say.]
2823 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
2833 RExC_state_t RExC_state;
2834 RExC_state_t *pRExC_state = &RExC_state;
2836 GET_RE_DEBUG_FLAGS_DECL;
2839 FAIL("NULL regexp argument");
2841 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
2844 DEBUG_r(if (!PL_colorset) reginitcolors());
2846 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s \"%s%*s%s\"\n",
2847 PL_colors[4],PL_colors[5],PL_colors[0],
2848 (int)(xend - exp), RExC_precomp, PL_colors[1]);
2850 RExC_flags = pm->op_pmflags;
2854 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
2855 RExC_seen_evals = 0;
2858 /* First pass: determine size, legality. */
2865 RExC_emit = &PL_regdummy;
2866 RExC_whilem_seen = 0;
2867 #if 0 /* REGC() is (currently) a NOP at the first pass.
2868 * Clever compilers notice this and complain. --jhi */
2869 REGC((U8)REG_MAGIC, (char*)RExC_emit);
2871 if (reg(pRExC_state, 0, &flags) == NULL) {
2872 RExC_precomp = Nullch;
2875 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
2877 /* Small enough for pointer-storage convention?
2878 If extralen==0, this means that we will not need long jumps. */
2879 if (RExC_size >= 0x10000L && RExC_extralen)
2880 RExC_size += RExC_extralen;
2883 if (RExC_whilem_seen > 15)
2884 RExC_whilem_seen = 15;
2886 /* Allocate space and initialize. */
2887 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
2890 FAIL("Regexp out of space");
2893 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
2894 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
2897 r->prelen = xend - exp;
2898 r->precomp = savepvn(RExC_precomp, r->prelen);
2900 #ifdef PERL_COPY_ON_WRITE
2901 r->saved_copy = Nullsv;
2903 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
2904 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
2906 r->substrs = 0; /* Useful during FAIL. */
2907 r->startp = 0; /* Useful during FAIL. */
2908 r->endp = 0; /* Useful during FAIL. */
2910 Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
2912 r->offsets[0] = RExC_size;
2914 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2915 "%s %"UVuf" bytes for offset annotations.\n",
2916 r->offsets ? "Got" : "Couldn't get",
2917 (UV)((2*RExC_size+1) * sizeof(U32))));
2921 /* Second pass: emit code. */
2922 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
2927 RExC_emit_start = r->program;
2928 RExC_emit = r->program;
2929 /* Store the count of eval-groups for security checks: */
2930 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
2931 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
2933 if (reg(pRExC_state, 0, &flags) == NULL)
2937 /* Dig out information for optimizations. */
2938 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
2939 pm->op_pmflags = RExC_flags;
2941 r->reganch |= ROPT_UTF8; /* Unicode in it? */
2942 r->regstclass = NULL;
2943 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
2944 r->reganch |= ROPT_NAUGHTY;
2945 scan = r->program + 1; /* First BRANCH. */
2947 /* XXXX To minimize changes to RE engine we always allocate
2948 3-units-long substrs field. */
2949 Newz(1004, r->substrs, 1, struct reg_substr_data);
2951 StructCopy(&zero_scan_data, &data, scan_data_t);
2952 /* XXXX Should not we check for something else? Usually it is OPEN1... */
2953 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
2955 STRLEN longest_float_length, longest_fixed_length;
2956 struct regnode_charclass_class ch_class;
2961 /* Skip introductions and multiplicators >= 1. */
2962 while ((OP(first) == OPEN && (sawopen = 1)) ||
2963 /* An OR of *one* alternative - should not happen now. */
2964 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
2965 (OP(first) == PLUS) ||
2966 (OP(first) == MINMOD) ||
2967 /* An {n,m} with n>0 */
2968 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
2969 if (OP(first) == PLUS)
2972 first += regarglen[(U8)OP(first)];
2973 first = NEXTOPER(first);
2976 /* Starting-point info. */
2978 if (PL_regkind[(U8)OP(first)] == EXACT) {
2979 if (OP(first) == EXACT)
2980 ; /* Empty, get anchored substr later. */
2981 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
2982 r->regstclass = first;
2984 else if (strchr((const char*)PL_simple,OP(first)))
2985 r->regstclass = first;
2986 else if (PL_regkind[(U8)OP(first)] == BOUND ||
2987 PL_regkind[(U8)OP(first)] == NBOUND)
2988 r->regstclass = first;
2989 else if (PL_regkind[(U8)OP(first)] == BOL) {
2990 r->reganch |= (OP(first) == MBOL
2992 : (OP(first) == SBOL
2995 first = NEXTOPER(first);
2998 else if (OP(first) == GPOS) {
2999 r->reganch |= ROPT_ANCH_GPOS;
3000 first = NEXTOPER(first);
3003 else if (!sawopen && (OP(first) == STAR &&
3004 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
3005 !(r->reganch & ROPT_ANCH) )
3007 /* turn .* into ^.* with an implied $*=1 */
3008 int type = OP(NEXTOPER(first));
3010 if (type == REG_ANY)
3011 type = ROPT_ANCH_MBOL;
3013 type = ROPT_ANCH_SBOL;
3015 r->reganch |= type | ROPT_IMPLICIT;
3016 first = NEXTOPER(first);
3019 if (sawplus && (!sawopen || !RExC_sawback)
3020 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
3021 /* x+ must match at the 1st pos of run of x's */
3022 r->reganch |= ROPT_SKIP;
3024 /* Scan is after the zeroth branch, first is atomic matcher. */
3025 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
3026 (IV)(first - scan + 1)));
3028 * If there's something expensive in the r.e., find the
3029 * longest literal string that must appear and make it the
3030 * regmust. Resolve ties in favor of later strings, since
3031 * the regstart check works with the beginning of the r.e.
3032 * and avoiding duplication strengthens checking. Not a
3033 * strong reason, but sufficient in the absence of others.
3034 * [Now we resolve ties in favor of the earlier string if
3035 * it happens that c_offset_min has been invalidated, since the
3036 * earlier string may buy us something the later one won't.]
3040 data.longest_fixed = newSVpvn("",0);
3041 data.longest_float = newSVpvn("",0);
3042 data.last_found = newSVpvn("",0);
3043 data.longest = &(data.longest_fixed);
3045 if (!r->regstclass) {
3046 cl_init(pRExC_state, &ch_class);
3047 data.start_class = &ch_class;
3048 stclass_flag = SCF_DO_STCLASS_AND;
3049 } else /* XXXX Check for BOUND? */
3051 data.last_closep = &last_close;
3053 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
3054 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
3055 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
3056 && data.last_start_min == 0 && data.last_end > 0
3057 && !RExC_seen_zerolen
3058 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
3059 r->reganch |= ROPT_CHECK_ALL;
3060 scan_commit(pRExC_state, &data);
3061 SvREFCNT_dec(data.last_found);
3063 longest_float_length = CHR_SVLEN(data.longest_float);
3064 if (longest_float_length
3065 || (data.flags & SF_FL_BEFORE_EOL
3066 && (!(data.flags & SF_FL_BEFORE_MEOL)
3067 || (RExC_flags & PMf_MULTILINE)))) {
3070 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
3071 && data.offset_fixed == data.offset_float_min
3072 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
3073 goto remove_float; /* As in (a)+. */
3075 if (SvUTF8(data.longest_float)) {
3076 r->float_utf8 = data.longest_float;
3077 r->float_substr = Nullsv;
3079 r->float_substr = data.longest_float;
3080 r->float_utf8 = Nullsv;
3082 r->float_min_offset = data.offset_float_min;
3083 r->float_max_offset = data.offset_float_max;
3084 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
3085 && (!(data.flags & SF_FL_BEFORE_MEOL)
3086 || (RExC_flags & PMf_MULTILINE)));
3087 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
3091 r->float_substr = r->float_utf8 = Nullsv;
3092 SvREFCNT_dec(data.longest_float);
3093 longest_float_length = 0;
3096 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
3097 if (longest_fixed_length
3098 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
3099 && (!(data.flags & SF_FIX_BEFORE_MEOL)
3100 || (RExC_flags & PMf_MULTILINE)))) {
3103 if (SvUTF8(data.longest_fixed)) {
3104 r->anchored_utf8 = data.longest_fixed;
3105 r->anchored_substr = Nullsv;
3107 r->anchored_substr = data.longest_fixed;
3108 r->anchored_utf8 = Nullsv;
3110 r->anchored_offset = data.offset_fixed;
3111 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
3112 && (!(data.flags & SF_FIX_BEFORE_MEOL)
3113 || (RExC_flags & PMf_MULTILINE)));
3114 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
3117 r->anchored_substr = r->anchored_utf8 = Nullsv;
3118 SvREFCNT_dec(data.longest_fixed);
3119 longest_fixed_length = 0;
3122 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
3123 r->regstclass = NULL;
3124 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
3126 && !(data.start_class->flags & ANYOF_EOS)
3127 && !cl_is_anything(data.start_class))
3129 I32 n = add_data(pRExC_state, 1, "f");
3131 New(1006, RExC_rx->data->data[n], 1,
3132 struct regnode_charclass_class);
3133 StructCopy(data.start_class,
3134 (struct regnode_charclass_class*)RExC_rx->data->data[n],
3135 struct regnode_charclass_class);
3136 r->regstclass = (regnode*)RExC_rx->data->data[n];
3137 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
3138 PL_regdata = r->data; /* for regprop() */
3139 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
3140 regprop(sv, (regnode*)data.start_class);
3141 PerlIO_printf(Perl_debug_log,
3142 "synthetic stclass `%s'.\n",
3146 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
3147 if (longest_fixed_length > longest_float_length) {
3148 r->check_substr = r->anchored_substr;
3149 r->check_utf8 = r->anchored_utf8;
3150 r->check_offset_min = r->check_offset_max = r->anchored_offset;
3151 if (r->reganch & ROPT_ANCH_SINGLE)
3152 r->reganch |= ROPT_NOSCAN;
3155 r->check_substr = r->float_substr;
3156 r->check_utf8 = r->float_utf8;
3157 r->check_offset_min = data.offset_float_min;
3158 r->check_offset_max = data.offset_float_max;
3160 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
3161 This should be changed ASAP! */
3162 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
3163 r->reganch |= RE_USE_INTUIT;
3164 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
3165 r->reganch |= RE_INTUIT_TAIL;
3169 /* Several toplevels. Best we can is to set minlen. */
3171 struct regnode_charclass_class ch_class;
3174 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
3175 scan = r->program + 1;
3176 cl_init(pRExC_state, &ch_class);
3177 data.start_class = &ch_class;
3178 data.last_closep = &last_close;
3179 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
3180 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
3181 = r->float_substr = r->float_utf8 = Nullsv;
3182 if (!(data.start_class->flags & ANYOF_EOS)
3183 && !cl_is_anything(data.start_class))
3185 I32 n = add_data(pRExC_state, 1, "f");
3187 New(1006, RExC_rx->data->data[n], 1,
3188 struct regnode_charclass_class);
3189 StructCopy(data.start_class,
3190 (struct regnode_charclass_class*)RExC_rx->data->data[n],
3191 struct regnode_charclass_class);
3192 r->regstclass = (regnode*)RExC_rx->data->data[n];
3193 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
3194 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
3195 regprop(sv, (regnode*)data.start_class);
3196 PerlIO_printf(Perl_debug_log,
3197 "synthetic stclass `%s'.\n",
3203 if (RExC_seen & REG_SEEN_GPOS)
3204 r->reganch |= ROPT_GPOS_SEEN;
3205 if (RExC_seen & REG_SEEN_LOOKBEHIND)
3206 r->reganch |= ROPT_LOOKBEHIND_SEEN;
3207 if (RExC_seen & REG_SEEN_EVAL)
3208 r->reganch |= ROPT_EVAL_SEEN;
3209 if (RExC_seen & REG_SEEN_CANY)
3210 r->reganch |= ROPT_CANY_SEEN;
3211 Newz(1002, r->startp, RExC_npar, I32);
3212 Newz(1002, r->endp, RExC_npar, I32);
3213 PL_regdata = r->data; /* for regprop() */
3214 DEBUG_COMPILE_r(regdump(r));
3219 - reg - regular expression, i.e. main body or parenthesized thing
3221 * Caller must absorb opening parenthesis.
3223 * Combining parenthesis handling with the base level of regular expression
3224 * is a trifle forced, but the need to tie the tails of the branches to what
3225 * follows makes it hard to avoid.
3228 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
3229 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
3232 register regnode *ret; /* Will be the head of the group. */
3233 register regnode *br;
3234 register regnode *lastbr;
3235 register regnode *ender = 0;
3236 register I32 parno = 0;
3237 I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0;
3239 /* for (?g), (?gc), and (?o) warnings; warning
3240 about (?c) will warn about (?g) -- japhy */
3242 I32 wastedflags = 0x00,
3245 wasted_gc = 0x02 | 0x04,
3248 char * parse_start = RExC_parse; /* MJD */
3249 char *oregcomp_parse = RExC_parse;
3252 *flagp = 0; /* Tentatively. */
3255 /* Make an OPEN node, if parenthesized. */
3257 if (*RExC_parse == '?') { /* (?...) */
3258 U32 posflags = 0, negflags = 0;
3259 U32 *flagsp = &posflags;
3261 char *seqstart = RExC_parse;
3264 paren = *RExC_parse++;
3265 ret = NULL; /* For look-ahead/behind. */
3267 case '<': /* (?<...) */
3268 RExC_seen |= REG_SEEN_LOOKBEHIND;
3269 if (*RExC_parse == '!')
3271 if (*RExC_parse != '=' && *RExC_parse != '!')
3274 case '=': /* (?=...) */
3275 case '!': /* (?!...) */
3276 RExC_seen_zerolen++;
3277 case ':': /* (?:...) */
3278 case '>': /* (?>...) */
3280 case '$': /* (?$...) */
3281 case '@': /* (?@...) */
3282 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
3284 case '#': /* (?#...) */
3285 while (*RExC_parse && *RExC_parse != ')')
3287 if (*RExC_parse != ')')
3288 FAIL("Sequence (?#... not terminated");
3289 nextchar(pRExC_state);
3292 case 'p': /* (?p...) */
3293 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
3294 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
3296 case '?': /* (??...) */
3298 if (*RExC_parse != '{')
3300 paren = *RExC_parse++;
3302 case '{': /* (?{...}) */
3304 I32 count = 1, n = 0;
3306 char *s = RExC_parse;
3308 OP_4tree *sop, *rop;
3310 RExC_seen_zerolen++;
3311 RExC_seen |= REG_SEEN_EVAL;
3312 while (count && (c = *RExC_parse)) {
3313 if (c == '\\' && RExC_parse[1])
3321 if (*RExC_parse != ')')
3324 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
3329 if (RExC_parse - 1 - s)
3330 sv = newSVpvn(s, RExC_parse - 1 - s);
3332 sv = newSVpvn("", 0);
3335 Perl_save_re_context(aTHX);
3336 rop = sv_compile_2op(sv, &sop, "re", &pad);
3337 sop->op_private |= OPpREFCOUNTED;
3338 /* re_dup will OpREFCNT_inc */
3339 OpREFCNT_set(sop, 1);
3342 n = add_data(pRExC_state, 3, "nop");
3343 RExC_rx->data->data[n] = (void*)rop;
3344 RExC_rx->data->data[n+1] = (void*)sop;
3345 RExC_rx->data->data[n+2] = (void*)pad;
3348 else { /* First pass */
3349 if (PL_reginterp_cnt < ++RExC_seen_evals
3351 /* No compiled RE interpolated, has runtime
3352 components ===> unsafe. */
3353 FAIL("Eval-group not allowed at runtime, use re 'eval'");
3354 if (PL_tainting && PL_tainted)
3355 FAIL("Eval-group in insecure regular expression");
3356 if (IN_PERL_COMPILETIME)
3360 nextchar(pRExC_state);
3362 ret = reg_node(pRExC_state, LOGICAL);
3365 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
3366 /* deal with the length of this later - MJD */
3369 ret = reganode(pRExC_state, EVAL, n);
3370 Set_Node_Length(ret, RExC_parse - parse_start + 1);
3371 Set_Node_Offset(ret, parse_start);
3374 case '(': /* (?(?{...})...) and (?(?=...)...) */
3376 if (RExC_parse[0] == '?') { /* (?(?...)) */
3377 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
3378 || RExC_parse[1] == '<'
3379 || RExC_parse[1] == '{') { /* Lookahead or eval. */
3382 ret = reg_node(pRExC_state, LOGICAL);
3385 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
3389 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
3391 parno = atoi(RExC_parse++);
3393 while (isDIGIT(*RExC_parse))
3395 ret = reganode(pRExC_state, GROUPP, parno);
3397 if ((c = *nextchar(pRExC_state)) != ')')
3398 vFAIL("Switch condition not recognized");
3400 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
3401 br = regbranch(pRExC_state, &flags, 1);
3403 br = reganode(pRExC_state, LONGJMP, 0);
3405 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
3406 c = *nextchar(pRExC_state);
3410 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3411 regbranch(pRExC_state, &flags, 1);
3412 regtail(pRExC_state, ret, lastbr);
3415 c = *nextchar(pRExC_state);
3420 vFAIL("Switch (?(condition)... contains too many branches");
3421 ender = reg_node(pRExC_state, TAIL);
3422 regtail(pRExC_state, br, ender);
3424 regtail(pRExC_state, lastbr, ender);
3425 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
3428 regtail(pRExC_state, ret, ender);
3432 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
3436 RExC_parse--; /* for vFAIL to print correctly */
3437 vFAIL("Sequence (? incomplete");
3441 parse_flags: /* (?i) */
3442 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
3443 /* (?g), (?gc) and (?o) are useless here
3444 and must be globally applied -- japhy */
3446 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
3447 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3448 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
3449 if (! (wastedflags & wflagbit) ) {
3450 wastedflags |= wflagbit;
3453 "Useless (%s%c) - %suse /%c modifier",
3454 flagsp == &negflags ? "?-" : "?",
3456 flagsp == &negflags ? "don't " : "",
3462 else if (*RExC_parse == 'c') {
3463 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3464 if (! (wastedflags & wasted_c) ) {
3465 wastedflags |= wasted_gc;
3468 "Useless (%sc) - %suse /gc modifier",
3469 flagsp == &negflags ? "?-" : "?",
3470 flagsp == &negflags ? "don't " : ""
3475 else { pmflag(flagsp, *RExC_parse); }
3479 if (*RExC_parse == '-') {
3481 wastedflags = 0; /* reset so (?g-c) warns twice */
3485 RExC_flags |= posflags;
3486 RExC_flags &= ~negflags;
3487 if (*RExC_parse == ':') {
3493 if (*RExC_parse != ')') {
3495 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
3497 nextchar(pRExC_state);
3505 ret = reganode(pRExC_state, OPEN, parno);
3506 Set_Node_Length(ret, 1); /* MJD */
3507 Set_Node_Offset(ret, RExC_parse); /* MJD */
3514 /* Pick up the branches, linking them together. */
3515 parse_start = RExC_parse; /* MJD */
3516 br = regbranch(pRExC_state, &flags, 1);
3517 /* branch_len = (paren != 0); */
3521 if (*RExC_parse == '|') {
3522 if (!SIZE_ONLY && RExC_extralen) {
3523 reginsert(pRExC_state, BRANCHJ, br);
3526 reginsert(pRExC_state, BRANCH, br);
3527 Set_Node_Length(br, paren != 0);
3528 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
3532 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
3534 else if (paren == ':') {
3535 *flagp |= flags&SIMPLE;
3537 if (open) { /* Starts with OPEN. */
3538 regtail(pRExC_state, ret, br); /* OPEN -> first. */
3540 else if (paren != '?') /* Not Conditional */
3542 *flagp |= flags & (SPSTART | HASWIDTH);
3544 while (*RExC_parse == '|') {
3545 if (!SIZE_ONLY && RExC_extralen) {
3546 ender = reganode(pRExC_state, LONGJMP,0);
3547 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
3550 RExC_extralen += 2; /* Account for LONGJMP. */
3551 nextchar(pRExC_state);
3552 br = regbranch(pRExC_state, &flags, 0);
3556 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
3560 *flagp |= flags&SPSTART;
3563 if (have_branch || paren != ':') {
3564 /* Make a closing node, and hook it on the end. */
3567 ender = reg_node(pRExC_state, TAIL);
3570 ender = reganode(pRExC_state, CLOSE, parno);
3571 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
3572 Set_Node_Length(ender,1); /* MJD */
3578 *flagp &= ~HASWIDTH;
3581 ender = reg_node(pRExC_state, SUCCEED);
3584 ender = reg_node(pRExC_state, END);
3587 regtail(pRExC_state, lastbr, ender);
3590 /* Hook the tails of the branches to the closing node. */
3591 for (br = ret; br != NULL; br = regnext(br)) {
3592 regoptail(pRExC_state, br, ender);
3599 static const char parens[] = "=!<,>";
3601 if (paren && (p = strchr(parens, paren))) {
3602 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
3603 int flag = (p - parens) > 1;
3606 node = SUSPEND, flag = 0;
3607 reginsert(pRExC_state, node,ret);
3608 Set_Node_Cur_Length(ret);
3609 Set_Node_Offset(ret, parse_start + 1);
3611 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
3615 /* Check for proper termination. */
3617 RExC_flags = oregflags;
3618 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
3619 RExC_parse = oregcomp_parse;
3620 vFAIL("Unmatched (");
3623 else if (!paren && RExC_parse < RExC_end) {
3624 if (*RExC_parse == ')') {
3626 vFAIL("Unmatched )");
3629 FAIL("Junk on end of regexp"); /* "Can't happen". */
3637 - regbranch - one alternative of an | operator
3639 * Implements the concatenation operator.
3642 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
3644 register regnode *ret;
3645 register regnode *chain = NULL;
3646 register regnode *latest;
3647 I32 flags = 0, c = 0;
3652 if (!SIZE_ONLY && RExC_extralen)
3653 ret = reganode(pRExC_state, BRANCHJ,0);
3655 ret = reg_node(pRExC_state, BRANCH);
3656 Set_Node_Length(ret, 1);
3660 if (!first && SIZE_ONLY)
3661 RExC_extralen += 1; /* BRANCHJ */
3663 *flagp = WORST; /* Tentatively. */
3666 nextchar(pRExC_state);
3667 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
3669 latest = regpiece(pRExC_state, &flags);
3670 if (latest == NULL) {
3671 if (flags & TRYAGAIN)
3675 else if (ret == NULL)
3677 *flagp |= flags&HASWIDTH;
3678 if (chain == NULL) /* First piece. */
3679 *flagp |= flags&SPSTART;
3682 regtail(pRExC_state, chain, latest);
3687 if (chain == NULL) { /* Loop ran zero times. */
3688 chain = reg_node(pRExC_state, NOTHING);
3693 *flagp |= flags&SIMPLE;
3700 - regpiece - something followed by possible [*+?]
3702 * Note that the branching code sequences used for ? and the general cases
3703 * of * and + are somewhat optimized: they use the same NOTHING node as
3704 * both the endmarker for their branch list and the body of the last branch.
3705 * It might seem that this node could be dispensed with entirely, but the
3706 * endmarker role is not redundant.
3709 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
3711 register regnode *ret;
3713 register char *next;
3715 char *origparse = RExC_parse;
3718 I32 max = REG_INFTY;
3721 ret = regatom(pRExC_state, &flags);
3723 if (flags & TRYAGAIN)
3730 if (op == '{' && regcurly(RExC_parse)) {
3731 parse_start = RExC_parse; /* MJD */
3732 next = RExC_parse + 1;
3734 while (isDIGIT(*next) || *next == ',') {
3743 if (*next == '}') { /* got one */
3747 min = atoi(RExC_parse);
3751 maxpos = RExC_parse;
3753 if (!max && *maxpos != '0')
3754 max = REG_INFTY; /* meaning "infinity" */
3755 else if (max >= REG_INFTY)
3756 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
3758 nextchar(pRExC_state);
3761 if ((flags&SIMPLE)) {
3762 RExC_naughty += 2 + RExC_naughty / 2;
3763 reginsert(pRExC_state, CURLY, ret);
3764 Set_Node_Offset(ret, parse_start+1); /* MJD */
3765 Set_Node_Cur_Length(ret);
3768 regnode *w = reg_node(pRExC_state, WHILEM);
3771 regtail(pRExC_state, ret, w);
3772 if (!SIZE_ONLY && RExC_extralen) {
3773 reginsert(pRExC_state, LONGJMP,ret);
3774 reginsert(pRExC_state, NOTHING,ret);
3775 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
3777 reginsert(pRExC_state, CURLYX,ret);
3779 Set_Node_Offset(ret, parse_start+1);
3780 Set_Node_Length(ret,
3781 op == '{' ? (RExC_parse - parse_start) : 1);
3783 if (!SIZE_ONLY && RExC_extralen)
3784 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3785 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
3787 RExC_whilem_seen++, RExC_extralen += 3;
3788 RExC_naughty += 4 + RExC_naughty; /* compound interest */
3796 if (max && max < min)
3797 vFAIL("Can't do {n,m} with n > m");
3799 ARG1_SET(ret, (U16)min);
3800 ARG2_SET(ret, (U16)max);
3812 #if 0 /* Now runtime fix should be reliable. */
3814 /* if this is reinstated, don't forget to put this back into perldiag:
3816 =item Regexp *+ operand could be empty at {#} in regex m/%s/
3818 (F) The part of the regexp subject to either the * or + quantifier
3819 could match an empty string. The {#} shows in the regular
3820 expression about where the problem was discovered.
3824 if (!(flags&HASWIDTH) && op != '?')
3825 vFAIL("Regexp *+ operand could be empty");
3828 parse_start = RExC_parse;
3829 nextchar(pRExC_state);
3831 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
3833 if (op == '*' && (flags&SIMPLE)) {
3834 reginsert(pRExC_state, STAR, ret);
3838 else if (op == '*') {
3842 else if (op == '+' && (flags&SIMPLE)) {
3843 reginsert(pRExC_state, PLUS, ret);
3847 else if (op == '+') {
3851 else if (op == '?') {
3856 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
3858 "%.*s matches null string many times",
3859 RExC_parse - origparse,
3863 if (*RExC_parse == '?') {
3864 nextchar(pRExC_state);
3865 reginsert(pRExC_state, MINMOD, ret);
3866 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
3868 if (ISMULT2(RExC_parse)) {
3870 vFAIL("Nested quantifiers");
3877 - regatom - the lowest level
3879 * Optimization: gobbles an entire sequence of ordinary characters so that
3880 * it can turn them into a single node, which is smaller to store and
3881 * faster to run. Backslashed characters are exceptions, each becoming a
3882 * separate node; the code is simpler that way and it's not worth fixing.
3884 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
3886 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
3888 register regnode *ret = 0;
3890 char *parse_start = RExC_parse;
3892 *flagp = WORST; /* Tentatively. */
3895 switch (*RExC_parse) {
3897 RExC_seen_zerolen++;
3898 nextchar(pRExC_state);
3899 if (RExC_flags & PMf_MULTILINE)
3900 ret = reg_node(pRExC_state, MBOL);
3901 else if (RExC_flags & PMf_SINGLELINE)
3902 ret = reg_node(pRExC_state, SBOL);
3904 ret = reg_node(pRExC_state, BOL);
3905 Set_Node_Length(ret, 1); /* MJD */
3908 nextchar(pRExC_state);
3910 RExC_seen_zerolen++;
3911 if (RExC_flags & PMf_MULTILINE)
3912 ret = reg_node(pRExC_state, MEOL);
3913 else if (RExC_flags & PMf_SINGLELINE)
3914 ret = reg_node(pRExC_state, SEOL);
3916 ret = reg_node(pRExC_state, EOL);
3917 Set_Node_Length(ret, 1); /* MJD */
3920 nextchar(pRExC_state);
3921 if (RExC_flags & PMf_SINGLELINE)
3922 ret = reg_node(pRExC_state, SANY);
3924 ret = reg_node(pRExC_state, REG_ANY);
3925 *flagp |= HASWIDTH|SIMPLE;
3927 Set_Node_Length(ret, 1); /* MJD */
3931 char *oregcomp_parse = ++RExC_parse;
3932 ret = regclass(pRExC_state);
3933 if (*RExC_parse != ']') {
3934 RExC_parse = oregcomp_parse;
3935 vFAIL("Unmatched [");
3937 nextchar(pRExC_state);
3938 *flagp |= HASWIDTH|SIMPLE;
3939 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
3943 nextchar(pRExC_state);
3944 ret = reg(pRExC_state, 1, &flags);
3946 if (flags & TRYAGAIN) {
3947 if (RExC_parse == RExC_end) {
3948 /* Make parent create an empty node if needed. */
3956 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
3960 if (flags & TRYAGAIN) {
3964 vFAIL("Internal urp");
3965 /* Supposed to be caught earlier. */
3968 if (!regcurly(RExC_parse)) {
3977 vFAIL("Quantifier follows nothing");
3980 switch (*++RExC_parse) {
3982 RExC_seen_zerolen++;
3983 ret = reg_node(pRExC_state, SBOL);
3985 nextchar(pRExC_state);
3986 Set_Node_Length(ret, 2); /* MJD */
3989 ret = reg_node(pRExC_state, GPOS);
3990 RExC_seen |= REG_SEEN_GPOS;
3992 nextchar(pRExC_state);
3993 Set_Node_Length(ret, 2); /* MJD */
3996 ret = reg_node(pRExC_state, SEOL);
3998 RExC_seen_zerolen++; /* Do not optimize RE away */
3999 nextchar(pRExC_state);
4002 ret = reg_node(pRExC_state, EOS);
4004 RExC_seen_zerolen++; /* Do not optimize RE away */
4005 nextchar(pRExC_state);
4006 Set_Node_Length(ret, 2); /* MJD */
4009 ret = reg_node(pRExC_state, CANY);
4010 RExC_seen |= REG_SEEN_CANY;
4011 *flagp |= HASWIDTH|SIMPLE;
4012 nextchar(pRExC_state);
4013 Set_Node_Length(ret, 2); /* MJD */
4016 ret = reg_node(pRExC_state, CLUMP);
4018 nextchar(pRExC_state);
4019 Set_Node_Length(ret, 2); /* MJD */
4022 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
4023 *flagp |= HASWIDTH|SIMPLE;
4024 nextchar(pRExC_state);
4025 Set_Node_Length(ret, 2); /* MJD */
4028 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
4029 *flagp |= HASWIDTH|SIMPLE;
4030 nextchar(pRExC_state);
4031 Set_Node_Length(ret, 2); /* MJD */
4034 RExC_seen_zerolen++;
4035 RExC_seen |= REG_SEEN_LOOKBEHIND;
4036 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
4038 nextchar(pRExC_state);
4039 Set_Node_Length(ret, 2); /* MJD */
4042 RExC_seen_zerolen++;
4043 RExC_seen |= REG_SEEN_LOOKBEHIND;
4044 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
4046 nextchar(pRExC_state);
4047 Set_Node_Length(ret, 2); /* MJD */
4050 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
4051 *flagp |= HASWIDTH|SIMPLE;
4052 nextchar(pRExC_state);
4053 Set_Node_Length(ret, 2); /* MJD */
4056 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
4057 *flagp |= HASWIDTH|SIMPLE;
4058 nextchar(pRExC_state);
4059 Set_Node_Length(ret, 2); /* MJD */
4062 ret = reg_node(pRExC_state, DIGIT);
4063 *flagp |= HASWIDTH|SIMPLE;
4064 nextchar(pRExC_state);
4065 Set_Node_Length(ret, 2); /* MJD */
4068 ret = reg_node(pRExC_state, NDIGIT);
4069 *flagp |= HASWIDTH|SIMPLE;
4070 nextchar(pRExC_state);
4071 Set_Node_Length(ret, 2); /* MJD */
4076 char* oldregxend = RExC_end;
4077 char* parse_start = RExC_parse - 2;
4079 if (RExC_parse[1] == '{') {
4080 /* a lovely hack--pretend we saw [\pX] instead */
4081 RExC_end = strchr(RExC_parse, '}');
4083 U8 c = (U8)*RExC_parse;
4085 RExC_end = oldregxend;
4086 vFAIL2("Missing right brace on \\%c{}", c);
4091 RExC_end = RExC_parse + 2;
4092 if (RExC_end > oldregxend)
4093 RExC_end = oldregxend;
4097 ret = regclass(pRExC_state);
4099 RExC_end = oldregxend;
4102 Set_Node_Offset(ret, parse_start + 2);
4103 Set_Node_Cur_Length(ret);
4104 nextchar(pRExC_state);
4105 *flagp |= HASWIDTH|SIMPLE;
4118 case '1': case '2': case '3': case '4':
4119 case '5': case '6': case '7': case '8': case '9':
4121 I32 num = atoi(RExC_parse);
4123 if (num > 9 && num >= RExC_npar)
4126 char * parse_start = RExC_parse - 1; /* MJD */
4127 while (isDIGIT(*RExC_parse))
4130 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
4131 vFAIL("Reference to nonexistent group");
4133 ret = reganode(pRExC_state,
4134 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
4138 /* override incorrect value set in reganode MJD */
4139 Set_Node_Offset(ret, parse_start+1);
4140 Set_Node_Cur_Length(ret); /* MJD */
4142 nextchar(pRExC_state);
4147 if (RExC_parse >= RExC_end)
4148 FAIL("Trailing \\");
4151 /* Do not generate `unrecognized' warnings here, we fall
4152 back into the quick-grab loop below */
4159 if (RExC_flags & PMf_EXTENDED) {
4160 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
4161 if (RExC_parse < RExC_end)
4167 register STRLEN len;
4173 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
4175 parse_start = RExC_parse - 1;
4181 ret = reg_node(pRExC_state,
4182 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
4184 for (len = 0, p = RExC_parse - 1;
4185 len < 127 && p < RExC_end;
4190 if (RExC_flags & PMf_EXTENDED)
4191 p = regwhite(p, RExC_end);
4238 ender = ASCII_TO_NATIVE('\033');
4242 ender = ASCII_TO_NATIVE('\007');
4247 char* e = strchr(p, '}');
4251 vFAIL("Missing right brace on \\x{}");
4254 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4255 | PERL_SCAN_DISALLOW_PREFIX;
4257 ender = grok_hex(p + 1, &numlen, &flags, NULL);
4264 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
4266 ender = grok_hex(p, &numlen, &flags, NULL);
4272 ender = UCHARAT(p++);
4273 ender = toCTRL(ender);
4275 case '0': case '1': case '2': case '3':case '4':
4276 case '5': case '6': case '7': case '8':case '9':
4278 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
4281 ender = grok_oct(p, &numlen, &flags, NULL);
4291 FAIL("Trailing \\");
4294 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
4295 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
4296 goto normal_default;
4301 if (UTF8_IS_START(*p) && UTF) {
4302 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
4310 if (RExC_flags & PMf_EXTENDED)
4311 p = regwhite(p, RExC_end);
4313 /* Prime the casefolded buffer. */
4314 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
4316 if (ISMULT2(p)) { /* Back off on ?+*. */
4323 /* Emit all the Unicode characters. */
4324 for (foldbuf = tmpbuf;
4326 foldlen -= numlen) {
4327 ender = utf8_to_uvchr(foldbuf, &numlen);
4329 reguni(pRExC_state, ender, s, &unilen);
4332 /* In EBCDIC the numlen
4333 * and unilen can differ. */
4335 if (numlen >= foldlen)
4339 break; /* "Can't happen." */
4343 reguni(pRExC_state, ender, s, &unilen);
4352 REGC((char)ender, s++);
4360 /* Emit all the Unicode characters. */
4361 for (foldbuf = tmpbuf;
4363 foldlen -= numlen) {
4364 ender = utf8_to_uvchr(foldbuf, &numlen);
4366 reguni(pRExC_state, ender, s, &unilen);
4369 /* In EBCDIC the numlen
4370 * and unilen can differ. */
4372 if (numlen >= foldlen)
4380 reguni(pRExC_state, ender, s, &unilen);
4389 REGC((char)ender, s++);
4393 Set_Node_Cur_Length(ret); /* MJD */
4394 nextchar(pRExC_state);
4396 /* len is STRLEN which is unsigned, need to copy to signed */
4399 vFAIL("Internal disaster");
4403 if (len == 1 && UNI_IS_INVARIANT(ender))
4408 RExC_size += STR_SZ(len);
4410 RExC_emit += STR_SZ(len);
4415 /* If the encoding pragma is in effect recode the text of
4416 * any EXACT-kind nodes. */
4417 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
4418 STRLEN oldlen = STR_LEN(ret);
4419 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
4423 if (sv_utf8_downgrade(sv, TRUE)) {
4424 char *s = sv_recode_to_utf8(sv, PL_encoding);
4425 STRLEN newlen = SvCUR(sv);
4430 GET_RE_DEBUG_FLAGS_DECL;
4431 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
4432 (int)oldlen, STRING(ret),
4434 Copy(s, STRING(ret), newlen, char);
4435 STR_LEN(ret) += newlen - oldlen;
4436 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
4438 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
4446 S_regwhite(pTHX_ char *p, char *e)
4451 else if (*p == '#') {
4454 } while (p < e && *p != '\n');
4462 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
4463 Character classes ([:foo:]) can also be negated ([:^foo:]).
4464 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
4465 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
4466 but trigger failures because they are currently unimplemented. */
4468 #define POSIXCC_DONE(c) ((c) == ':')
4469 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
4470 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
4473 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
4476 I32 namedclass = OOB_NAMEDCLASS;
4478 if (value == '[' && RExC_parse + 1 < RExC_end &&
4479 /* I smell either [: or [= or [. -- POSIX has been here, right? */
4480 POSIXCC(UCHARAT(RExC_parse))) {
4481 char c = UCHARAT(RExC_parse);
4482 char* s = RExC_parse++;
4484 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
4486 if (RExC_parse == RExC_end)
4487 /* Grandfather lone [:, [=, [. */
4490 char* t = RExC_parse++; /* skip over the c */
4494 if (UCHARAT(RExC_parse) == ']') {
4495 RExC_parse++; /* skip over the ending ] */
4498 I32 complement = *posixcc == '^' ? *posixcc++ : 0;
4499 I32 skip = t - posixcc;
4501 /* Initially switch on the length of the name. */
4504 if (memEQ(posixcc, "word", 4)) {
4505 /* this is not POSIX, this is the Perl \w */;
4507 = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
4511 /* Names all of length 5. */
4512 /* alnum alpha ascii blank cntrl digit graph lower
4513 print punct space upper */
4514 /* Offset 4 gives the best switch position. */
4515 switch (posixcc[4]) {
4517 if (memEQ(posixcc, "alph", 4)) {
4520 = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
4524 if (memEQ(posixcc, "spac", 4)) {
4527 = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
4531 if (memEQ(posixcc, "grap", 4)) {
4534 = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
4538 if (memEQ(posixcc, "asci", 4)) {
4541 = complement ? ANYOF_NASCII : ANYOF_ASCII;
4545 if (memEQ(posixcc, "blan", 4)) {
4548 = complement ? ANYOF_NBLANK : ANYOF_BLANK;
4552 if (memEQ(posixcc, "cntr", 4)) {
4555 = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
4559 if (memEQ(posixcc, "alnu", 4)) {
4562 = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
4566 if (memEQ(posixcc, "lowe", 4)) {
4569 = complement ? ANYOF_NLOWER : ANYOF_LOWER;
4571 if (memEQ(posixcc, "uppe", 4)) {
4574 = complement ? ANYOF_NUPPER : ANYOF_UPPER;
4578 if (memEQ(posixcc, "digi", 4)) {
4581 = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
4583 if (memEQ(posixcc, "prin", 4)) {
4586 = complement ? ANYOF_NPRINT : ANYOF_PRINT;
4588 if (memEQ(posixcc, "punc", 4)) {
4591 = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
4597 if (memEQ(posixcc, "xdigit", 6)) {
4599 = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
4604 if (namedclass == OOB_NAMEDCLASS)
4606 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
4609 assert (posixcc[skip] == ':');
4610 assert (posixcc[skip+1] == ']');
4611 } else if (!SIZE_ONLY) {
4612 /* [[=foo=]] and [[.foo.]] are still future. */
4614 /* adjust RExC_parse so the warning shows after
4616 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
4618 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
4621 /* Maternal grandfather:
4622 * "[:" ending in ":" but not in ":]" */
4632 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
4634 if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
4635 char *s = RExC_parse;
4638 while(*s && isALNUM(*s))
4640 if (*s && c == *s && s[1] == ']') {
4641 if (ckWARN(WARN_REGEXP))
4643 "POSIX syntax [%c %c] belongs inside character classes",
4646 /* [[=foo=]] and [[.foo.]] are still future. */
4647 if (POSIXCC_NOTYET(c)) {
4648 /* adjust RExC_parse so the error shows after
4650 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
4652 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
4659 S_regclass(pTHX_ RExC_state_t *pRExC_state)
4662 register UV nextvalue;
4663 register IV prevvalue = OOB_UNICODE;
4664 register IV range = 0;
4665 register regnode *ret;
4668 char *rangebegin = 0;
4669 bool need_class = 0;
4670 SV *listsv = Nullsv;
4673 bool optimize_invert = TRUE;
4674 AV* unicode_alternate = 0;
4676 UV literal_endpoint = 0;
4679 ret = reganode(pRExC_state, ANYOF, 0);
4682 ANYOF_FLAGS(ret) = 0;
4684 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
4688 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
4692 RExC_size += ANYOF_SKIP;
4694 RExC_emit += ANYOF_SKIP;
4696 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
4698 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
4699 ANYOF_BITMAP_ZERO(ret);
4700 listsv = newSVpvn("# comment\n", 10);
4703 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
4705 if (!SIZE_ONLY && POSIXCC(nextvalue))
4706 checkposixcc(pRExC_state);
4708 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
4709 if (UCHARAT(RExC_parse) == ']')
4712 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
4716 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
4719 rangebegin = RExC_parse;
4721 value = utf8n_to_uvchr((U8*)RExC_parse,
4722 RExC_end - RExC_parse,
4724 RExC_parse += numlen;
4727 value = UCHARAT(RExC_parse++);
4728 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
4729 if (value == '[' && POSIXCC(nextvalue))
4730 namedclass = regpposixcc(pRExC_state, value);
4731 else if (value == '\\') {
4733 value = utf8n_to_uvchr((U8*)RExC_parse,
4734 RExC_end - RExC_parse,
4736 RExC_parse += numlen;
4739 value = UCHARAT(RExC_parse++);
4740 /* Some compilers cannot handle switching on 64-bit integer
4741 * values, therefore value cannot be an UV. Yes, this will
4742 * be a problem later if we want switch on Unicode.
4743 * A similar issue a little bit later when switching on
4744 * namedclass. --jhi */
4745 switch ((I32)value) {
4746 case 'w': namedclass = ANYOF_ALNUM; break;
4747 case 'W': namedclass = ANYOF_NALNUM; break;
4748 case 's': namedclass = ANYOF_SPACE; break;
4749 case 'S': namedclass = ANYOF_NSPACE; break;
4750 case 'd': namedclass = ANYOF_DIGIT; break;
4751 case 'D': namedclass = ANYOF_NDIGIT; break;
4754 if (RExC_parse >= RExC_end)
4755 vFAIL2("Empty \\%c{}", (U8)value);
4756 if (*RExC_parse == '{') {
4758 e = strchr(RExC_parse++, '}');
4760 vFAIL2("Missing right brace on \\%c{}", c);
4761 while (isSPACE(UCHARAT(RExC_parse)))
4763 if (e == RExC_parse)
4764 vFAIL2("Empty \\%c{}", c);
4766 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
4774 if (UCHARAT(RExC_parse) == '^') {
4777 value = value == 'p' ? 'P' : 'p'; /* toggle */
4778 while (isSPACE(UCHARAT(RExC_parse))) {
4784 Perl_sv_catpvf(aTHX_ listsv,
4785 "+utf8::%.*s\n", (int)n, RExC_parse);
4787 Perl_sv_catpvf(aTHX_ listsv,
4788 "!utf8::%.*s\n", (int)n, RExC_parse);
4791 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4792 namedclass = ANYOF_MAX; /* no official name, but it's named */
4794 case 'n': value = '\n'; break;
4795 case 'r': value = '\r'; break;
4796 case 't': value = '\t'; break;
4797 case 'f': value = '\f'; break;
4798 case 'b': value = '\b'; break;
4799 case 'e': value = ASCII_TO_NATIVE('\033');break;
4800 case 'a': value = ASCII_TO_NATIVE('\007');break;
4802 if (*RExC_parse == '{') {
4803 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4804 | PERL_SCAN_DISALLOW_PREFIX;
4805 e = strchr(RExC_parse++, '}');
4807 vFAIL("Missing right brace on \\x{}");
4809 numlen = e - RExC_parse;
4810 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
4814 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
4816 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
4817 RExC_parse += numlen;
4821 value = UCHARAT(RExC_parse++);
4822 value = toCTRL(value);
4824 case '0': case '1': case '2': case '3': case '4':
4825 case '5': case '6': case '7': case '8': case '9':
4829 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
4830 RExC_parse += numlen;
4834 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
4836 "Unrecognized escape \\%c in character class passed through",
4840 } /* end of \blah */
4846 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
4848 if (!SIZE_ONLY && !need_class)
4849 ANYOF_CLASS_ZERO(ret);
4853 /* a bad range like a-\d, a-[:digit:] ? */
4856 if (ckWARN(WARN_REGEXP))
4858 "False [] range \"%*.*s\"",
4859 RExC_parse - rangebegin,
4860 RExC_parse - rangebegin,
4862 if (prevvalue < 256) {
4863 ANYOF_BITMAP_SET(ret, prevvalue);
4864 ANYOF_BITMAP_SET(ret, '-');
4867 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4868 Perl_sv_catpvf(aTHX_ listsv,
4869 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
4873 range = 0; /* this was not a true range */
4877 const char *what = NULL;
4880 if (namedclass > OOB_NAMEDCLASS)
4881 optimize_invert = FALSE;
4882 /* Possible truncation here but in some 64-bit environments
4883 * the compiler gets heartburn about switch on 64-bit values.
4884 * A similar issue a little earlier when switching on value.
4886 switch ((I32)namedclass) {
4889 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
4891 for (value = 0; value < 256; value++)
4893 ANYOF_BITMAP_SET(ret, value);
4900 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
4902 for (value = 0; value < 256; value++)
4903 if (!isALNUM(value))
4904 ANYOF_BITMAP_SET(ret, value);
4911 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
4913 for (value = 0; value < 256; value++)
4914 if (isALNUMC(value))
4915 ANYOF_BITMAP_SET(ret, value);
4922 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
4924 for (value = 0; value < 256; value++)
4925 if (!isALNUMC(value))
4926 ANYOF_BITMAP_SET(ret, value);
4933 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
4935 for (value = 0; value < 256; value++)
4937 ANYOF_BITMAP_SET(ret, value);
4944 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
4946 for (value = 0; value < 256; value++)
4947 if (!isALPHA(value))
4948 ANYOF_BITMAP_SET(ret, value);
4955 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
4958 for (value = 0; value < 128; value++)
4959 ANYOF_BITMAP_SET(ret, value);
4961 for (value = 0; value < 256; value++) {
4963 ANYOF_BITMAP_SET(ret, value);
4972 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
4975 for (value = 128; value < 256; value++)
4976 ANYOF_BITMAP_SET(ret, value);
4978 for (value = 0; value < 256; value++) {
4979 if (!isASCII(value))
4980 ANYOF_BITMAP_SET(ret, value);
4989 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
4991 for (value = 0; value < 256; value++)
4993 ANYOF_BITMAP_SET(ret, value);
5000 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
5002 for (value = 0; value < 256; value++)
5003 if (!isBLANK(value))
5004 ANYOF_BITMAP_SET(ret, value);
5011 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
5013 for (value = 0; value < 256; value++)
5015 ANYOF_BITMAP_SET(ret, value);
5022 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
5024 for (value = 0; value < 256; value++)
5025 if (!isCNTRL(value))
5026 ANYOF_BITMAP_SET(ret, value);
5033 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
5035 /* consecutive digits assumed */
5036 for (value = '0'; value <= '9'; value++)
5037 ANYOF_BITMAP_SET(ret, value);
5044 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
5046 /* consecutive digits assumed */
5047 for (value = 0; value < '0'; value++)
5048 ANYOF_BITMAP_SET(ret, value);
5049 for (value = '9' + 1; value < 256; value++)
5050 ANYOF_BITMAP_SET(ret, value);
5057 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
5059 for (value = 0; value < 256; value++)
5061 ANYOF_BITMAP_SET(ret, value);
5068 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
5070 for (value = 0; value < 256; value++)
5071 if (!isGRAPH(value))
5072 ANYOF_BITMAP_SET(ret, value);
5079 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
5081 for (value = 0; value < 256; value++)
5083 ANYOF_BITMAP_SET(ret, value);
5090 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
5092 for (value = 0; value < 256; value++)
5093 if (!isLOWER(value))
5094 ANYOF_BITMAP_SET(ret, value);
5101 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
5103 for (value = 0; value < 256; value++)
5105 ANYOF_BITMAP_SET(ret, value);
5112 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
5114 for (value = 0; value < 256; value++)
5115 if (!isPRINT(value))
5116 ANYOF_BITMAP_SET(ret, value);
5123 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
5125 for (value = 0; value < 256; value++)
5126 if (isPSXSPC(value))
5127 ANYOF_BITMAP_SET(ret, value);
5134 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
5136 for (value = 0; value < 256; value++)
5137 if (!isPSXSPC(value))
5138 ANYOF_BITMAP_SET(ret, value);
5145 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
5147 for (value = 0; value < 256; value++)
5149 ANYOF_BITMAP_SET(ret, value);
5156 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
5158 for (value = 0; value < 256; value++)
5159 if (!isPUNCT(value))
5160 ANYOF_BITMAP_SET(ret, value);
5167 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
5169 for (value = 0; value < 256; value++)
5171 ANYOF_BITMAP_SET(ret, value);
5178 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
5180 for (value = 0; value < 256; value++)
5181 if (!isSPACE(value))
5182 ANYOF_BITMAP_SET(ret, value);
5189 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
5191 for (value = 0; value < 256; value++)
5193 ANYOF_BITMAP_SET(ret, value);
5200 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
5202 for (value = 0; value < 256; value++)
5203 if (!isUPPER(value))
5204 ANYOF_BITMAP_SET(ret, value);
5211 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
5213 for (value = 0; value < 256; value++)
5214 if (isXDIGIT(value))
5215 ANYOF_BITMAP_SET(ret, value);
5222 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
5224 for (value = 0; value < 256; value++)
5225 if (!isXDIGIT(value))
5226 ANYOF_BITMAP_SET(ret, value);
5232 /* this is to handle \p and \P */
5235 vFAIL("Invalid [::] class");
5239 /* Strings such as "+utf8::isWord\n" */
5240 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
5243 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
5246 } /* end of namedclass \blah */
5249 if (prevvalue > (IV)value) /* b-a */ {
5250 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
5251 RExC_parse - rangebegin,
5252 RExC_parse - rangebegin,
5254 range = 0; /* not a valid range */
5258 prevvalue = value; /* save the beginning of the range */
5259 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
5260 RExC_parse[1] != ']') {
5263 /* a bad range like \w-, [:word:]- ? */
5264 if (namedclass > OOB_NAMEDCLASS) {
5265 if (ckWARN(WARN_REGEXP))
5267 "False [] range \"%*.*s\"",
5268 RExC_parse - rangebegin,
5269 RExC_parse - rangebegin,
5272 ANYOF_BITMAP_SET(ret, '-');
5274 range = 1; /* yeah, it's a range! */
5275 continue; /* but do it the next time */
5279 /* now is the next time */
5283 if (prevvalue < 256) {
5284 IV ceilvalue = value < 256 ? value : 255;
5287 /* In EBCDIC [\x89-\x91] should include
5288 * the \x8e but [i-j] should not. */
5289 if (literal_endpoint == 2 &&
5290 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
5291 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
5293 if (isLOWER(prevvalue)) {
5294 for (i = prevvalue; i <= ceilvalue; i++)
5296 ANYOF_BITMAP_SET(ret, i);
5298 for (i = prevvalue; i <= ceilvalue; i++)
5300 ANYOF_BITMAP_SET(ret, i);
5305 for (i = prevvalue; i <= ceilvalue; i++)
5306 ANYOF_BITMAP_SET(ret, i);
5308 if (value > 255 || UTF) {
5309 UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
5310 UV natvalue = NATIVE_TO_UNI(value);
5312 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
5313 if (prevnatvalue < natvalue) { /* what about > ? */
5314 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
5315 prevnatvalue, natvalue);
5317 else if (prevnatvalue == natvalue) {
5318 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
5320 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
5322 UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
5324 /* If folding and foldable and a single
5325 * character, insert also the folded version
5326 * to the charclass. */
5328 if (foldlen == (STRLEN)UNISKIP(f))
5329 Perl_sv_catpvf(aTHX_ listsv,
5332 /* Any multicharacter foldings
5333 * require the following transform:
5334 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
5335 * where E folds into "pq" and F folds
5336 * into "rst", all other characters
5337 * fold to single characters. We save
5338 * away these multicharacter foldings,
5339 * to be later saved as part of the
5340 * additional "s" data. */
5343 if (!unicode_alternate)
5344 unicode_alternate = newAV();
5345 sv = newSVpvn((char*)foldbuf, foldlen);
5347 av_push(unicode_alternate, sv);
5351 /* If folding and the value is one of the Greek
5352 * sigmas insert a few more sigmas to make the
5353 * folding rules of the sigmas to work right.
5354 * Note that not all the possible combinations
5355 * are handled here: some of them are handled
5356 * by the standard folding rules, and some of
5357 * them (literal or EXACTF cases) are handled
5358 * during runtime in regexec.c:S_find_byclass(). */
5359 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
5360 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5361 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
5362 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5363 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
5365 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
5366 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5367 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
5372 literal_endpoint = 0;
5376 range = 0; /* this range (if it was one) is done now */
5380 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
5382 RExC_size += ANYOF_CLASS_ADD_SKIP;
5384 RExC_emit += ANYOF_CLASS_ADD_SKIP;
5387 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
5389 /* If the only flag is folding (plus possibly inversion). */
5390 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
5392 for (value = 0; value < 256; ++value) {
5393 if (ANYOF_BITMAP_TEST(ret, value)) {
5394 UV fold = PL_fold[value];
5397 ANYOF_BITMAP_SET(ret, fold);
5400 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
5403 /* optimize inverted simple patterns (e.g. [^a-z]) */
5404 if (!SIZE_ONLY && optimize_invert &&
5405 /* If the only flag is inversion. */
5406 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
5407 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
5408 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
5409 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
5416 /* The 0th element stores the character class description
5417 * in its textual form: used later (regexec.c:Perl_regclass_swash())
5418 * to initialize the appropriate swash (which gets stored in
5419 * the 1st element), and also useful for dumping the regnode.
5420 * The 2nd element stores the multicharacter foldings,
5421 * used later (regexec.c:S_reginclass()). */
5422 av_store(av, 0, listsv);
5423 av_store(av, 1, NULL);
5424 av_store(av, 2, (SV*)unicode_alternate);
5425 rv = newRV_noinc((SV*)av);
5426 n = add_data(pRExC_state, 1, "s");
5427 RExC_rx->data->data[n] = (void*)rv;
5435 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
5437 char* retval = RExC_parse++;
5440 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
5441 RExC_parse[2] == '#') {
5442 while (*RExC_parse != ')') {
5443 if (RExC_parse == RExC_end)
5444 FAIL("Sequence (?#... not terminated");
5450 if (RExC_flags & PMf_EXTENDED) {
5451 if (isSPACE(*RExC_parse)) {
5455 else if (*RExC_parse == '#') {
5456 while (RExC_parse < RExC_end)
5457 if (*RExC_parse++ == '\n') break;
5466 - reg_node - emit a node
5468 STATIC regnode * /* Location. */
5469 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
5471 register regnode *ret;
5472 register regnode *ptr;
5476 SIZE_ALIGN(RExC_size);
5481 NODE_ALIGN_FILL(ret);
5483 FILL_ADVANCE_NODE(ptr, op);
5484 if (RExC_offsets) { /* MJD */
5485 MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
5486 "reg_node", __LINE__,
5488 RExC_emit - RExC_emit_start > RExC_offsets[0]
5489 ? "Overwriting end of array!\n" : "OK",
5490 RExC_emit - RExC_emit_start,
5491 RExC_parse - RExC_start,
5493 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
5502 - reganode - emit a node with an argument
5504 STATIC regnode * /* Location. */
5505 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
5507 register regnode *ret;
5508 register regnode *ptr;
5512 SIZE_ALIGN(RExC_size);
5517 NODE_ALIGN_FILL(ret);
5519 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
5520 if (RExC_offsets) { /* MJD */
5521 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
5525 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
5526 "Overwriting end of array!\n" : "OK",
5527 RExC_emit - RExC_emit_start,
5528 RExC_parse - RExC_start,
5530 Set_Cur_Node_Offset;
5539 - reguni - emit (if appropriate) a Unicode character
5542 S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
5544 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
5548 - reginsert - insert an operator in front of already-emitted operand
5550 * Means relocating the operand.
5553 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
5555 register regnode *src;
5556 register regnode *dst;
5557 register regnode *place;
5558 register int offset = regarglen[(U8)op];
5560 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
5563 RExC_size += NODE_STEP_REGNODE + offset;
5568 RExC_emit += NODE_STEP_REGNODE + offset;
5570 while (src > opnd) {
5571 StructCopy(--src, --dst, regnode);
5572 if (RExC_offsets) { /* MJD 20010112 */
5573 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
5577 dst - RExC_emit_start > RExC_offsets[0]
5578 ? "Overwriting end of array!\n" : "OK",
5579 src - RExC_emit_start,
5580 dst - RExC_emit_start,
5582 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
5583 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
5588 place = opnd; /* Op node, where operand used to be. */
5589 if (RExC_offsets) { /* MJD */
5590 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
5594 place - RExC_emit_start > RExC_offsets[0]
5595 ? "Overwriting end of array!\n" : "OK",
5596 place - RExC_emit_start,
5597 RExC_parse - RExC_start,
5599 Set_Node_Offset(place, RExC_parse);
5600 Set_Node_Length(place, 1);
5602 src = NEXTOPER(place);
5603 FILL_ADVANCE_NODE(place, op);
5604 Zero(src, offset, regnode);
5608 - regtail - set the next-pointer at the end of a node chain of p to val.
5611 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
5613 register regnode *scan;
5614 register regnode *temp;
5619 /* Find last node. */
5622 temp = regnext(scan);
5628 if (reg_off_by_arg[OP(scan)]) {
5629 ARG_SET(scan, val - scan);
5632 NEXT_OFF(scan) = val - scan;
5637 - regoptail - regtail on operand of first argument; nop if operandless
5640 S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
5642 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
5643 if (p == NULL || SIZE_ONLY)
5645 if (PL_regkind[(U8)OP(p)] == BRANCH) {
5646 regtail(pRExC_state, NEXTOPER(p), val);
5648 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
5649 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
5656 - regcurly - a little FSA that accepts {\d+,?\d*}
5659 S_regcurly(pTHX_ register const char *s)
5680 S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
5682 register U8 op = EXACT; /* Arbitrary non-END op. */
5683 register regnode *next;
5685 while (op != END && (!last || node < last)) {
5686 /* While that wasn't END last time... */
5692 next = regnext(node);
5694 if (OP(node) == OPTIMIZED)
5697 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
5698 (int)(2*l + 1), "", SvPVX(sv));
5699 if (next == NULL) /* Next ptr. */
5700 PerlIO_printf(Perl_debug_log, "(0)");
5702 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
5703 (void)PerlIO_putc(Perl_debug_log, '\n');
5705 if (PL_regkind[(U8)op] == BRANCHJ) {
5706 register regnode *nnode = (OP(next) == LONGJMP
5709 if (last && nnode > last)
5711 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
5713 else if (PL_regkind[(U8)op] == BRANCH) {
5714 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
5716 else if ( PL_regkind[(U8)op] == TRIE ) {
5717 const I32 n = ARG(node);
5718 const reg_trie_data *trie = (reg_trie_data*)PL_regdata->data[n];
5719 const I32 arry_len = av_len(trie->words)+1;
5721 PerlIO_printf(Perl_debug_log,
5722 "%*s[Words:%d Chars Stored:%d Unique Chars:%d States:%"IVdf"%s]\n",
5726 trie->uniquecharcount,
5727 (IV)trie->laststate-1,
5728 node->flags ? " EVAL mode" : "");
5730 for (word_idx=0; word_idx < arry_len; word_idx++) {
5731 SV **elem_ptr=av_fetch(trie->words,word_idx,0);
5733 PerlIO_printf(Perl_debug_log, "%*s<%s%s%s>\n",
5736 SvPV_nolen(*elem_ptr),
5741 PerlIO_printf(Perl_debug_log, "(0)\n");
5743 PerlIO_printf(Perl_debug_log, "(%"IVdf")\n", (IV)(next - start));
5749 node = NEXTOPER(node);
5750 node += regarglen[(U8)op];
5753 else if ( op == CURLY) { /* `next' might be very big: optimizer */
5754 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
5755 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
5757 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
5758 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
5761 else if ( op == PLUS || op == STAR) {
5762 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
5764 else if (op == ANYOF) {
5765 /* arglen 1 + class block */
5766 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
5767 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
5768 node = NEXTOPER(node);
5770 else if (PL_regkind[(U8)op] == EXACT) {
5771 /* Literal string, where present. */
5772 node += NODE_SZ_STR(node) - 1;
5773 node = NEXTOPER(node);
5776 node = NEXTOPER(node);
5777 node += regarglen[(U8)op];
5779 if (op == CURLYX || op == OPEN)
5781 else if (op == WHILEM)
5787 #endif /* DEBUGGING */
5790 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
5793 Perl_regdump(pTHX_ regexp *r)
5796 SV *sv = sv_newmortal();
5798 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
5800 /* Header fields of interest. */
5801 if (r->anchored_substr)
5802 PerlIO_printf(Perl_debug_log,
5803 "anchored `%s%.*s%s'%s at %"IVdf" ",
5805 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
5806 SvPVX(r->anchored_substr),
5808 SvTAIL(r->anchored_substr) ? "$" : "",
5809 (IV)r->anchored_offset);
5810 else if (r->anchored_utf8)
5811 PerlIO_printf(Perl_debug_log,
5812 "anchored utf8 `%s%.*s%s'%s at %"IVdf" ",
5814 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
5815 SvPVX(r->anchored_utf8),
5817 SvTAIL(r->anchored_utf8) ? "$" : "",
5818 (IV)r->anchored_offset);
5819 if (r->float_substr)
5820 PerlIO_printf(Perl_debug_log,
5821 "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
5823 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
5824 SvPVX(r->float_substr),
5826 SvTAIL(r->float_substr) ? "$" : "",
5827 (IV)r->float_min_offset, (UV)r->float_max_offset);
5828 else if (r->float_utf8)
5829 PerlIO_printf(Perl_debug_log,
5830 "floating utf8 `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
5832 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
5833 SvPVX(r->float_utf8),
5835 SvTAIL(r->float_utf8) ? "$" : "",
5836 (IV)r->float_min_offset, (UV)r->float_max_offset);
5837 if (r->check_substr || r->check_utf8)
5838 PerlIO_printf(Perl_debug_log,
5839 r->check_substr == r->float_substr
5840 && r->check_utf8 == r->float_utf8
5841 ? "(checking floating" : "(checking anchored");
5842 if (r->reganch & ROPT_NOSCAN)
5843 PerlIO_printf(Perl_debug_log, " noscan");
5844 if (r->reganch & ROPT_CHECK_ALL)
5845 PerlIO_printf(Perl_debug_log, " isall");
5846 if (r->check_substr || r->check_utf8)
5847 PerlIO_printf(Perl_debug_log, ") ");
5849 if (r->regstclass) {
5850 regprop(sv, r->regstclass);
5851 PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
5853 if (r->reganch & ROPT_ANCH) {
5854 PerlIO_printf(Perl_debug_log, "anchored");
5855 if (r->reganch & ROPT_ANCH_BOL)
5856 PerlIO_printf(Perl_debug_log, "(BOL)");
5857 if (r->reganch & ROPT_ANCH_MBOL)
5858 PerlIO_printf(Perl_debug_log, "(MBOL)");
5859 if (r->reganch & ROPT_ANCH_SBOL)
5860 PerlIO_printf(Perl_debug_log, "(SBOL)");
5861 if (r->reganch & ROPT_ANCH_GPOS)
5862 PerlIO_printf(Perl_debug_log, "(GPOS)");
5863 PerlIO_putc(Perl_debug_log, ' ');
5865 if (r->reganch & ROPT_GPOS_SEEN)
5866 PerlIO_printf(Perl_debug_log, "GPOS ");
5867 if (r->reganch & ROPT_SKIP)
5868 PerlIO_printf(Perl_debug_log, "plus ");
5869 if (r->reganch & ROPT_IMPLICIT)
5870 PerlIO_printf(Perl_debug_log, "implicit ");
5871 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
5872 if (r->reganch & ROPT_EVAL_SEEN)
5873 PerlIO_printf(Perl_debug_log, "with eval ");
5874 PerlIO_printf(Perl_debug_log, "\n");
5877 const U32 len = r->offsets[0];
5878 GET_RE_DEBUG_FLAGS_DECL;
5880 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
5881 for (i = 1; i <= len; i++)
5882 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
5883 (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
5884 PerlIO_printf(Perl_debug_log, "\n");
5887 #endif /* DEBUGGING */
5893 S_put_byte(pTHX_ SV *sv, int c)
5895 if (isCNTRL(c) || c == 255 || !isPRINT(c))
5896 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
5897 else if (c == '-' || c == ']' || c == '\\' || c == '^')
5898 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
5900 Perl_sv_catpvf(aTHX_ sv, "%c", c);
5903 #endif /* DEBUGGING */
5907 - regprop - printable representation of opcode
5910 Perl_regprop(pTHX_ SV *sv, const regnode *o)
5915 sv_setpvn(sv, "", 0);
5916 if (OP(o) >= reg_num) /* regnode.type is unsigned */
5917 /* It would be nice to FAIL() here, but this may be called from
5918 regexec.c, and it would be hard to supply pRExC_state. */
5919 Perl_croak(aTHX_ "Corrupted regexp opcode");
5920 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
5922 k = PL_regkind[(U8)OP(o)];
5925 SV *dsv = sv_2mortal(newSVpvn("", 0));
5926 /* Using is_utf8_string() is a crude hack but it may
5927 * be the best for now since we have no flag "this EXACTish
5928 * node was UTF-8" --jhi */
5929 bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
5931 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
5932 UNI_DISPLAY_REGEX) :
5934 const int len = do_utf8 ?
5937 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
5941 } else if (k == TRIE) {/*
5942 this isn't always safe, as Pl_regdata may not be for this regex yet
5943 (depending on where its called from) so its being moved to dumpuntil
5945 reg_trie_data *trie=(reg_trie_data*)PL_regdata->data[n];
5946 Perl_sv_catpvf(aTHX_ sv, " (W:%d L:%d C:%d S:%d)",
5949 trie->uniquecharcount,
5952 } else if (k == CURLY) {
5953 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
5954 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
5955 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
5957 else if (k == WHILEM && o->flags) /* Ordinal/of */
5958 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
5959 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
5960 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
5961 else if (k == LOGICAL)
5962 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
5963 else if (k == ANYOF) {
5964 int i, rangestart = -1;
5965 U8 flags = ANYOF_FLAGS(o);
5966 const char * const anyofs[] = { /* Should be synchronized with
5967 * ANYOF_ #xdefines in regcomp.h */
6000 if (flags & ANYOF_LOCALE)
6001 sv_catpv(sv, "{loc}");
6002 if (flags & ANYOF_FOLD)
6003 sv_catpv(sv, "{i}");
6004 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
6005 if (flags & ANYOF_INVERT)
6007 for (i = 0; i <= 256; i++) {
6008 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
6009 if (rangestart == -1)
6011 } else if (rangestart != -1) {
6012 if (i <= rangestart + 3)
6013 for (; rangestart < i; rangestart++)
6014 put_byte(sv, rangestart);
6016 put_byte(sv, rangestart);
6018 put_byte(sv, i - 1);
6024 if (o->flags & ANYOF_CLASS)
6025 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
6026 if (ANYOF_CLASS_TEST(o,i))
6027 sv_catpv(sv, anyofs[i]);
6029 if (flags & ANYOF_UNICODE)
6030 sv_catpv(sv, "{unicode}");
6031 else if (flags & ANYOF_UNICODE_ALL)
6032 sv_catpv(sv, "{unicode_all}");
6036 SV *sw = regclass_swash(o, FALSE, &lv, 0);
6040 U8 s[UTF8_MAXBYTES_CASE+1];
6042 for (i = 0; i <= 256; i++) { /* just the first 256 */
6043 U8 *e = uvchr_to_utf8(s, i);
6045 if (i < 256 && swash_fetch(sw, s, TRUE)) {
6046 if (rangestart == -1)
6048 } else if (rangestart != -1) {
6051 if (i <= rangestart + 3)
6052 for (; rangestart < i; rangestart++) {
6053 for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
6057 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
6060 for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
6067 sv_catpv(sv, "..."); /* et cetera */
6071 char *s = savesvpv(lv);
6074 while(*s && *s != '\n') s++;
6095 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
6097 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
6098 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
6099 #endif /* DEBUGGING */
6103 Perl_re_intuit_string(pTHX_ regexp *prog)
6104 { /* Assume that RE_INTUIT is set */
6105 GET_RE_DEBUG_FLAGS_DECL;
6108 const char *s = SvPV(prog->check_substr
6109 ? prog->check_substr : prog->check_utf8, n_a);
6111 if (!PL_colorset) reginitcolors();
6112 PerlIO_printf(Perl_debug_log,
6113 "%sUsing REx %ssubstr:%s `%s%.60s%s%s'\n",
6115 prog->check_substr ? "" : "utf8 ",
6116 PL_colors[5],PL_colors[0],
6119 (strlen(s) > 60 ? "..." : ""));
6122 return prog->check_substr ? prog->check_substr : prog->check_utf8;
6126 Perl_pregfree(pTHX_ struct regexp *r)
6130 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
6131 SV *re_debug_flags=get_sv(RE_DEBUG_FLAGS,0);
6135 if (!r || (--r->refcnt > 0))
6137 DEBUG_r(if (re_debug_flags && (SvIV(re_debug_flags) & RE_DEBUG_COMPILE)) {
6138 const char *s = (r->reganch & ROPT_UTF8)
6139 ? pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60, UNI_DISPLAY_REGEX)
6140 : pv_display(dsv, r->precomp, r->prelen, 0, 60);
6141 const int len = SvCUR(dsv);
6144 PerlIO_printf(Perl_debug_log,
6145 "%sFreeing REx:%s %s%*.*s%s%s\n",
6146 PL_colors[4],PL_colors[5],PL_colors[0],
6149 len > 60 ? "..." : "");
6153 Safefree(r->precomp);
6154 if (r->offsets) /* 20010421 MJD */
6155 Safefree(r->offsets);
6156 RX_MATCH_COPY_FREE(r);
6157 #ifdef PERL_COPY_ON_WRITE
6159 SvREFCNT_dec(r->saved_copy);
6162 if (r->anchored_substr)
6163 SvREFCNT_dec(r->anchored_substr);
6164 if (r->anchored_utf8)
6165 SvREFCNT_dec(r->anchored_utf8);
6166 if (r->float_substr)
6167 SvREFCNT_dec(r->float_substr);
6169 SvREFCNT_dec(r->float_utf8);
6170 Safefree(r->substrs);
6173 int n = r->data->count;
6174 PAD* new_comppad = NULL;
6179 /* If you add a ->what type here, update the comment in regcomp.h */
6180 switch (r->data->what[n]) {
6182 SvREFCNT_dec((SV*)r->data->data[n]);
6185 Safefree(r->data->data[n]);
6188 new_comppad = (AV*)r->data->data[n];
6191 if (new_comppad == NULL)
6192 Perl_croak(aTHX_ "panic: pregfree comppad");
6193 PAD_SAVE_LOCAL(old_comppad,
6194 /* Watch out for global destruction's random ordering. */
6195 (SvTYPE(new_comppad) == SVt_PVAV) ?
6196 new_comppad : Null(PAD *)
6199 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
6202 op_free((OP_4tree*)r->data->data[n]);
6204 PAD_RESTORE_LOCAL(old_comppad);
6205 SvREFCNT_dec((SV*)new_comppad);
6212 reg_trie_data *trie=(reg_trie_data*)r->data->data[n];
6215 refcount = trie->refcount--;
6219 Safefree(trie->charmap);
6220 if (trie->widecharmap)
6221 SvREFCNT_dec((SV*)trie->widecharmap);
6223 Safefree(trie->states);
6225 Safefree(trie->trans);
6228 SvREFCNT_dec((SV*)trie->words);
6229 if (trie->revcharmap)
6230 SvREFCNT_dec((SV*)trie->revcharmap);
6232 Safefree(r->data->data[n]); /* do this last!!!! */
6237 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
6240 Safefree(r->data->what);
6243 Safefree(r->startp);
6249 - regnext - dig the "next" pointer out of a node
6252 Perl_regnext(pTHX_ register regnode *p)
6254 register I32 offset;
6256 if (p == &PL_regdummy)
6259 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
6267 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
6270 STRLEN l1 = strlen(pat1);
6271 STRLEN l2 = strlen(pat2);
6274 const char *message;
6280 Copy(pat1, buf, l1 , char);
6281 Copy(pat2, buf + l1, l2 , char);
6282 buf[l1 + l2] = '\n';
6283 buf[l1 + l2 + 1] = '\0';
6285 /* ANSI variant takes additional second argument */
6286 va_start(args, pat2);
6290 msv = vmess(buf, &args);
6292 message = SvPV(msv,l1);
6295 Copy(message, buf, l1 , char);
6296 buf[l1-1] = '\0'; /* Overwrite \n */
6297 Perl_croak(aTHX_ "%s", buf);
6300 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
6303 Perl_save_re_context(pTHX)
6305 SAVEI32(PL_reg_flags); /* from regexec.c */
6307 SAVEPPTR(PL_reginput); /* String-input pointer. */
6308 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
6309 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
6310 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
6311 SAVEVPTR(PL_regendp); /* Ditto for endp. */
6312 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
6313 SAVEVPTR(PL_reglastcloseparen); /* Similarly for lastcloseparen. */
6314 SAVEPPTR(PL_regtill); /* How far we are required to go. */
6315 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
6316 PL_reg_start_tmp = 0;
6317 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
6318 PL_reg_start_tmpl = 0;
6319 SAVEVPTR(PL_regdata);
6320 SAVEI32(PL_reg_eval_set); /* from regexec.c */
6321 SAVEI32(PL_regnarrate); /* from regexec.c */
6322 SAVEVPTR(PL_regprogram); /* from regexec.c */
6323 SAVEINT(PL_regindent); /* from regexec.c */
6324 SAVEVPTR(PL_regcc); /* from regexec.c */
6325 SAVEVPTR(PL_curcop);
6326 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
6327 SAVEVPTR(PL_reg_re); /* from regexec.c */
6328 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
6329 SAVESPTR(PL_reg_sv); /* from regexec.c */
6330 SAVEBOOL(PL_reg_match_utf8); /* from regexec.c */
6331 SAVEVPTR(PL_reg_magic); /* from regexec.c */
6332 SAVEI32(PL_reg_oldpos); /* from regexec.c */
6333 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
6334 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
6335 SAVEPPTR(PL_reg_oldsaved); /* old saved substr during match */
6336 PL_reg_oldsaved = Nullch;
6337 SAVEI32(PL_reg_oldsavedlen); /* old length of saved substr during match */
6338 PL_reg_oldsavedlen = 0;
6339 #ifdef PERL_COPY_ON_WRITE
6343 SAVEI32(PL_reg_maxiter); /* max wait until caching pos */
6345 SAVEI32(PL_reg_leftiter); /* wait until caching pos */
6346 PL_reg_leftiter = 0;
6347 SAVEGENERICPV(PL_reg_poscache); /* cache of pos of WHILEM */
6348 PL_reg_poscache = Nullch;
6349 SAVEI32(PL_reg_poscache_size); /* size of pos cache of WHILEM */
6350 PL_reg_poscache_size = 0;
6351 SAVEPPTR(PL_regprecomp); /* uncompiled string. */
6352 SAVEI32(PL_regnpar); /* () count. */
6353 SAVEI32(PL_regsize); /* from regexec.c */
6356 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
6360 char digits[TYPE_CHARS(long)];
6362 if (PL_curpm && (rx = PM_GETRE(PL_curpm))) {
6363 for (i = 1; i <= rx->nparens; i++) {
6364 sprintf(digits, "%lu", (long)i);
6365 if ((mgv = gv_fetchpv(digits, FALSE, SVt_PV)))
6372 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
6377 clear_re(pTHX_ void *r)
6379 ReREFCNT_dec((regexp *)r);
6384 * c-indentation-style: bsd
6386 * indent-tabs-mode: t
6389 * ex: set ts=8 sts=4 sw=4 noet: