#define TRIE_STORE_REVCHAR \
STMT_START { \
- SV *tmp = newSVpvs(""); \
- if (UTF) SvUTF8_on(tmp); \
- Perl_sv_catpvf( aTHX_ tmp, "%c", (int)uvc ); \
- av_push( revcharmap, tmp ); \
- } STMT_END
+ /* Push onto revcharmap a new one-character SV holding uvc: UTF-8 \
+  * encoded and flagged as such when UTF is true, otherwise a single \
+  * raw byte. */ \
+ if (UTF) { \
+ /* uvc is a whole code point here, so it must not be masked down \
+  * to eight bits, and the buffer has to be able to hold the \
+  * longest encoding uvuni_to_utf8() can produce. */ \
+ SV *zlopp = newSV(UTF8_MAXBYTES); \
+ unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
+ unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc); \
+ SvCUR_set(zlopp, kapow - flrbbbbb); \
+ SvPOK_on(zlopp); \
+ SvUTF8_on(zlopp); \
+ av_push(revcharmap, zlopp); \
+ } else { \
+ char ooooff = (char)uvc; \
+ av_push(revcharmap, newSVpvn(&ooooff, 1)); \
+ } \
+ } STMT_END
#define TRIE_READ_CHAR STMT_START { \
wordlen++; \
/* store the codepoint in the bitmap, and if its ascii
also store its folded equivelent. */
TRIE_BITMAP_SET(trie,uvc);
- if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
+
+ /* store the folded codepoint */
+ if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
+
+ if ( !UTF ) {
+ /* store first byte of utf8 representation of
+ codepoints in the 127 < uvc < 256 range */
+ if (127 < uvc && uvc < 192) {
+ TRIE_BITMAP_SET(trie,194);
+ } else if (191 < uvc ) {
+ TRIE_BITMAP_SET(trie,195);
+ /* && uvc < 256 -- we know uvc is < 256 already */
+ }
+ }
set_bit = 0; /* We've done our bit :-) */
}
} else {
trie->startstate = state;
trie->minlen -= (state - 1);
trie->maxlen -= (state - 1);
- DEBUG_r({
- regnode *fix = convert;
- U32 word = trie->wordcount;
- mjd_nodelen++;
- Set_Node_Offset_Length(convert, mjd_offset, state - 1);
- while( ++fix < n ) {
- Set_Node_Offset_Length(fix, 0, 0);
- }
- while (word--) {
- SV ** const tmp = av_fetch( trie_words, word, 0 );
- if (tmp) {
- if ( STR_LEN(convert) <= SvCUR(*tmp) )
- sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
- else
- sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
- }
- }
- });
+#ifdef DEBUGGING
+ /* At least the UNICOS C compiler choked on this
+ * being argument to DEBUG_r(), so let's just have
+ * it right here. */
+ if (
+#ifdef PERL_EXT_RE_BUILD
+ 1
+#else
+ DEBUG_r_TEST
+#endif
+ ) {
+ regnode *fix = convert;
+ U32 word = trie->wordcount;
+ mjd_nodelen++;
+ Set_Node_Offset_Length(convert, mjd_offset, state - 1);
+ while( ++fix < n ) {
+ Set_Node_Offset_Length(fix, 0, 0);
+ }
+ while (word--) {
+ SV ** const tmp = av_fetch( trie_words, word, 0 );
+ if (tmp) {
+ if ( STR_LEN(convert) <= SvCUR(*tmp) )
+ sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
+ else
+ sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
+ }
+ }
+ }
+#endif
if (trie->maxlen) {
convert = n;
} else {
SvUTF8(sv) && SvMAGICAL(sv) ?
mg_find(sv, PERL_MAGIC_utf8) : NULL;
if (mg && mg->mg_len >= 0)
- mg->mg_len += CHR_SVLEN(last_str);
+ mg->mg_len += CHR_SVLEN(last_str) - l;
}
data->last_end += l * (mincount - 1);
}
r->prelen = plen;
r->extflags = pm_flags;
{
- bool has_k = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
+ bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
bool has_minus = ((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD);
bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD) >> 12);
const char *fptr = STD_PAT_MODS; /*"msix"*/
char *p;
- r->wraplen = r->prelen + has_minus + has_k + has_runon
+ r->wraplen = r->prelen + has_minus + has_p + has_runon
+ (sizeof(STD_PAT_MODS) - 1)
+ (sizeof("(?:)") - 1);
Newx(r->wrapped, r->wraplen + 1, char );
p = r->wrapped;
*p++='('; *p++='?';
- if (has_k)
- *p++ = KEEPCOPY_PAT_MOD; /*'k'*/
+ if (has_p)
+ *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
{
char *r = p + (sizeof(STD_PAT_MODS) - 1) + has_minus - 1;
char *colon = r + 1;
#endif
/* Dig out information for optimizations. */
- r->extflags = pm_flags; /* Again? */
+ r->extflags = RExC_flags; /* was pm_op */
/*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
if (UTF)
r->paren_names = NULL;
#ifdef STUPID_PATTERN_CHECKS
+ if (r->prelen == 0)
+ r->extflags |= RXf_NULL;
if (r->extflags & RXf_SPLIT && r->prelen == 1 && r->precomp[0] == ' ')
/* XXX: this should happen BEFORE we compile */
r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
r->extflags |= RXf_WHITE;
else if (r->prelen == 1 && r->precomp[0] == '^')
r->extflags |= RXf_START_ONLY;
-#endif
+#else
if (r->extflags & RXf_SPLIT && r->prelen == 1 && r->precomp[0] == ' ')
/* XXX: this should happen BEFORE we compile */
r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
else {
regnode *first = ri->program + 1;
- char fop = OP(first);
- char nop = OP(NEXTOPER(first));
+ U8 fop = OP(first);
+ U8 nop = OP(NEXTOPER(first));
- if (PL_regkind[fop] == BOL && nop == END)
+ if (PL_regkind[fop] == NOTHING && nop == END)
+ r->extflags |= RXf_NULL;
+ else if (PL_regkind[fop] == BOL && nop == END)
r->extflags |= RXf_START_ONLY;
else if (fop == PLUS && nop ==SPACE && OP(regnext(first))==END)
r->extflags |= RXf_WHITE;
}
-
+#endif
#ifdef DEBUGGING
if (RExC_paren_names) {
ri->name_list_idx = add_data( pRExC_state, 1, "p" );
{
PERL_UNUSED_ARG(value);
- if (flags & RXf_HASH_FETCH) {
+ if (flags & RXapif_FETCH) {
return reg_named_buff_fetch(rx, key, flags);
- } else if (flags & (RXf_HASH_STORE | RXf_HASH_DELETE | RXf_HASH_CLEAR)) {
+ } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
Perl_croak(aTHX_ PL_no_modify);
return NULL;
- } else if (flags & RXf_HASH_EXISTS) {
+ } else if (flags & RXapif_EXISTS) {
return reg_named_buff_exists(rx, key, flags)
? &PL_sv_yes
: &PL_sv_no;
- } else if (flags & RXf_HASH_REGNAMES) {
+ } else if (flags & RXapif_REGNAMES) {
return reg_named_buff_all(rx, flags);
- } else if (flags & (RXf_HASH_SCALAR | RXf_HASH_REGNAMES_COUNT)) {
+ } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
return reg_named_buff_scalar(rx, flags);
} else {
Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
{
PERL_UNUSED_ARG(lastkey);
- if (flags & RXf_HASH_FIRSTKEY)
+ if (flags & RXapif_FIRSTKEY)
return reg_named_buff_firstkey(rx, flags);
- else if (flags & RXf_HASH_NEXTKEY)
+ else if (flags & RXapif_NEXTKEY)
return reg_named_buff_nextkey(rx, flags);
else {
Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
{
AV *retarray = NULL;
SV *ret;
- if (flags & RXf_HASH_ALL)
+ if (flags & RXapif_ALL)
retarray=newAV();
if (rx && rx->paren_names) {
const U32 flags)
{
if (rx && rx->paren_names) {
- if (flags & RXf_HASH_ALL) {
+ if (flags & RXapif_ALL) {
return hv_exists_ent(rx->paren_names, key, 0);
} else {
SV *sv = CALLREG_NAMED_BUFF_FETCH(rx, key, flags);
SV*
Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const rx, const U32 flags)
{
- (void)hv_iterinit(rx->paren_names);
+ /* Reset the iterator of rx->paren_names and hand over to
+  * CALLREG_NAMED_BUFF_NEXTKEY with the FIRSTKEY flag cleared.
+  * A NULL rx, or one without a paren_names hash, yields NULL. */
+ if ( rx && rx->paren_names ) {
+ (void)hv_iterinit(rx->paren_names);
- return CALLREG_NAMED_BUFF_NEXTKEY(rx, NULL, flags & ~RXf_HASH_FIRSTKEY);
+ return CALLREG_NAMED_BUFF_NEXTKEY(rx, NULL, flags & ~RXapif_FIRSTKEY);
+ } else {
+ /* This function returns a pointer, so say NULL rather than FALSE. */
+ return NULL;
+ }
}
SV*
break;
}
}
- if (parno || flags & RXf_HASH_ALL) {
+ if (parno || flags & RXapif_ALL) {
STRLEN len;
char *pv = HePV(temphe, len);
return newSVpvn(pv,len);
I32 length;
if (rx && rx->paren_names) {
- if (flags & (RXf_HASH_ALL | RXf_HASH_REGNAMES_COUNT)) {
+ if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
return newSViv(HvTOTALKEYS(rx->paren_names));
- } else if (flags & RXf_HASH_ONE) {
- ret = CALLREG_NAMED_BUFF_ALL(rx, (flags | RXf_HASH_REGNAMES));
+ } else if (flags & RXapif_ONE) {
+ ret = CALLREG_NAMED_BUFF_ALL(rx, (flags | RXapif_REGNAMES));
av = (AV*)SvRV(ret);
length = av_len(av);
return newSViv(length + 1);
break;
}
}
- if (parno || flags & RXf_HASH_ALL) {
+ if (parno || flags & RXapif_ALL) {
STRLEN len;
char *pv = HePV(temphe, len);
av_push(av, newSVpvn(pv,len));
return;
}
else
- if (paren == RXf_PREMATCH && rx->offs[0].start != -1) {
+ if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
/* $` */
i = rx->offs[0].start;
s = rx->subbeg;
}
else
- if (paren == RXf_POSTMATCH && rx->offs[0].end != -1) {
+ if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
/* $' */
s = rx->subbeg + rx->offs[0].end;
i = rx->sublen - rx->offs[0].end;
/* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
switch (paren) {
/* $` / ${^PREMATCH} */
- case RXf_PREMATCH:
+ case RX_BUFF_IDX_PREMATCH:
if (rx->offs[0].start != -1) {
i = rx->offs[0].start;
if (i > 0) {
}
return 0;
/* $' / ${^POSTMATCH} */
- case RXf_POSTMATCH:
+ case RX_BUFF_IDX_POSTMATCH:
if (rx->offs[0].end != -1) {
i = rx->sublen - rx->offs[0].end;
if (i > 0) {
register regnode *ender = NULL;
register I32 parno = 0;
I32 flags;
- const I32 oregflags = RExC_flags;
+ U32 oregflags = RExC_flags;
bool have_branch = 0;
bool is_open = 0;
I32 freeze_paren = 0;
and must be globally applied -- japhy */
switch (*RExC_parse) {
CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
- case 'o':
- case 'g':
+ case ONCE_PAT_MOD: /* 'o' */
+ case GLOBAL_PAT_MOD: /* 'g' */
if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
if (! (wastedflags & wflagbit) ) {
}
break;
- case 'c':
+ case CONTINUE_PAT_MOD: /* 'c' */
if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
if (! (wastedflags & WASTED_C) ) {
wastedflags |= WASTED_GC;
}
}
break;
- case 'k':
+ case KEEPCOPY_PAT_MOD: /* 'p' */
if (flagsp == &negflags) {
if (SIZE_ONLY && ckWARN(WARN_REGEXP))
- vWARN(RExC_parse + 1,"Useless use of (?-k)");
+ vWARN(RExC_parse + 1,"Useless use of (?-p)");
} else {
*flagsp |= RXf_PMf_KEEPCOPY;
}
case ')':
RExC_flags |= posflags;
RExC_flags &= ~negflags;
+ if (paren != ':') {
+ oregflags |= posflags;
+ oregflags &= ~negflags;
+ }
nextchar(pRExC_state);
if (paren != ':') {
*flagp = TRYAGAIN;
| PERL_SCAN_DISALLOW_PREFIX
| (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
UV cp;
+ char string;
len = (STRLEN)(endbrace - name - 2);
cp = grok_hex(name + 2, &len, &fl, NULL);
if ( len != (STRLEN)(endbrace - name - 2) ) {
*valuep = cp;
return NULL;
}
- sv_str= Perl_newSVpvf_nocontext("%c",(int)cp);
+ string = (char)cp;
+ sv_str= newSVpvn(&string, 1);
} else {
/* fetch the charnames handler for this scope */
HV * const table = GvHV(PL_hintgv);
case 0xCE:
if (!LOC && FOLD) {
U32 len,cp;
+ len=0; /* silence a spurious compiler warning */
if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) {
*flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */
RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */
RExC_seen_zerolen++;
ret = reg_node(pRExC_state, KEEPS);
*flagp |= SIMPLE;
+ /* XXX:dmq : disabling in-place substitution seems to
+ * be necessary here to avoid cases of memory corruption, as
+ * with: C<$_="x" x 80; s/x\K/y/> -- rgs
+ */
+ RExC_seen |= REG_SEEN_LOOKBEHIND;
goto finish_meta_pat;
case 'Z':
ret = reg_node(pRExC_state, SEOL);
goto parse_named_seq;
} }
num = atoi(RExC_parse);
+ if (isg && num == 0)
+ vFAIL("Reference to invalid group 0");
if (isrel) {
num = RExC_npar - num;
if (num < 1)
{
if (isLOWER(prevvalue)) {
for (i = prevvalue; i <= ceilvalue; i++)
- if (isLOWER(i))
+ if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
+ stored++;
ANYOF_BITMAP_SET(ret, i);
+ }
} else {
for (i = prevvalue; i <= ceilvalue; i++)
- if (isUPPER(i))
+ if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
+ stored++;
ANYOF_BITMAP_SET(ret, i);
+ }
}
}
else
/*
- regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
*/
+#ifdef DEBUGGING
+/* Print to Perl_debug_log the PL_reg_extflags_name[] entry for every bit
+ * that is set in flags, each followed by a space.
+ *
+ * lead:  optional prefix.  When non-NULL it is printed once before the
+ *        first name and the output is finished with a newline; if no bit
+ *        is set, lead followed by "[none-set]" is printed instead.
+ *        When NULL only the names are printed, with no newline.
+ * flags: the 32-bit flag word to decode.
+ */
+void
+S_regdump_extflags(pTHX_ const char *lead, const U32 flags) {
+ int bit;
+ int set=0;
+ for (bit=0; bit<32; bit++) {
+ /* Shift an unsigned one: 1<<31 would overflow a signed 32-bit int. */
+ if (flags & ((U32)1 << bit)) {
+ if (!set++ && lead)
+ PerlIO_printf(Perl_debug_log, "%s",lead);
+ PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
+ }
+ }
+ if (lead) {
+ if (set)
+ PerlIO_printf(Perl_debug_log, "\n");
+ else
+ PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
+ }
+}
+#endif
+
void
Perl_regdump(pTHX_ const regexp *r)
{
SV * const sv = sv_newmortal();
SV *dsv= sv_newmortal();
RXi_GET_DECL(r,ri);
+ GET_RE_DEBUG_FLAGS_DECL;
(void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
if (r->extflags & RXf_EVAL_SEEN)
PerlIO_printf(Perl_debug_log, "with eval ");
PerlIO_printf(Perl_debug_log, "\n");
+ DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
#else
PERL_UNUSED_CONTEXT;
PERL_UNUSED_ARG(r);
k = PL_regkind[OP(o)];
if (k == EXACT) {
- SV * const dsv = sv_2mortal(newSVpvs(""));
+ sv_catpvs(sv, " ");
/* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
* is a crude hack but it may be the best for now since
* we have no flag "this EXACTish node was UTF-8"
* --jhi */
- const char * const s =
- pv_pretty(dsv, STRING(o), STR_LEN(o), 60,
- PL_colors[0], PL_colors[1],
- PERL_PV_ESCAPE_UNI_DETECT |
- PERL_PV_PRETTY_ELIPSES |
- PERL_PV_PRETTY_LTGT
- );
- Perl_sv_catpvf(aTHX_ sv, " %s", s );
+ pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
+ PERL_PV_ESCAPE_UNI_DETECT |
+ PERL_PV_PRETTY_ELLIPSES |
+ PERL_PV_PRETTY_LTGT |
+ PERL_PV_PRETTY_NOCLEAR
+ );
} else if (k == TRIE) {
/* print the details of the trie in dumpuntil instead, as
* progi->data isn't available here */
int i;
int rangestart = -1;
U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
- Perl_sv_catpvf(aTHX_ sv, "[");
+ sv_catpvs(sv, "[");
for (i = 0; i <= 256; i++) {
if (i < 256 && BITMAP_TEST(bitmap,i)) {
if (rangestart == -1)
rangestart = -1;
}
}
- Perl_sv_catpvf(aTHX_ sv, "]");
+ sv_catpvs(sv, "]");
}
} else if (k == CURLY) {
} else if (k == LOGICAL)
Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
else if (k == FOLDCHAR)
- Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]",ARG(o) );
+ Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
else if (k == ANYOF) {
int i, rangestart = -1;
const U8 flags = ANYOF_FLAGS(o);
STATIC void
S_put_byte(pTHX_ SV *sv, int c)
{
- if (isCNTRL(c) || c == 255 || !isPRINT(c))
+ /* Append a printable rendering of the byte c to sv: a byte that is not
+ isPRINT() is written as a backslash followed by its octal value; the
+ characters '-', ']', backslash and '^' are written with a backslash in
+ front of them; any other byte is appended unchanged.
+
+ Our definition of isPRINT() ignores locales, so only bytes that are
+ not part of UTF-8 are considered printable. I assume that the same
+ holds for UTF-EBCDIC.
+ Also, code point 255 is not printable in either (it's EO in EBCDIC,
+ about which Wikipedia says:
+
+ EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
+ ones (binary 1111 1111, hexadecimal FF). It is similar, but not
+ identical, to the ASCII delete (DEL) or rubout control character.
+ ) So the old condition can be simplified to !isPRINT(c) */
+ if (!isPRINT(c))
Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
- else if (c == '-' || c == ']' || c == '\\' || c == '^')
- Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
- else
- Perl_sv_catpvf(aTHX_ sv, "%c", c);
+ else {
+ const char string = c;
+ if (c == '-' || c == ']' || c == '\\' || c == '^')
+ sv_catpvs(sv, "\\");
+ sv_catpvn(sv, &string, 1);
+ }
}
elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
PL_colors[0], PL_colors[1],
(SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
- PERL_PV_PRETTY_ELIPSES |
+ PERL_PV_PRETTY_ELLIPSES |
PERL_PV_PRETTY_LTGT
)
: "???"