* requiring null fields (pat.t#187 and split.t#{13,14}
* (as of patchlevel 7877) will fail. Then again,
* this code seems to be necessary or otherwise
- * building DynaLoader will fail:
- * "Error: '*' not in typemap in DynaLoader.xs, line 164"
- * --jhi */
+ * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
+ * --jhi updated by dapm */
for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
if (i > PL_regsize)
PL_regoffs[i].start = -1;
- pregexec - match a regexp against a string
*/
I32
-Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
+Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
char *strbeg, I32 minend, SV *screamer, U32 nosave)
/* strend: pointer to null at end of string */
/* strbeg: real beginning of string */
deleted from the finite automaton. */
char *
-Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
- char *strend, U32 flags, re_scream_pos_data *data)
+Perl_re_intuit_start(pTHX_ REGEXP * const prog, SV *sv, char *strpos,
+ char *strend, const U32 flags, re_scream_pos_data *data)
{
dVAR;
register I32 start_shift = 0;
return NULL;
}
-
+#define DECL_TRIE_TYPE(scan) \
+ const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
+ trie_type = (scan->flags != EXACT) \
+ ? (do_utf8 ? trie_utf8_fold : (UTF ? trie_latin_utf8_fold : trie_plain)) \
+ : (do_utf8 ? trie_utf8 : trie_plain)
#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \
uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
uscan = foldbuf + UNISKIP( uvc ); \
} \
break; \
+ case trie_latin_utf8_fold: \
+ if ( foldlen>0 ) { \
+ uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
+ foldlen -= len; \
+ uscan += len; \
+ len=0; \
+ } else { \
+ len = 1; \
+ uvc = to_uni_fold( *(U8*)uc, foldbuf, &foldlen ); \
+ foldlen -= UNISKIP( uvc ); \
+ uscan = foldbuf + UNISKIP( uvc ); \
+ } \
+ break; \
case trie_utf8: \
uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
break; \
} \
} STMT_END
-#define REXEC_FBC_EXACTISH_CHECK(CoNd) \
+#define REXEC_FBC_EXACTISH_CHECK(CoNd) \
+{ \
+ char *my_strend= (char *)strend; \
if ( (CoNd) \
&& (ln == len || \
- ibcmp_utf8(s, NULL, 0, do_utf8, \
+ !ibcmp_utf8(s, &my_strend, 0, do_utf8, \
m, NULL, ln, (bool)UTF)) \
- && (!reginfo || regtry(reginfo, &s)) ) \
+ && (!reginfo || regtry(reginfo, &s)) ) \
goto got_it; \
else { \
U8 foldbuf[UTF8_MAXBYTES_CASE+1]; \
f = to_utf8_fold(tmpbuf, foldbuf, &foldlen); \
if ( f != c \
&& (f == c1 || f == c2) \
- && (ln == foldlen || \
- !ibcmp_utf8((char *) foldbuf, \
- NULL, foldlen, do_utf8, \
- m, \
- NULL, ln, (bool)UTF)) \
- && (!reginfo || regtry(reginfo, &s)) ) \
+ && (ln == len || \
+ !ibcmp_utf8(s, &my_strend, 0, do_utf8,\
+ m, NULL, ln, (bool)UTF)) \
+ && (!reginfo || regtry(reginfo, &s)) ) \
goto got_it; \
} \
- s += len
+} \
+s += len
#define REXEC_FBC_EXACTISH_SCAN(CoNd) \
STMT_START { \
if ((!reginfo || regtry(reginfo, &s))) \
goto got_it
+#define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
+ if (do_utf8) { \
+ REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
+ } \
+ else { \
+ REXEC_FBC_CLASS_SCAN(CoNd); \
+ } \
+ break
+
#define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
if (do_utf8) { \
UtFpReLoAd; \
U8 *sm = (U8 *) m;
U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
- const U32 uniflags = UTF8_ALLOW_DEFAULT;
-
- to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
- to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
-
+ /* used by commented-out code below */
+ /*const U32 uniflags = UTF8_ALLOW_DEFAULT;*/
+
+ /* XXX: Since the node will be case folded at compile
+ time this logic is a little odd, although im not
+ sure that its actually wrong. --dmq */
+
+ c1 = to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
+ c2 = to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
+
+ /* XXX: This is kinda strange. to_utf8_XYZ returns the
+ codepoint of the first character in the converted
+ form, yet originally we did the extra step.
+ No tests fail by commenting this code out however
+ so Ive left it out. -- dmq.
+
c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
0, uniflags);
c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
0, uniflags);
+ */
+
lnc = 0;
while (sm < ((U8 *) m + ln)) {
lnc++;
* matching (called "loose matching" in Unicode).
* ibcmp_utf8() will do just that. */
- if (do_utf8) {
+ if (do_utf8 || UTF) {
UV c, f;
U8 tmpbuf [UTF8_MAXBYTES+1];
- STRLEN len, foldlen;
+ STRLEN len = 1;
+ STRLEN foldlen;
const U32 uniflags = UTF8_ALLOW_DEFAULT;
if (c1 == c2) {
/* Upper and lower of 1st char are equal -
* probably not a "letter". */
while (s <= e) {
- c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
+ if (do_utf8) {
+ c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
uniflags);
+ } else {
+ c = *((U8*)s);
+ }
REXEC_FBC_EXACTISH_CHECK(c == c1);
}
}
else {
while (s <= e) {
- c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
+ if (do_utf8) {
+ c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
uniflags);
+ } else {
+ c = *((U8*)s);
+ }
/* Handle some of the three Greek sigmas cases.
* Note that not all the possible combinations
}
}
else {
+ /* Neither pattern nor string are UTF8 */
if (c1 == c2)
REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
else
!isDIGIT_LC_utf8((U8*)s),
!isDIGIT_LC(*s)
);
+ case LNBREAK:
+ REXEC_FBC_CSCAN(
+ is_LNBREAK_utf8(s),
+ is_LNBREAK_latin1(s)
+ );
+ case VERTWS:
+ REXEC_FBC_CSCAN(
+ is_VERTWS_utf8(s),
+ is_VERTWS_latin1(s)
+ );
+ case NVERTWS:
+ REXEC_FBC_CSCAN(
+ !is_VERTWS_utf8(s),
+ !is_VERTWS_latin1(s)
+ );
+ case HORIZWS:
+ REXEC_FBC_CSCAN(
+ is_HORIZWS_utf8(s),
+ is_HORIZWS_latin1(s)
+ );
+ case NHORIZWS:
+ REXEC_FBC_CSCAN(
+ !is_HORIZWS_utf8(s),
+ !is_HORIZWS_latin1(s)
+ );
case AHOCORASICKC:
case AHOCORASICK:
{
- const enum { trie_plain, trie_utf8, trie_utf8_fold }
- trie_type = do_utf8 ?
- (c->flags == EXACT ? trie_utf8 : trie_utf8_fold)
- : trie_plain;
+ DECL_TRIE_TYPE(c);
/* what trie are we using right now */
reg_ac_data *aho
= (reg_ac_data*)progi->data->data[ ARG( c ) ];
U8 **points; /* map of where we were in the input string
when reading a given char. For ASCII this
is unnecessary overhead as the relationship
- is always 1:1, but for unicode, especially
- case folded unicode this is not true. */
+ is always 1:1, but for Unicode, especially
+ case folded Unicode this is not true. */
U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
U8 *bitmap=NULL;
- regexec_flags - match a regexp against a string
*/
I32
-Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
+Perl_regexec_flags(pTHX_ REGEXP * const prog, char *stringarg, register char *strend,
char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
/* strend: pointer to null at end of string */
/* strbeg: real beginning of string */
if (regtry(®info, &s))
goto got_it;
after_try:
- if (s >= end)
+ if (s > end)
goto phooey;
if (prog->extflags & RXf_USE_INTUIT) {
s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
/* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
* Actually, the code in regcppop() (which Ilya may be meaning by
* PL_reglastparen), is not needed at all by the test suite
- * (op/regexp, op/pat, op/split), but that code is needed, oddly
- * enough, for building DynaLoader, or otherwise this
- * "Error: '*' not in typemap in DynaLoader.xs, line 164"
- * will happen. Meanwhile, this code *is* needed for the
+ * (op/regexp, op/pat, op/split), but that code is needed otherwise
+ * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
+ * Meanwhile, this code *is* needed for the
* above-mentioned test suite tests to succeed. The common theme
* on those tests seems to be returning null fields from matches.
- * --jhi */
+ * --jhi updated by dapm */
#if 1
if (prog->nparens) {
regexp_paren_pair *pp = PL_regoffs;
case TRIE:
{
/* what type of TRIE am I? (utf8 makes this contextual) */
- const enum { trie_plain, trie_utf8, trie_utf8_fold }
- trie_type = do_utf8 ?
- (scan->flags == EXACT ? trie_utf8 : trie_utf8_fold)
- : trie_plain;
+ DECL_TRIE_TYPE(scan);
/* what trie are we using right now */
reg_trie_data * const trie
if ( got_wordnum ) {
if ( ! ST.accepted ) {
ENTER;
- SAVETMPS;
+ /* SAVETMPS; */ /* XXX is this necessary? dmq */
bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
sv_accept_buff=newSV(bufflen *
sizeof(reg_trie_accepted) - 1);
PL_reginput = (char *)ST.accept_buff[ best ].endpos;
if ( !ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) {
scan = ST.B;
- /* NOTREACHED */
} else {
scan = ST.me + ST.jump[ST.accept_buff[best].wordnum];
- /* NOTREACHED */
- }
- if (has_cutgroup) {
- PUSH_YES_STATE_GOTO(TRIE_next, scan);
- /* NOTREACHED */
- } else {
- PUSH_STATE_GOTO(TRIE_next, scan);
- /* NOTREACHED */
}
+ PUSH_YES_STATE_GOTO(TRIE_next, scan);
/* NOTREACHED */
}
/* NOTREACHED */
* pack("U0U*", 0xDF) =~ /ss/i,
* the 0xC3 0x9F are the UTF-8
* byte sequence for the U+00DF. */
+
if (!(do_utf8 &&
- toLOWER(s[0]) == 's' &&
+ toLOWER(s[0]) == 's' &&
ln >= 2 &&
toLOWER(s[1]) == 's' &&
(U8)l[0] == 0xC3 &&
re = reg_temp_copy((regexp *)mg->mg_obj); /*XXX:dmq*/
}
else {
- STRLEN len;
- const char * const t = SvPV_const(ret, len);
- PMOP pm;
+ U32 pm_flags = 0;
const I32 osize = PL_regsize;
- Zero(&pm, 1, PMOP);
- if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
- re = CALLREGCOMP((char*)t, (char*)t + len, &pm);
+ if (DO_UTF8(ret)) pm_flags |= RXf_UTF8;
+ re = CALLREGCOMP(ret, pm_flags);
if (!(SvFLAGS(ret)
& (SVs_TEMP | SVs_PADTMP | SVf_READONLY
| SVs_GMG)))
case BRANCH: /* /(...|A|...)/ */
scan = NEXTOPER(scan); /* scan now points to inner node */
- if ((!next || (OP(next) != BRANCH && OP(next) != BRANCHJ))
- && !has_cutgroup)
- {
- /* last branch; skip state push and jump direct to node */
- continue;
- }
ST.lastparen = *PL_reglastparen;
ST.next_branch = next;
REGCP_SET(ST.cp);
sayNO;
/* NOTREACHED */
#undef ST
+ case FOLDCHAR:
+ n = ARG(scan);
+ if ( n == (U32)what_len_TRICKYFOLD(locinput,do_utf8,ln) ) {
+ locinput += ln;
+ } else if ( 0xDF == n && !do_utf8 && !UTF ) {
+ sayNO;
+ } else {
+ U8 folded[UTF8_MAXBYTES_CASE+1];
+ STRLEN foldlen;
+ const char * const l = locinput;
+ char *e = PL_regeol;
+ to_uni_fold(n, folded, &foldlen);
+
+ if (ibcmp_utf8((const char*) folded, 0, foldlen, 1,
+ l, &e, 0, do_utf8)) {
+ sayNO;
+ }
+ locinput = e;
+ }
+ nextchr = UCHARAT(locinput);
+ break;
+ case LNBREAK:
+ if ((n=is_LNBREAK(locinput,do_utf8))) {
+ locinput += n;
+ nextchr = UCHARAT(locinput);
+ } else
+ sayNO;
+ break;
+
+#define CASE_CLASS(nAmE) \
+ case nAmE: \
+ if ((n=is_##nAmE(locinput,do_utf8))) { \
+ locinput += n; \
+ nextchr = UCHARAT(locinput); \
+ } else \
+ sayNO; \
+ break; \
+ case N##nAmE: \
+ if ((n=is_##nAmE(locinput,do_utf8))) { \
+ sayNO; \
+ } else { \
+ locinput += UTF8SKIP(locinput); \
+ nextchr = UCHARAT(locinput); \
+ } \
+ break
+
+ CASE_CLASS(VERTWS);
+ CASE_CLASS(HORIZWS);
+#undef CASE_CLASS
default:
PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
} else {
while (scan < loceol && !isSPACE(*scan))
scan++;
- break;
}
+ break;
case NSPACEL:
PL_reg_flags |= RF_tainted;
if (do_utf8) {
while (scan < loceol && !isDIGIT(*scan))
scan++;
}
+ case LNBREAK:
+ if (do_utf8) {
+ loceol = PL_regeol;
+ while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
+ scan += c;
+ hardcount++;
+ }
+ } else {
+ /*
+ LNBREAK can match two latin chars, which is ok,
+ because we have a null terminated string, but we
+ have to use hardcount in this situation
+ */
+ while (scan < loceol && (c=is_LNBREAK_latin1(scan))) {
+ scan+=c;
+ hardcount++;
+ }
+ }
+ break;
+ case HORIZWS:
+ if (do_utf8) {
+ loceol = PL_regeol;
+ while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
+ scan += c;
+ hardcount++;
+ }
+ } else {
+ while (scan < loceol && is_HORIZWS_latin1(scan))
+ scan++;
+ }
+ break;
+ case NHORIZWS:
+ if (do_utf8) {
+ loceol = PL_regeol;
+ while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
+ scan += UTF8SKIP(scan);
+ hardcount++;
+ }
+ } else {
+ while (scan < loceol && !is_HORIZWS_latin1(scan))
+ scan++;
+
+ }
break;
+ case VERTWS:
+ if (do_utf8) {
+ loceol = PL_regeol;
+ while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
+ scan += c;
+ hardcount++;
+ }
+ } else {
+ while (scan < loceol && is_VERTWS_latin1(scan))
+ scan++;
+
+ }
+ break;
+ case NVERTWS:
+ if (do_utf8) {
+ loceol = PL_regeol;
+ while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
+ scan += UTF8SKIP(scan);
+ hardcount++;
+ }
+ } else {
+ while (scan < loceol && !is_VERTWS_latin1(scan))
+ scan++;
+
+ }
+ break;
+
default: /* Called on something of 0 width. */
break; /* So match right here or not at all. */
}
* documentation of these array elements. */
si = *ary;
- a = SvROK(ary[1]) ? &ary[1] : 0;
- b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : 0;
+ a = SvROK(ary[1]) ? &ary[1] : NULL;
+ b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : NULL;
if (a)
sw = *a;