- pregexec - match a regexp against a string
*/
I32
-Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
+Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
char *strbeg, I32 minend, SV *screamer, U32 nosave)
/* strend: pointer to null at end of string */
/* strbeg: real beginning of string */
deleted from the finite automaton. */
char *
-Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
- char *strend, U32 flags, re_scream_pos_data *data)
+Perl_re_intuit_start(pTHX_ REGEXP * const prog, SV *sv, char *strpos,
+ char *strend, const U32 flags, re_scream_pos_data *data)
{
dVAR;
register I32 start_shift = 0;
if ((!reginfo || regtry(reginfo, &s))) \
goto got_it
+#define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
+ if (do_utf8) { \
+ REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
+ } \
+ else { \
+ REXEC_FBC_CLASS_SCAN(CoNd); \
+ } \
+ break
+
#define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
if (do_utf8) { \
UtFpReLoAd; \
!isDIGIT_LC_utf8((U8*)s),
!isDIGIT_LC(*s)
);
+ case LNBREAK:
+ REXEC_FBC_CSCAN(
+ is_LNBREAK_utf8(s),
+ is_LNBREAK_latin1(s)
+ );
+ case VERTWS:
+ REXEC_FBC_CSCAN(
+ is_VERTWS_utf8(s),
+ is_VERTWS_latin1(s)
+ );
+ case NVERTWS:
+ REXEC_FBC_CSCAN(
+ !is_VERTWS_utf8(s),
+ !is_VERTWS_latin1(s)
+ );
+ case HORIZWS:
+ REXEC_FBC_CSCAN(
+ is_HORIZWS_utf8(s),
+ is_HORIZWS_latin1(s)
+ );
+ case NHORIZWS:
+ REXEC_FBC_CSCAN(
+ !is_HORIZWS_utf8(s),
+ !is_HORIZWS_latin1(s)
+ );
case AHOCORASICKC:
case AHOCORASICK:
{
- regexec_flags - match a regexp against a string
*/
I32
-Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
+Perl_regexec_flags(pTHX_ REGEXP * const prog, char *stringarg, register char *strend,
char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
/* strend: pointer to null at end of string */
/* strbeg: real beginning of string */
if (regtry(®info, &s))
goto got_it;
after_try:
- if (s >= end)
+ if (s > end)
goto phooey;
if (prog->extflags & RXf_USE_INTUIT) {
s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
* pack("U0U*", 0xDF) =~ /ss/i,
* the 0xC3 0x9F are the UTF-8
* byte sequence for the U+00DF. */
+
if (!(do_utf8 &&
- toLOWER(s[0]) == 's' &&
+ toLOWER(s[0]) == 's' &&
ln >= 2 &&
toLOWER(s[1]) == 's' &&
(U8)l[0] == 0xC3 &&
re = reg_temp_copy((regexp *)mg->mg_obj); /*XXX:dmq*/
}
else {
- STRLEN len;
- const char * const t = SvPV_const(ret, len);
U32 pm_flags = 0;
const I32 osize = PL_regsize;
if (DO_UTF8(ret)) pm_flags |= RXf_UTF8;
- re = CALLREGCOMP((char*)t, (char*)t + len, pm_flags);
+ re = CALLREGCOMP(ret, pm_flags);
if (!(SvFLAGS(ret)
& (SVs_TEMP | SVs_PADTMP | SVf_READONLY
| SVs_GMG)))
sayNO;
/* NOTREACHED */
#undef ST
+ case FOLDCHAR:
+ n = ARG(scan);
+ if ( n == (U32)what_len_TRICKYFOLD(locinput,do_utf8,ln) ) {
+ locinput += ln;
+ } else if ( 0xDF == n && !do_utf8 && !UTF ) {
+ sayNO;
+ } else {
+ U8 folded[UTF8_MAXBYTES_CASE+1];
+ STRLEN foldlen;
+ const char * const l = locinput;
+ char *e = PL_regeol;
+ to_uni_fold(n, folded, &foldlen);
+
+ if (ibcmp_utf8((const char*) folded, 0, foldlen, 1,
+ l, &e, 0, do_utf8)) {
+ sayNO;
+ }
+ locinput = e;
+ }
+ nextchr = UCHARAT(locinput);
+ break;
+ case LNBREAK:
+ if ((n=is_LNBREAK(locinput,do_utf8))) {
+ locinput += n;
+ nextchr = UCHARAT(locinput);
+ } else
+ sayNO;
+ break;
+
+#define CASE_CLASS(nAmE) \
+ case nAmE: \
+ if ((n=is_##nAmE(locinput,do_utf8))) { \
+ locinput += n; \
+ nextchr = UCHARAT(locinput); \
+ } else \
+ sayNO; \
+ break; \
+ case N##nAmE: \
+ if ((n=is_##nAmE(locinput,do_utf8))) { \
+ sayNO; \
+ } else { \
+ locinput += UTF8SKIP(locinput); \
+ nextchr = UCHARAT(locinput); \
+ } \
+ break
+
+ CASE_CLASS(VERTWS);
+ CASE_CLASS(HORIZWS);
+#undef CASE_CLASS
default:
PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
while (scan < loceol && !isDIGIT(*scan))
scan++;
}
+ case LNBREAK:
+ if (do_utf8) {
+ loceol = PL_regeol;
+ while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
+ scan += c;
+ hardcount++;
+ }
+ } else {
+ /*
+ LNBREAK can match two latin chars, which is ok,
+ because we have a null terminated string, but we
+ have to use hardcount in this situation
+ */
+ while (scan < loceol && (c=is_LNBREAK_latin1(scan))) {
+ scan+=c;
+ hardcount++;
+ }
+ }
break;
+ case HORIZWS:
+ if (do_utf8) {
+ loceol = PL_regeol;
+ while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
+ scan += c;
+ hardcount++;
+ }
+ } else {
+ while (scan < loceol && is_HORIZWS_latin1(scan))
+ scan++;
+ }
+ break;
+ case NHORIZWS:
+ if (do_utf8) {
+ loceol = PL_regeol;
+ while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
+ scan += UTF8SKIP(scan);
+ hardcount++;
+ }
+ } else {
+ while (scan < loceol && !is_HORIZWS_latin1(scan))
+ scan++;
+
+ }
+ break;
+ case VERTWS:
+ if (do_utf8) {
+ loceol = PL_regeol;
+ while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
+ scan += c;
+ hardcount++;
+ }
+ } else {
+ while (scan < loceol && is_VERTWS_latin1(scan))
+ scan++;
+
+ }
+ break;
+ case NVERTWS:
+ if (do_utf8) {
+ loceol = PL_regeol;
+ while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
+ scan += UTF8SKIP(scan);
+ hardcount++;
+ }
+ } else {
+ while (scan < loceol && !is_VERTWS_latin1(scan))
+ scan++;
+
+ }
+ break;
+
default: /* Called on something of 0 width. */
break; /* So match right here or not at all. */
}