*
**** Alterations to Henry's code are...
****
- **** Copyright (c) 1991-2001, Larry Wall
+ **** Copyright (c) 1991-2002, Larry Wall
****
**** You may distribute under the terms of either the GNU General Public
**** License or the Artistic License, as specified in the README file.
#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
#define HOPMAYBE3c(pos,off,lim) ((char*)HOPMAYBE3(pos,off,lim))
-#define LOAD_UTF8_CHARCLASS(a,b) STMT_START { if (!CAT2(PL_utf8_,a)) (void)CAT2(is_utf8_, a)((U8*)b); } STMT_END
+#define LOAD_UTF8_CHARCLASS(a,b) STMT_START { if (!CAT2(PL_utf8_,a)) { ENTER; save_re_context(); (void)CAT2(is_utf8_, a)((U8*)b); LEAVE; } } STMT_END
/* for use after a quantifier and before an EXACT-like node -- japhy */
#define JUMPABLE(rn) ( \
PL_regkind[(U8)OP(rn)] == EXACT || PL_regkind[(U8)OP(rn)] == REF \
)
+/*
+ Search for the mandatory following text node.  For a lookahead IFMATCH
+ (rn->flags == 0) the text must follow, so we step into its body; for a
+ lookbehind (rn->flags != 0) we jump ahead by ARG(rn) to the next step.
+*/
#define FIND_NEXT_IMPT(rn) STMT_START { \
while (JUMPABLE(rn)) \
- if (OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
- PL_regkind[(U8)OP(rn)] == CURLY) \
+ if (OP(rn) == SUSPEND || PL_regkind[(U8)OP(rn)] == CURLY) \
rn = NEXTOPER(NEXTOPER(rn)); \
else if (OP(rn) == PLUS) \
rn = NEXTOPER(rn); \
+ else if (OP(rn) == IFMATCH) \
+ rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
else rn += NEXT_OFF(rn); \
} STMT_END
switch (OP(c)) {
case ANYOF:
while (s < strend) {
- if (reginclass(c, (U8*)s, do_utf8)) {
+ STRLEN skip = do_utf8 ? UTF8SKIP(s) : 1;
+
+ if (reginclass(c, (U8*)s, do_utf8) ||
+ (ANYOF_FOLD_SHARP_S(c, s, strend) &&
+ /* The assignment (not a comparison) is intentional:
+ * for the sharp s, the skip is SHARP_S_SKIP (2). */
+ (skip = SHARP_S_SKIP)
+ )) {
if (tmp && (norun || regtry(prog, s)))
goto got_it;
else
tmp = doevery;
}
- else
- tmp = 1;
- s += do_utf8 ? UTF8SKIP(s) : 1;
+ else
+ tmp = 1;
+ s += skip;
}
break;
case CANY:
to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
- c1 = utf8_to_uvuni(tmpbuf1, 0);
- c2 = utf8_to_uvuni(tmpbuf2, 0);
+ c1 = utf8_to_uvchr(tmpbuf1, 0);
+ c2 = utf8_to_uvchr(tmpbuf2, 0);
}
else {
c1 = *(U8*)m;
* text of the node. The c1 and c2 are the first
* characters (though in Unicode it gets a bit
* more complicated because there are more cases
- * than just upper and lower: one is really supposed
- * to use the so-called folding case for case-insensitive
- * matching (called "loose matching" in Unicode). */
+ * than just upper and lower: one needs to use
+ * the so-called folding case for case-insensitive
+ * matching (called "loose matching" in Unicode).
+ * ibcmp_utf8() will do just that. */
if (do_utf8) {
UV c, f;
c = utf8_to_uvchr((U8*)s, &len);
/* Handle some of the three Greek sigmas cases.
- * Note that not all the possible combinations
- * are handled here: some of them are handled
- * handled by the standard folding rules, and
- * some of them (the character class or ANYOF
- * cases) are handled during compiletime in
- * regexec.c:S_regclass(). */
+ * Note that not all the possible combinations
+ * are handled here: some of them are handled
+ * by the standard folding rules, and some of
+ * them (the character class or ANYOF cases)
+ * are handled at compile time in
+ * regcomp.c:S_regclass(). */
if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
if ( f != c
&& (f == c1 || f == c2)
&& (ln == foldlen ||
- !ibcmp_utf8((char *)foldbuf,
+ !ibcmp_utf8((char *) foldbuf,
(char **)0, foldlen, do_utf8,
m,
(char **)0, ln, UTF))
}
minlen = prog->minlen;
- if (strend - startpos < minlen &&
- !PL_reg_match_utf8 /* ANYOFs can balloon to EXACTFs */
- ) {
+ if (strend - startpos < minlen) {
DEBUG_r(PerlIO_printf(Perl_debug_log,
"String too short [regexec_flags]...\n"));
goto phooey;
#define sayYES goto yes
#define sayNO goto no
+#define sayNO_ANYOF goto no_anyof
#define sayYES_FINAL goto yes_final
#define sayYES_LOUD goto yes_loud
#define sayNO_FINAL goto no_final
if (l >= PL_regeol)
sayNO;
if (NATIVE_TO_UNI(*(U8*)s) !=
- utf8_to_uvchr((U8*)l, &ulen))
+ utf8_to_uvuni((U8*)l, &ulen))
sayNO;
l += ulen;
s ++;
if (l >= PL_regeol)
sayNO;
if (NATIVE_TO_UNI(*((U8*)l)) !=
- utf8_to_uvchr((U8*)s, &ulen))
+ utf8_to_uvuni((U8*)s, &ulen))
sayNO;
s += ulen;
l ++;
char *l = locinput;
char *e = PL_regeol;
- if (ibcmp_utf8(s, 0, ln, do_utf8,
- l, &e, 0, UTF))
- sayNO;
+ if (ibcmp_utf8(s, 0, ln, UTF,
+ l, &e, 0, do_utf8)) {
+ /* One more case for the sharp s:
+ * pack("U0U*", 0xDF) =~ /ss/i,
+ * where 0xC3 0x9F is the UTF-8
+ * byte sequence for U+00DF. */
+ if (!(do_utf8 &&
+ toLOWER(s[0]) == 's' &&
+ ln >= 2 &&
+ toLOWER(s[1]) == 's' &&
+ (U8)l[0] == 0xC3 &&
+ e - l >= 2 &&
+ (U8)l[1] == 0x9F))
+ sayNO;
+ }
locinput = e;
nextchr = UCHARAT(locinput);
break;
STRLEN inclasslen = PL_regeol - locinput;
if (!reginclasslen(scan, (U8*)locinput, &inclasslen, do_utf8))
- sayNO;
+ sayNO_ANYOF;
if (locinput >= PL_regeol)
sayNO;
locinput += inclasslen;
nextchr = UCHARAT(locinput);
+ break;
}
else {
if (nextchr < 0)
nextchr = UCHARAT(locinput);
if (!reginclass(scan, (U8*)locinput, do_utf8))
- sayNO;
+ sayNO_ANYOF;
if (!nextchr && locinput >= PL_regeol)
sayNO;
nextchr = UCHARAT(++locinput);
+ break;
}
+ no_anyof:
+ /* Handle the possible case of the German sharp s
+ * in a case-folding Unicode character class. */
+
+ if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
+ locinput += SHARP_S_SKIP;
+ nextchr = UCHARAT(locinput);
+ }
+ else
+ sayNO;
break;
case ALNUML:
PL_reg_flags |= RF_tainted;
if (swash_fetch(sw, p, do_utf8))
match = TRUE;
else if (flags & ANYOF_FOLD) {
- U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
- STRLEN tmplen;
-
if (!match && lenp && av) {
I32 i;
STRLEN len;
char *s = SvPV(sv, len);
- if (len <= plen && memEQ(s, p, len)) {
+ if (len <= plen && memEQ(s, (char*)p, len)) {
*lenp = len;
match = TRUE;
break;
}
}
if (!match) {
+ U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
+ STRLEN tmplen;
+
to_utf8_fold(p, tmpbuf, &tmplen);
if (swash_fetch(sw, tmpbuf, do_utf8))
match = TRUE;
}
- if (!match) {
- to_utf8_upper(p, tmpbuf, &tmplen);
- if (swash_fetch(sw, tmpbuf, do_utf8))
- match = TRUE;
- }
}
}
}