*
**** Alterations to Henry's code are...
****
- **** Copyright (c) 1991-2001, Larry Wall
+ **** Copyright (c) 1991-2002, Larry Wall
****
**** You may distribute under the terms of either the GNU General Public
**** License or the Artistic License, as specified in the README file.
n = nnext;
}
}
+
+ if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
+/*
+ Two problematic code points in Unicode casefolding of EXACT nodes:
+
+ U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
+ U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
+
+ which casefold to
+
+ Unicode UTF-8
+
+ U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
+ U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
+
+ This means that in case-insensitive matching (or "loose matching",
+ as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
+ length of the above casefolded versions) can match a target string
+ of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
+ This would rather mess up the minimum length computation.
+
+ What we'll do is to look for the tail four bytes, and then peek
+ at the preceding two bytes to see whether we need to decrease
+ the minimum length by four (six minus two).
+
+ Thanks to the design of UTF-8, there cannot be false matches:
+ A sequence of valid UTF-8 bytes cannot be a subsequence of
+ another valid sequence of UTF-8 bytes.
+
+*/
+ char *s0 = STRING(scan), *s, *t;
+ char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
+ char *t0 = "\xcc\x88\xcc\x81";
+ char *t1 = t0 + 3;
+
+ for (s = s0 + 2;
+ s < s2 && (t = ninstr(s, s1, t0, t1));
+ s = t + 4) {
+ if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
+ ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
+ min -= 4;
+ }
+ }
+
#ifdef DEBUGGING
/* Allow dumping */
n = scan + NODE_SZ_STR(scan);
first = NEXTOPER(first);
goto again;
}
- else if ((OP(first) == STAR &&
+ else if (!sawopen && (OP(first) == STAR &&
PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
!(r->reganch & ROPT_ANCH) )
{
/* FALL THROUGH*/
case '?': /* (??...) */
logical = 1;
+ if (*RExC_parse != '{')
+ goto unknown;
paren = *RExC_parse++;
/* FALL THROUGH */
case '{': /* (?{...}) */
}
else if (paren != '?') /* Not Conditional */
ret = br;
- if (flags&HASWIDTH)
- *flagp |= HASWIDTH;
- *flagp |= flags&SPSTART;
+ *flagp |= flags & (SPSTART | HASWIDTH);
lastbr = br;
while (*RExC_parse == '|') {
if (!SIZE_ONLY && RExC_extralen) {
register char *p;
char *oldp, *s;
STRLEN numlen;
- STRLEN ulen;
- U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
+ STRLEN foldlen;
+ U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf;
parse_start = RExC_parse - 1;
if (RExC_flags16 & PMf_EXTENDED)
p = regwhite(p, RExC_end);
if (UTF && FOLD) {
- toFOLD_uni(ender, tmpbuf, &ulen);
- ender = utf8_to_uvchr(tmpbuf, 0);
+ /* Prime the casefolded buffer. */
+ ender = toFOLD_uni(ender, tmpbuf, &foldlen);
}
if (ISMULT2(p)) { /* Back off on ?+*. */
if (len)
p = oldp;
- else if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) {
- reguni(pRExC_state, ender, s, &numlen);
- s += numlen;
- len += numlen;
+ else if (UTF) {
+ STRLEN unilen;
+
+ if (FOLD) {
+ /* Emit all the Unicode characters. */
+ for (foldbuf = tmpbuf;
+ foldlen;
+ foldlen -= numlen) {
+ ender = utf8_to_uvchr(foldbuf, &numlen);
+ if (numlen > 0) {
+ reguni(pRExC_state, ender, s, &unilen);
+ s += unilen;
+ len += unilen;
+ /* In EBCDIC the numlen
+ * and unilen can differ. */
+ foldbuf += numlen;
+ if (numlen >= foldlen)
+ break;
+ }
+ else
+ break; /* "Can't happen." */
+ }
+ }
+ else {
+ reguni(pRExC_state, ender, s, &unilen);
+ if (unilen > 0) {
+ s += unilen;
+ len += unilen;
+ }
+ }
}
else {
len++;
}
break;
}
- if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) {
- reguni(pRExC_state, ender, s, &numlen);
- s += numlen;
- len += numlen - 1;
+ if (UTF) {
+ STRLEN unilen;
+
+ if (FOLD) {
+ /* Emit all the Unicode characters. */
+ for (foldbuf = tmpbuf;
+ foldlen;
+ foldlen -= numlen) {
+ ender = utf8_to_uvchr(foldbuf, &numlen);
+ if (numlen > 0) {
+ reguni(pRExC_state, ender, s, &unilen);
+ len += unilen;
+ s += unilen;
+ /* In EBCDIC the numlen
+ * and unilen can differ. */
+ foldbuf += numlen;
+ if (numlen >= foldlen)
+ break;
+ }
+ else
+ break;
+ }
+ }
+ else {
+ reguni(pRExC_state, ender, s, &unilen);
+ if (unilen > 0) {
+ s += unilen;
+ len += unilen;
+ }
+ }
+ len--;
}
else
REGC(ender, s++);
break;
}
+ /* If the encoding pragma is in effect recode the text of
+ * any EXACT-kind nodes. */
if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
STRLEN oldlen = STR_LEN(ret);
SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
STATIC void
S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
{
- if (!SIZE_ONLY && ckWARN(WARN_REGEXP) &&
- POSIXCC(UCHARAT(RExC_parse))) {
+ if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
char *s = RExC_parse;
char c = *s++;
SV *listsv = Nullsv;
register char *e;
UV n;
- bool optimize_invert = TRUE;
+ bool optimize_invert = TRUE;
+ AV* unicode_alternate = 0;
ret = reganode(pRExC_state, ANYOF, 0);
nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
- if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && POSIXCC(nextvalue))
+ if (!SIZE_ONLY && POSIXCC(nextvalue))
checkposixcc(pRExC_state);
- if (UCHARAT(RExC_parse) == ']' || UCHARAT(RExC_parse) == '-')
- goto charclassloop; /* allow 1st char to be ] or - */
+ /* allow 1st char to be ] (allowing it to be - is dealt with later) */
+ if (UCHARAT(RExC_parse) == ']')
+ goto charclassloop;
while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ANYOF_BITMAP_SET(ret, i);
}
if (value > 255 || UTF) {
+ UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
+ UV natvalue = NATIVE_TO_UNI(value);
+
ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
- if (prevvalue < value)
+ if (prevnatvalue < natvalue) { /* what about > ? */
Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
- (UV)prevvalue, (UV)value);
- else if (prevvalue == value)
- Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
- (UV)value);
+ prevnatvalue, natvalue);
+ }
+ else if (prevnatvalue == natvalue) {
+ Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
+ if (FOLD) {
+ U8 foldbuf[UTF8_MAXLEN_FOLD+1];
+ STRLEN foldlen;
+ UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
+
+ /* If folding and foldable and a single
+ * character, insert also the folded version
+ * to the charclass. */
+ if (f != value) {
+ if (foldlen == UNISKIP(f))
+ Perl_sv_catpvf(aTHX_ listsv,
+ "%04"UVxf"\n", f);
+ else {
+ /* Any multicharacter foldings
+ * require the following transform:
+ * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
+ * where E folds into "pq" and F folds
+ * into "rst", all other characters
+ * fold to single characters. We save
+ * away these multicharacter foldings,
+ * to be later saved as part of the
+ * additional "s" data. */
+ SV *sv;
+
+ if (!unicode_alternate)
+ unicode_alternate = newAV();
+ sv = newSVpvn((char*)foldbuf, foldlen);
+ SvUTF8_on(sv);
+ av_push(unicode_alternate, sv);
+ }
+ }
+
+ /* If folding and the value is one of the Greek
+ * sigmas insert a few more sigmas to make the
+ * folding rules of the sigmas to work right.
+ * Note that not all the possible combinations
+ * are handled here: some of them are handled
+ * by the standard folding rules, and some of
+ * them (literal or EXACTF cases) are handled
+ * during runtime in regexec.c:S_find_byclass(). */
+ if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
+ Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
+ (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
+ Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
+ (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
+ }
+ else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
+ Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
+ (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
+ }
+ }
}
}
AV *av = newAV();
SV *rv;
+ /* The 0th element stores the character class description
+ * in its textual form: used later (regexec.c:Perl_regclass_swatch())
+ * to initialize the appropriate swash (which gets stored in
+ * the 1st element), and also useful for dumping the regnode.
+ * The 2nd element stores the multicharacter foldings,
+ * used later (regexec.c:s_reginclasslen()). */
av_store(av, 0, listsv);
av_store(av, 1, NULL);
+ av_store(av, 2, (SV*)unicode_alternate);
rv = newRV_noinc((SV*)av);
n = add_data(pRExC_state, 1, "s");
RExC_rx->data->data[n] = (void*)rv;
if (k == EXACT) {
SV *dsv = sv_2mortal(newSVpvn("", 0));
- bool do_utf8 = DO_UTF8(sv);
+ /* Using is_utf8_string() is a crude hack but it may
+ * be the best for now since we have no flag "this EXACTish
+ * node was UTF-8" --jhi */
+ bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
char *s = do_utf8 ?
- pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60, 0) :
+ pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
+ UNI_DISPLAY_REGEX) :
STRING(o);
int len = do_utf8 ?
strlen(s) :
{
SV *lv;
- SV *sw = regclass_swash(o, FALSE, &lv);
+ SV *sw = regclass_swash(o, FALSE, &lv, 0);
if (lv) {
if (sw) {
void
Perl_pregfree(pTHX_ struct regexp *r)
{
- DEBUG_r(if (!PL_colorset) reginitcolors());
+#ifdef DEBUGGING
+ SV *dsv = PERL_DEBUG_PAD_ZERO(0);
+#endif
if (!r || (--r->refcnt > 0))
return;
- DEBUG_r(PerlIO_printf(Perl_debug_log,
- "%sFreeing REx:%s `%s%.60s%s%s'\n",
- PL_colors[4],PL_colors[5],PL_colors[0],
- r->precomp,
- PL_colors[1],
- (strlen(r->precomp) > 60 ? "..." : "")));
+ DEBUG_r({
+ char *s = pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60,
+ UNI_DISPLAY_REGEX);
+ int len = SvCUR(dsv);
+ if (!PL_colorset)
+ reginitcolors();
+ PerlIO_printf(Perl_debug_log,
+ "%sFreeing REx:%s `%s%*.*s%s%s'\n",
+ PL_colors[4],PL_colors[5],PL_colors[0],
+ len, len, s,
+ PL_colors[1],
+ len > 60 ? "..." : "");
+ });
if (r->precomp)
Safefree(r->precomp);
new_comppad = NULL;
break;
case 'n':
- break;
+ break;
default:
Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
}