From: Jarkko Hietaniemi Date: Fri, 9 Jul 2004 00:35:22 +0000 (+0300) Subject: Re: Segfault using HTML::Entities X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=078c425b721ea35d568a7ec980e3b95d036a25c3;p=p5sagit%2Fp5-mst-13.2.git Re: Segfault using HTML::Entities Message-ID: <40EDBE1A.6080205@iki.fi> Date: Fri, 09 Jul 2004 00:35:22 +0300 p4raw-id: //depot/perl@23074 --- diff --git a/pp_ctl.c b/pp_ctl.c index a4c93f1..a9bc3e5 100644 --- a/pp_ctl.c +++ b/pp_ctl.c @@ -187,10 +187,13 @@ PP(pp_substcont) { SV *targ = cx->sb_targ; - if (DO_UTF8(dstr) && !SvUTF8(targ)) - sv_catpvn_utf8_upgrade(dstr, s, cx->sb_strend - s, nsv); - else - sv_catpvn(dstr, s, cx->sb_strend - s); + assert(cx->sb_strend >= s); + if(cx->sb_strend > s) { + if (DO_UTF8(dstr) && !SvUTF8(targ)) + sv_catpvn_utf8_upgrade(dstr, s, cx->sb_strend - s, nsv); + else + sv_catpvn(dstr, s, cx->sb_strend - s); + } cx->sb_rxtainted |= RX_MATCH_TAINTED(rx); #ifdef PERL_COPY_ON_WRITE diff --git a/regexec.c b/regexec.c index 728b1ae..60276cb 100644 --- a/regexec.c +++ b/regexec.c @@ -954,6 +954,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta char *m; STRLEN ln; STRLEN lnc; + register STRLEN uskip; unsigned int c1; unsigned int c2; char *e; @@ -964,7 +965,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta switch (OP(c)) { case ANYOF: if (do_utf8) { - while (s < strend) { + while (s + (uskip = UTF8SKIP(s)) <= strend) { if ((ANYOF_FLAGS(c) & ANYOF_UNICODE) || !UTF8_IS_INVARIANT((U8)s[0]) ? reginclass(c, (U8*)s, 0, do_utf8) : @@ -976,7 +977,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta } else tmp = 1; - s += UTF8SKIP(s); + s += uskip; } } else { @@ -1172,7 +1173,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta tmp = ((OP(c) == BOUND ? isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0); LOAD_UTF8_CHARCLASS(alnum,"a"); - while (s < strend) { + while (s + (uskip = UTF8SKIP(s)) <= strend) { if (tmp == !(OP(c) == BOUND ? swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) : isALNUM_LC_utf8((U8*)s))) @@ -1181,7 +1182,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta if ((norun || regtry(prog, s))) goto got_it; } - s += UTF8SKIP(s); + s += uskip; } } else { @@ -1215,14 +1216,14 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta tmp = ((OP(c) == NBOUND ? isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0); LOAD_UTF8_CHARCLASS(alnum,"a"); - while (s < strend) { + while (s + (uskip = UTF8SKIP(s)) <= strend) { if (tmp == !(OP(c) == NBOUND ? swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) : isALNUM_LC_utf8((U8*)s))) tmp = !tmp; else if ((norun || regtry(prog, s))) goto got_it; - s += UTF8SKIP(s); + s += uskip; } } else { @@ -1244,7 +1245,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta case ALNUM: if (do_utf8) { LOAD_UTF8_CHARCLASS(alnum,"a"); - while (s < strend) { + while (s + (uskip = UTF8SKIP(s)) <= strend) { if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) { if (tmp && (norun || regtry(prog, s))) goto got_it; @@ -1253,7 +1254,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta } else tmp = 1; - s += UTF8SKIP(s); + s += uskip; } } else { @@ -1273,7 +1274,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta case ALNUML: PL_reg_flags |= RF_tainted; if (do_utf8) { - while (s < strend) { + while (s + (uskip = UTF8SKIP(s)) <= strend) { if (isALNUM_LC_utf8((U8*)s)) { if (tmp && (norun || regtry(prog, s))) goto got_it; @@ -1282,7 +1283,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta } else tmp = 1; - s += UTF8SKIP(s); + s += uskip; } } else { @@ -1302,7 +1303,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta case NALNUM: if (do_utf8) { LOAD_UTF8_CHARCLASS(alnum,"a"); - while (s < strend) { + while (s + (uskip = UTF8SKIP(s)) <= strend) { if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) { if (tmp && (norun || regtry(prog, s))) goto got_it; @@ -1311,7 +1312,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta } else tmp = 1; - s += UTF8SKIP(s); + s += uskip; } } else { @@ -1331,7 +1332,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta case NALNUML: PL_reg_flags |= RF_tainted; if (do_utf8) { - while (s < strend) { + while (s + (uskip = UTF8SKIP(s)) <= strend) { if (!isALNUM_LC_utf8((U8*)s)) { if (tmp && (norun || regtry(prog, s))) goto got_it; @@ -1340,7 +1341,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta } else tmp = 1; - s += UTF8SKIP(s); + s += uskip; } } else { @@ -1360,7 +1361,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta case SPACE: if (do_utf8) { LOAD_UTF8_CHARCLASS(space," "); - while (s < strend) { + while (s + (uskip = UTF8SKIP(s)) <= strend) { if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) { if (tmp && (norun || regtry(prog, s))) goto got_it; @@ -1369,7 +1370,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta } else tmp = 1; - s += UTF8SKIP(s); + s += uskip; } } else { @@ -1389,7 +1390,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta case SPACEL: PL_reg_flags |= RF_tainted; if (do_utf8) { - while (s < strend) { + while (s + (uskip = UTF8SKIP(s)) <= strend) { if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) { if (tmp && (norun || regtry(prog, s))) goto got_it; @@ -1398,7 +1399,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta } else tmp = 1; - s += UTF8SKIP(s); + s += uskip; } } else { @@ -1418,7 +1419,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta case NSPACE: if (do_utf8) { LOAD_UTF8_CHARCLASS(space," "); - while (s < strend) { + while (s + (uskip = UTF8SKIP(s)) <= strend) { if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) { if (tmp && (norun || regtry(prog, s))) goto got_it; @@ -1427,7 +1428,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta } else tmp = 1; - s += UTF8SKIP(s); + s += uskip; } } else { @@ -1447,7 +1448,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta case NSPACEL: PL_reg_flags |= RF_tainted; if (do_utf8) { - while (s < strend) { + while (s + (uskip = UTF8SKIP(s)) <= strend) { if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) { if (tmp && (norun || regtry(prog, s))) goto got_it; @@ -1456,7 +1457,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta } else tmp = 1; - s += UTF8SKIP(s); + s += uskip; } } else { @@ -1476,7 +1477,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta case DIGIT: if (do_utf8) { LOAD_UTF8_CHARCLASS(digit,"0"); - while (s < strend) { + while (s + (uskip = UTF8SKIP(s)) <= strend) { if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) { if (tmp && (norun || regtry(prog, s))) goto got_it; @@ -1485,7 +1486,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta } else tmp = 1; - s += UTF8SKIP(s); + s += uskip; } } else { @@ -1505,7 +1506,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta case DIGITL: PL_reg_flags |= RF_tainted; if (do_utf8) { - while (s < strend) { + while (s + (uskip = UTF8SKIP(s)) <= strend) { if (isDIGIT_LC_utf8((U8*)s)) { if (tmp && (norun || regtry(prog, s))) goto got_it; @@ -1514,7 +1515,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta } else tmp = 1; - s += UTF8SKIP(s); + s += uskip; } } else { @@ -1534,7 +1535,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta case NDIGIT: if (do_utf8) { LOAD_UTF8_CHARCLASS(digit,"0"); - while (s < strend) { + while (s + (uskip = UTF8SKIP(s)) <= strend) { if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) { if (tmp && (norun || regtry(prog, s))) goto got_it; @@ -1543,7 +1544,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta } else tmp = 1; - s += UTF8SKIP(s); + s += uskip; } } else { @@ -1563,7 +1564,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta case NDIGITL: PL_reg_flags |= RF_tainted; if (do_utf8) { - while (s < strend) { + while (s + (uskip = UTF8SKIP(s)) <= strend) { if (!isDIGIT_LC_utf8((U8*)s)) { if (tmp && (norun || regtry(prog, s))) goto got_it; @@ -1572,7 +1573,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta } else tmp = 1; - s += UTF8SKIP(s); + s += uskip; } } else { diff --git a/t/run/fresh_perl.t b/t/run/fresh_perl.t index d47f27f..dbbe6a5 100644 --- a/t/run/fresh_perl.t +++ b/t/run/fresh_perl.t @@ -855,3 +855,19 @@ print glob(q(./"TEST")); EXPECT ./"TEST" ./"TEST" +######## "Segfault using HTML::Entities", Richard Jolly , in perl-unicode@perl.org +-lw +BEGIN { + eval 'require Encode'; + if ($@) { exit 0 } # running minitest? +} +# Test case cut down by jhi +$SIG{__WARN__} = sub { $@ = shift }; +use Encode; +my $t = "\xE9"; +Encode::_utf8_on($t); +$t =~ s/([^a])//ge; +$@ =~ s/ at .*/ at/; +print $@ +EXPECT +Malformed UTF-8 character (unexpected end of string) at