From: Nicholas Clark Date: Sun, 30 Apr 2006 11:14:04 +0000 (+0000) Subject: index and rindex couldn't correctly handle surprises from UTF-8 X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=73ee8be2712c500c98e5976864ba96726bf311e2;p=p5sagit%2Fp5-mst-13.2.git index and rindex couldn't correctly handle surprises from UTF-8 overloading. p4raw-id: //depot/perl@28022 --- diff --git a/pp.c b/pp.c index b937e0a..65e1d50 100644 --- a/pp.c +++ b/pp.c @@ -3183,8 +3183,8 @@ PP(pp_index) STRLEN llen = 0; I32 offset; I32 retval; - const char *tmps; - const char *tmps2; + const char *big_p; + const char *little_p; const I32 arybase = CopARYBASE_get(PL_curcop); bool big_utf8; bool little_utf8; @@ -3197,6 +3197,9 @@ PP(pp_index) } little = POPs; big = POPs; + big_p = SvPV_const(big, biglen); + little_p = SvPV_const(little, llen); + big_utf8 = DO_UTF8(big); little_utf8 = DO_UTF8(little); if (big_utf8 ^ little_utf8) { @@ -3204,9 +3207,7 @@ PP(pp_index) if (little_utf8 && !PL_encoding) { /* Well, maybe instead we might be able to downgrade the small string? */ - STRLEN little_len; - const U8 * const little_pv = (U8*) SvPV_const(little, little_len); - char * const pv = (char*)bytes_from_utf8(little_pv, &little_len, + char * const pv = (char*)bytes_from_utf8(little_p, &llen, &little_utf8); if (little_utf8) { /* If the large string is ISO-8859-1, and it's not possible to @@ -3219,13 +3220,11 @@ PP(pp_index) /* At this point, pv is a malloc()ed string. So donate it to temp to ensure it will get free()d */ little = temp = newSV(0); - sv_usepvn(temp, pv, little_len); + sv_usepvn(temp, pv, llen); + little_p = SvPVX(little); } else { - SV * const bytes = little_utf8 ? big : little; - STRLEN len; - const char * const p = SvPV_const(bytes, len); - - temp = newSVpvn(p, len); + temp = little_utf8 + ? newSVpvn(big_p, biglen) : newSVpvn(little_p, llen); if (PL_encoding) { sv_recode_to_utf8(temp, PL_encoding); @@ -3235,34 +3234,58 @@ PP(pp_index) if (little_utf8) { big = temp; big_utf8 = TRUE; + big_p = SvPV_const(big, biglen); } else { little = temp; + little_p = SvPV_const(little, llen); } } } - /* Don't actually need the NULL initialisation, but it keeps gcc quiet. */ - tmps2 = is_index ? NULL : SvPV_const(little, llen); - tmps = SvPV_const(big, biglen); + if (SvGAMAGIC(big)) { + /* Life just becomes a lot easier if I use a temporary here. + Otherwise I need to avoid calls to sv_pos_u2b(), which (dangerously) + will trigger magic and overloading again, as will fbm_instr() + */ + big = sv_2mortal(newSVpvn(big_p, biglen)); + if (big_utf8) + SvUTF8_on(big); + big_p = SvPVX(big); + } + if (SvGAMAGIC(little) || index && !SvOK(little)) { + /* index && SvOK() is a hack. fbm_instr() calls SvPV_const, which will + warn on undef, and we've already triggered a warning with the + SvPV_const some lines above. We can't remove that, as we need to + call some SvPV to trigger overloading early and find out if the + string is UTF-8. + This is all getting to messy. The API isn't quite clean enough, + because data access has side effects. + */ + little = sv_2mortal(newSVpvn(little_p, llen)); + if (little_utf8) + SvUTF8_on(little); + little_p = SvPVX(little); + } if (MAXARG < 3) offset = is_index ? 0 : biglen; else { if (big_utf8 && offset > 0) sv_pos_u2b(big, &offset, 0); - offset += llen; + if (!is_index) + offset += llen; } if (offset < 0) offset = 0; else if (offset > (I32)biglen) offset = biglen; - if (!(tmps2 = is_index - ? fbm_instr((unsigned char*)tmps + offset, - (unsigned char*)tmps + biglen, little, 0) - : rninstr(tmps, tmps + offset, - tmps2, tmps2 + llen))) + if (!(little_p = is_index + ? fbm_instr((unsigned char*)big_p + offset, + (unsigned char*)big_p + biglen, little, 0) + : rninstr(big_p, big_p + offset, + little_p, little_p + llen))) retval = -1; else { - retval = tmps2 - tmps; + retval = little_p - big_p; if (retval > 0 && big_utf8) sv_pos_b2u(big, &retval); } diff --git a/t/lib/warnings/9uninit b/t/lib/warnings/9uninit index 575161d..fadcd1b 100644 --- a/t/lib/warnings/9uninit +++ b/t/lib/warnings/9uninit @@ -873,11 +873,11 @@ Use of uninitialized value $m2 in index at - line 14. Use of uninitialized value $g1 in index at - line 15. Use of uninitialized value $m1 in index at - line 15. Use of uninitialized value $m2 in index at - line 15. -Use of uninitialized value $m2 in rindex at - line 16. Use of uninitialized value $m1 in rindex at - line 16. +Use of uninitialized value $m2 in rindex at - line 16. Use of uninitialized value $g1 in rindex at - line 17. -Use of uninitialized value $m2 in rindex at - line 17. Use of uninitialized value $m1 in rindex at - line 17. +Use of uninitialized value $m2 in rindex at - line 17. ######## use warnings 'uninitialized'; my ($m1, $v); diff --git a/t/uni/overload.t b/t/uni/overload.t index 5812425..ef61667 100644 --- a/t/uni/overload.t +++ b/t/uni/overload.t @@ -7,7 +7,7 @@ BEGIN { } } -use Test::More tests => 116; +use Test::More tests => 190; package UTF8Toggle; use strict; @@ -212,6 +212,36 @@ foreach my $operator ('print', 'syswrite', 'syswrite len', 'syswrite off', } } +my $little = "\243\243"; +my $big = " \243 $little ! $little ! $little \243 "; +my $right = rindex $big, $little; +my $right1 = rindex $big, $little, 11; +my $left = index $big, $little; +my $left1 = index $big, $little, 4; + +cmp_ok ($right, ">", $right1, "Sanity check our rindex tests"); +cmp_ok ($left, "<", $left1, "Sanity check our index tests"); + +foreach my $b ($big, UTF8Toggle->new($big)) { + foreach my $l ($little, UTF8Toggle->new($little), + UTF8Toggle->new($little, 1)) { + is (rindex ($b, $l), $right, "rindex"); + is (rindex ($b, $l), $right, "rindex"); + is (rindex ($b, $l), $right, "rindex"); + + is (rindex ($b, $l, 11), $right1, "rindex 11"); + is (rindex ($b, $l, 11), $right1, "rindex 11"); + is (rindex ($b, $l, 11), $right1, "rindex 11"); + + is (index ($b, $l), $left, "index"); + is (index ($b, $l), $left, "index"); + is (index ($b, $l), $left, "index"); + + is (index ($b, $l, 4), $left1, "index 4"); + is (index ($b, $l, 4), $left1, "index 4"); + is (index ($b, $l, 4), $left1, "index 4"); + } +} END { 1 while -f $tmpfile and unlink $tmpfile || die "unlink '$tmpfile': $!";