From: Jarkko Hietaniemi Date: Sun, 15 Oct 2000 15:19:29 +0000 (+0000) Subject: split() utf8 fixes. Should fix both 20001014.001 and 20000426.003. X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=93f04dac2ed878fbeae5ba64df628ed3ab0b6e21;p=p5sagit%2Fp5-mst-13.2.git split() utf8 fixes. Should fix both 20001014.001 and 20000426.003. The problem was that rx->minlen was in chars while pp_split() thought it would be in bytes. p4raw-id: //depot/perl@7234 --- diff --git a/pp.c b/pp.c index 389d12b..03609e8 100644 --- a/pp.c +++ b/pp.c @@ -4975,7 +4975,7 @@ PP(pp_split) AV *ary; register I32 limit = POPi; /* note, negative is forever */ SV *sv = POPs; - bool isutf = DO_UTF8(sv); + bool doutf8 = DO_UTF8(sv); STRLEN len; register char *s = SvPV(sv, len); char *strend = s + len; @@ -5078,7 +5078,7 @@ PP(pp_split) sv_setpvn(dstr, s, m-s); if (make_mortal) sv_2mortal(dstr); - if (isutf) + if (doutf8) (void)SvUTF8_on(dstr); XPUSHs(dstr); @@ -5100,7 +5100,7 @@ PP(pp_split) sv_setpvn(dstr, s, m-s); if (make_mortal) sv_2mortal(dstr); - if (isutf) + if (doutf8) (void)SvUTF8_on(dstr); XPUSHs(dstr); s = m; @@ -5111,11 +5111,11 @@ PP(pp_split) && !(rx->reganch & ROPT_ANCH)) { int tail = (rx->reganch & RE_INTUIT_TAIL); SV *csv = CALLREG_INTUIT_STRING(aTHX_ rx); - char c; len = rx->minlen; if (len == 1 && !tail) { - c = *SvPV(csv,len); + STRLEN n_a; + char c = *SvPV(csv, n_a); while (--limit) { /*SUPPRESS 530*/ for (m = s; m < strend && *m != c; m++) ; @@ -5125,10 +5125,12 @@ PP(pp_split) sv_setpvn(dstr, s, m-s); if (make_mortal) sv_2mortal(dstr); - if (isutf) + if (doutf8) (void)SvUTF8_on(dstr); XPUSHs(dstr); - s = m + 1; + /* The rx->minlen is in characters but we want to step + * s ahead by bytes. */ + s = m + (doutf8 ? SvCUR(csv) : len); } } else { @@ -5142,10 +5144,12 @@ PP(pp_split) sv_setpvn(dstr, s, m-s); if (make_mortal) sv_2mortal(dstr); - if (isutf) + if (doutf8) (void)SvUTF8_on(dstr); XPUSHs(dstr); - s = m + len; /* Fake \n at the end */ + /* The rx->minlen is in characters but we want to step + * s ahead by bytes. */ + s = m + (doutf8 ? SvCUR(csv) : len); /* Fake \n at the end */ } } } @@ -5171,7 +5175,7 @@ PP(pp_split) sv_setpvn(dstr, s, m-s); if (make_mortal) sv_2mortal(dstr); - if (isutf) + if (doutf8) (void)SvUTF8_on(dstr); XPUSHs(dstr); if (rx->nparens) { @@ -5186,7 +5190,7 @@ PP(pp_split) dstr = NEWSV(33, 0); if (make_mortal) sv_2mortal(dstr); - if (isutf) + if (doutf8) (void)SvUTF8_on(dstr); XPUSHs(dstr); } @@ -5202,11 +5206,12 @@ PP(pp_split) /* keep field after final delim? */ if (s < strend || (iters && origlimit)) { - dstr = NEWSV(34, strend-s); - sv_setpvn(dstr, s, strend-s); + STRLEN l = strend - s; + dstr = NEWSV(34, l); + sv_setpvn(dstr, s, l); if (make_mortal) sv_2mortal(dstr); - if (isutf) + if (doutf8) (void)SvUTF8_on(dstr); XPUSHs(dstr); iters++; diff --git a/t/pragma/utf8.t b/t/pragma/utf8.t index 953064c..51c084c 100755 --- a/t/pragma/utf8.t +++ b/t/pragma/utf8.t @@ -10,7 +10,7 @@ BEGIN { } } -print "1..82\n"; +print "1..87\n"; my $test = 1; @@ -437,3 +437,37 @@ sub nok_bytes { print "ok $test\n"; $test++; } + +{ + # bug id 20000426.003 + + use utf8; + + my $s = "\x20\x40\x{80}\x{100}\x{80}\x40\x20"; + + my ($a, $b, $c) = split(/\x40/, $s); + print "not " + unless $a eq "\x20" && $b eq "\x{80}\x{100}\x{80}" && $c eq $a; + print "ok $test\n"; + $test++; + + my ($a, $b) = split(/\x{100}/, $s); + print "not " unless $a eq "\x20\x40\x{80}" && $b eq "\x{80}\x40\x20"; + print "ok $test\n"; + $test++; + + my ($a, $b) = split(/\x{80}\x{100}\x{80}/, $s); + print "not " unless $a eq "\x20\x40" && $b eq "\x40\x20"; + print "ok $test\n"; + $test++; + + my ($a, $b) = split(/\x40\x{80}/, $s); + print "not " unless $a eq "\x20" && $b eq "\x{100}\x{80}\x40\x20"; + print "ok $test\n"; + $test++; + + my ($a, $b, $c) = split(/[\x40\x{80}]+/, $s); + print "not " unless $a eq "\x20" && $b eq "\x{100}" && $c eq "\x20"; + print "ok $test\n"; + $test++; +}