From: Nicholas Clark Date: Mon, 7 Nov 2005 13:22:35 +0000 (+0000) Subject: Fix bug #37628 (both lcfirst and ucfirst) X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=6f9b16a7d2fe9614c783e2d5785df2cd8d7921a0;p=p5sagit%2Fp5-mst-13.2.git Fix bug #37628 (both lcfirst and ucfirst) p4raw-id: //depot/perl@26034 --- diff --git a/pp.c b/pp.c index c3d9d2b..53ddb0c 100644 --- a/pp.c +++ b/pp.c @@ -3392,9 +3392,8 @@ PP(pp_ucfirst) utf8_to_uvchr(s, &ulen); toTITLE_utf8(s, tmpbuf, &tculen); - utf8_to_uvchr(tmpbuf, 0); - if (!SvPADTMP(sv) || SvREADONLY(sv)) { + if (!SvPADTMP(sv) || SvREADONLY(sv) || ulen != tculen) { dTARGET; /* slen is the byte length of the whole SV. * ulen is the byte length of the original Unicode character @@ -3450,17 +3449,15 @@ PP(pp_lcfirst) (s = (const U8*)SvPV_nomg_const(sv, slen)) && slen && UTF8_IS_START(*s)) { STRLEN ulen; + STRLEN lculen; U8 tmpbuf[UTF8_MAXBYTES_CASE+1]; - U8 *tend; - UV uv; - toLOWER_utf8(s, tmpbuf, &ulen); - uv = utf8_to_uvchr(tmpbuf, 0); - tend = uvchr_to_utf8(tmpbuf, uv); + utf8_to_uvchr(s, &ulen); + toLOWER_utf8(s, tmpbuf, &lculen); - if (!SvPADTMP(sv) || (STRLEN)(tend - tmpbuf) != ulen || SvREADONLY(sv)) { + if (!SvPADTMP(sv) || SvREADONLY(sv) || ulen != lculen) { dTARGET; - sv_setpvn(TARG, (char*)tmpbuf, tend - tmpbuf); + sv_setpvn(TARG, (char*)tmpbuf, lculen); if (slen > ulen) sv_catpvn(TARG, (char*)(s + ulen), slen - ulen); SvUTF8_on(TARG); diff --git a/t/uni/case.pl b/t/uni/case.pl index 43fc84b..f12e482 100644 --- a/t/uni/case.pl +++ b/t/uni/case.pl @@ -7,7 +7,7 @@ sub unidump { } sub casetest { - my ($base, $spec, $func) = @_; + my ($base, $spec, @funcs) = @_; my $file = File::Spec->catfile(File::Spec->catdir(File::Spec->updir, "lib", "unicore", "To"), "$base.pl"); @@ -45,9 +45,9 @@ sub casetest { print "# ", scalar keys %none, " noncase mappings\n"; my $tests = - (scalar keys %simple) + - (scalar keys %$spec) + - (scalar keys %none); + ((scalar keys %simple) + + (scalar keys %$spec) + + (scalar keys %none)) * @funcs; print "1..$tests\n"; my $test = 1; @@ -55,11 +55,13 @@ sub casetest { for my $i (sort keys %simple) { my $w = $simple{$i}; my $c = pack "U0U", hex $i; - my $d = $func->($c); - my $e = unidump($d); - print $d eq pack("U0U", hex $simple{$i}) ? - "ok $test # $i -> $w\n" : "not ok $test # $i -> $e ($w)\n"; - $test++; + foreach my $func (@funcs) { + my $d = $func->($c); + my $e = unidump($d); + print $d eq pack("U0U", hex $simple{$i}) ? + "ok $test # $i -> $w\n" : "not ok $test # $i -> $e ($w)\n"; + $test++; + } } for my $i (sort keys %$spec) { @@ -67,69 +69,73 @@ sub casetest { my $u = unpack "C0U", $i; my $h = sprintf "%04X", $u; my $c = chr($u); $c .= chr(0x100); chop $c; - my $d = $func->($c); - my $e = unidump($d); - if (ord "A" == 193) { # EBCDIC - # We need to a little bit of remapping. - # - # For example, in titlecase (ucfirst) mapping - # of U+0149 the Unicode mapping is U+02BC U+004E. - # The 4E is N, which in EBCDIC is 2B-- - # and the ucfirst() does that right. - # The problem is that our reference - # data is in Unicode code points. - # - # The Right Way here would be to use, say, - # Encode, to remap the less-than 0x100 code points, - # but let's try to be Encode-independent here. - # - # These are the titlecase exceptions: - # - # Unicode Unicode+EBCDIC - # - # 0149 -> 02BC 004E (02BC 002B) - # 01F0 -> 004A 030C (00A2 030C) - # 1E96 -> 0048 0331 (00E7 0331) - # 1E97 -> 0054 0308 (00E8 0308) - # 1E98 -> 0057 030A (00EF 030A) - # 1E99 -> 0059 030A (00DF 030A) - # 1E9A -> 0041 02BE (00A0 02BE) - # - # The uppercase exceptions are identical. - # - # The lowercase has one more: - # - # Unicode Unicode+EBCDIC - # - # 0130 -> 0069 0307 (00D1 0307) - # - if ($i =~ /^(0130|0149|01F0|1E96|1E97|1E98|1E99|1E9A)$/) { - $e =~ s/004E/002B/; # N - $e =~ s/004A/00A2/; # J - $e =~ s/0048/00E7/; # H - $e =~ s/0054/00E8/; # T - $e =~ s/0057/00EF/; # W - $e =~ s/0059/00DF/; # Y - $e =~ s/0041/00A0/; # A - $e =~ s/0069/00D1/; # i + foreach my $func (@funcs) { + my $d = $func->($c); + my $e = unidump($d); + if (ord "A" == 193) { # EBCDIC + # We need to a little bit of remapping. + # + # For example, in titlecase (ucfirst) mapping + # of U+0149 the Unicode mapping is U+02BC U+004E. + # The 4E is N, which in EBCDIC is 2B-- + # and the ucfirst() does that right. + # The problem is that our reference + # data is in Unicode code points. + # + # The Right Way here would be to use, say, + # Encode, to remap the less-than 0x100 code points, + # but let's try to be Encode-independent here. + # + # These are the titlecase exceptions: + # + # Unicode Unicode+EBCDIC + # + # 0149 -> 02BC 004E (02BC 002B) + # 01F0 -> 004A 030C (00A2 030C) + # 1E96 -> 0048 0331 (00E7 0331) + # 1E97 -> 0054 0308 (00E8 0308) + # 1E98 -> 0057 030A (00EF 030A) + # 1E99 -> 0059 030A (00DF 030A) + # 1E9A -> 0041 02BE (00A0 02BE) + # + # The uppercase exceptions are identical. + # + # The lowercase has one more: + # + # Unicode Unicode+EBCDIC + # + # 0130 -> 0069 0307 (00D1 0307) + # + if ($i =~ /^(0130|0149|01F0|1E96|1E97|1E98|1E99|1E9A)$/) { + $e =~ s/004E/002B/; # N + $e =~ s/004A/00A2/; # J + $e =~ s/0048/00E7/; # H + $e =~ s/0054/00E8/; # T + $e =~ s/0057/00EF/; # W + $e =~ s/0059/00DF/; # Y + $e =~ s/0041/00A0/; # A + $e =~ s/0069/00D1/; # i + } + # We have to map the output, not the input, because + # pack/unpack U has been EBCDICified, too, it would + # just undo our remapping. } - # We have to map the output, not the input, because - # pack/unpack U has been EBCDICified, too, it would - # just undo our remapping. + print $w eq $e ? + "ok $test # $i -> $w\n" : "not ok $test # $h -> $e ($w)\n"; + $test++; } - print $w eq $e ? - "ok $test # $i -> $w\n" : "not ok $test # $h -> $e ($w)\n"; - $test++; } for my $i (sort { $a <=> $b } keys %none) { my $w = $i = sprintf "%04X", $i; my $c = pack "U0U", hex $i; - my $d = $func->($c); - my $e = unidump($d); - print $d eq $c ? - "ok $test # $i -> $w\n" : "not ok $test # $i -> $e ($w)\n"; - $test++; + foreach my $func (@funcs) { + my $d = $func->($c); + my $e = unidump($d); + print $d eq $c ? + "ok $test # $i -> $w\n" : "not ok $test # $i -> $e ($w)\n"; + $test++; + } } } diff --git a/t/uni/lower.t b/t/uni/lower.t index 4420d0b..76df6de 100644 --- a/t/uni/lower.t +++ b/t/uni/lower.t @@ -4,5 +4,6 @@ BEGIN { require "case.pl"; } -casetest("Lower", \%utf8::ToSpecLower, sub { lc $_[0] }); - +casetest("Lower", \%utf8::ToSpecLower, + sub { lc $_[0] }, sub { my $a = ""; lc ($_[0] . $a) }, + sub { lcfirst $_[0] }, sub { my $a = ""; lcfirst ($_[0] . $a) }); diff --git a/t/uni/title.t b/t/uni/title.t index c0b7e3a..ae85f94 100644 --- a/t/uni/title.t +++ b/t/uni/title.t @@ -4,5 +4,5 @@ BEGIN { require "case.pl"; } -casetest("Title", \%utf8::ToSpecTitle, sub { ucfirst $_[0] }); - +casetest("Title", \%utf8::ToSpecTitle, sub { ucfirst $_[0] }, + sub { my $a = ""; ucfirst ($_[0] . $a) }); diff --git a/t/uni/upper.t b/t/uni/upper.t index 5694c26..d9f1788 100644 --- a/t/uni/upper.t +++ b/t/uni/upper.t @@ -4,5 +4,5 @@ BEGIN { require "case.pl"; } -casetest("Upper", \%utf8::ToSpecUpper, sub { uc $_[0] }); - +casetest("Upper", \%utf8::ToSpecUpper, sub { uc $_[0] }, + sub { my $a = ""; uc ($_[0] . $a) });