From: Jarkko Hietaniemi Date: Tue, 23 Oct 2001 12:26:12 +0000 (+0000) Subject: Fix multicharacter titlecase (ucfirst). X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=44bc797b2bc92bd45d512cde050d525bd921cf16;p=p5sagit%2Fp5-mst-13.2.git Fix multicharacter titlecase (ucfirst). p4raw-id: //depot/perl@12601 --- diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod index 9b4d2e3..37e2f22 100644 --- a/pod/perlunicode.pod +++ b/pod/perlunicode.pod @@ -505,10 +505,11 @@ pack('C0', ...). =item * Case translation operators use the Unicode case translation tables -when provided character input. Note that C<uc()> translates to -uppercase, while C<ucfirst> translates to titlecase (for languages -that make the distinction). Naturally the corresponding backslash -sequences have the same semantics. +when provided character input. Note that C<uc()> (also known as C<\U> +in doublequoted strings) translates to uppercase, while C<ucfirst> +(also known as C<\u> in doublequoted strings) translates to titlecase +(for languages that make the distinction). Naturally the +corresponding backslash sequences have the same semantics. 
=item * diff --git a/pp.c b/pp.c index 134f243..3d93f75 100644 --- a/pp.c +++ b/pp.c @@ -3149,27 +3149,27 @@ PP(pp_ucfirst) register U8 *s; STRLEN slen; - if (DO_UTF8(sv) && (s = (U8*)SvPV(sv, slen)) && slen && UTF8_IS_START(*s)) { - STRLEN ulen; + if (DO_UTF8(sv)) { U8 tmpbuf[UTF8_MAXLEN*2+1]; - U8 *tend; - UV uv; + STRLEN ulen; + STRLEN tculen; - toTITLE_utf8(s, tmpbuf, &ulen); /* XXX --jhi */ - uv = utf8_to_uvchr(tmpbuf, 0); - - tend = uvchr_to_utf8(tmpbuf, uv); + s = (U8*)SvPV(sv, slen); + utf8_to_uvchr(s, &ulen); - if (!SvPADTMP(sv) || tend - tmpbuf != ulen || SvREADONLY(sv)) { + toTITLE_utf8(s, tmpbuf, &tculen); + utf8_to_uvchr(tmpbuf, 0); + + if (!SvPADTMP(sv) || SvREADONLY(sv)) { dTARGET; - sv_setpvn(TARG, (char*)tmpbuf, tend - tmpbuf); + sv_setpvn(TARG, (char*)tmpbuf, tculen); sv_catpvn(TARG, (char*)(s + ulen), slen - ulen); SvUTF8_on(TARG); SETs(TARG); } else { s = (U8*)SvPV_force(sv, slen); - Copy(tmpbuf, s, ulen, U8); + Copy(tmpbuf, s, tculen, U8); } } else { @@ -3209,7 +3209,7 @@ PP(pp_lcfirst) U8 *tend; UV uv; - toLOWER_utf8(s, tmpbuf, &ulen); /* XXX --jhi */ + toLOWER_utf8(s, tmpbuf, &ulen); uv = utf8_to_uvchr(tmpbuf, 0); tend = uvchr_to_utf8(tmpbuf, uv); @@ -3277,7 +3277,7 @@ PP(pp_uc) d = (U8*)SvPVX(TARG); send = s + len; while (s < send) { - toUPPER_utf8(s, tmpbuf, &ulen); /* XXX --jhi */ + toUPPER_utf8(s, tmpbuf, &ulen); Copy(tmpbuf, d, ulen, U8); d += ulen; s += UTF8SKIP(s); @@ -3344,7 +3344,7 @@ PP(pp_lc) d = (U8*)SvPVX(TARG); send = s + len; while (s < send) { - toLOWER_utf8(s, tmpbuf, &ulen); /* XXX --jhi */ + toLOWER_utf8(s, tmpbuf, &ulen); Copy(tmpbuf, d, ulen, U8); d += ulen; s += UTF8SKIP(s); diff --git a/t/op/lc.t b/t/op/lc.t index 9333c6c..5c43248 100644 --- a/t/op/lc.t +++ b/t/op/lc.t @@ -91,3 +91,18 @@ ok("\U\x{DF}ab\x{149}cd" eq "\x{53}\x{53}AB\x{2BC}\x{4E}CD", ok("\L\x{DF}AB\x{149}CD" eq "\x{DF}ab\x{149}cd", "multicharacter lowercase"); +# titlecase is used for \u / ucfirst. 
+ +# \x{587} is ARMENIAN SMALL LIGATURE ECH YIWN and its titlecase is +# \x{535}\x{582} ARMENIAN CAPITAL LETTER ECH + ARMENIAN SMALL LETTER YIWN +# while its lowercase is +# \x{587} itself +# and its uppercase is +# \x{535}\x{552} ARMENIAN CAPITAL LETTER ECH + ARMENIAN CAPITAL LETTER YIWN + +$a = "\x{587}"; + +ok("\L\x{587}" eq "\x{587}", "ligature lowercase"); +ok("\u\x{587}" eq "\x{535}\x{582}", "ligature titlecase"); +ok("\U\x{587}" eq "\x{535}\x{552}", "ligature uppercase"); +