From: Karl Williamson Date: Fri, 28 May 2010 04:24:40 +0000 (-0600) Subject: Document tricks, work-arounds for user-defined casing X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=71648f9a69cff1f8ee90adfed51c64c3c2dfeaf1;p=p5sagit%2Fp5-mst-13.2.git Document tricks, work-arounds for user-defined casing And add a .t file to verify that it works. --- diff --git a/MANIFEST b/MANIFEST index a676231..197d359 100644 --- a/MANIFEST +++ b/MANIFEST @@ -4506,6 +4506,7 @@ t/op/tie.t See if tie/untie functions work t/op/time_loop.t Test that very large values don't hang gmtime and localtime. t/op/time.t See if time functions work t/op/tr.t See if tr works +t/op/turkish.t See if we can implement Turkish casing t/op/undef.t See if undef works t/op/universal.t See if UNIVERSAL class works t/op/unshift.t See if unshift works diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod index 1b4c249..bd193f8 100644 --- a/pod/perlunicode.pod +++ b/pod/perlunicode.pod @@ -899,20 +899,83 @@ to be mapped to "A"; all other characters will remain unchanged. (For serious hackers only) The above means you have to furnish a complete mapping; you can't just override a couple of characters and leave the rest -unchanged. You can find all the mappings in the directory +unchanged. You can find all the official mappings in the directory C<$Config{privlib}>F. The mapping data is returned as the here-document. The C> hashes in those files are special exception mappings derived from -C<$Config{privlib}>F. The "Digit" and +C<$Config{privlib}>F. (The "Digit" and "Fold" mappings that one can see in the directory are not directly user-accessible, one can use either the L module, or just match -case-insensitively (that's when the "Fold" mapping is used). +case-insensitively, which is what uses the "Fold" mapping. Neither are user +overridable.) -The mappings will only take effect on scalars that have been marked as having -Unicode characters, for example by using C. -Old byte-style strings are not affected. +If you have many mappings to change, you can take the official mapping data, +change by hand the affected code points, and place the whole thing into your +subroutine. But this will only be valid on Perls that use the same Unicode +version. Another option would be to have your subroutine read the official +mapping file(s) and overwrite the affected code points. -The mappings are in effect for the package they are defined in. +If you have only a few mappings to change, starting in 5.14 you can use the +following trick, here illustrated for Turkish. + + use Config; + + sub ToUpper { + my $official = do "$Config{privlib}/unicore/To/Upper.pl"; + $utf8::ToSpecUpper{'i'} = + "\N{LATIN CAPITAL LETTER I WITH DOT ABOVE}"; + return $official; + } + +This takes the official mappings and overrides just one, for "LATIN SMALL +LETTER I". The keys to the hash must be in UTF-8 (or on EBCDIC platforms, +UTF-EBCDIC), as illustrated by the inverse function. + + sub ToLower { + my $official = do $lower; + $utf8::ToSpecLower{"\xc4\xb0"} = "i"; + return $official; + } + +This example is for an ASCII platform, and C<\xc4\xb0> is the UTF-8 string that +represents C<\N{LATIN CAPITAL LETTER I WITH DOT ABOVE}>, C. + +(The trick illustrated here does work in earlier releases, but only if all the +characters you want to override have ordinal values of 256 or higher.) + +The mappings are in effect only for the package they are defined in, and only +on scalars that have been marked as having Unicode characters, for example by +using C. You can get around the latter restriction in the +scope of a C>: + + use subs qw(uc ucfirst lc lcfirst); + + sub uc($) { + my $string = shift; + utf8::upgrade($string); + return CORE::uc($string); + } + + sub lc($) { + my $string = shift; + utf8::upgrade($string); + + # Unless an I is before a dot_above, it turns into a dotless i. + $string =~ + s/I (?! [^\p{ccc=0}\p{ccc=Above}]* \x{0307} )/\x{131}/gx; + + # But when the I is followed by a dot_above, remove the + # dot_above so the end result will be i. + $string =~ s/I ([^\p{ccc=0}\p{ccc=Above}]* ) \x{0307}/i$1/gx; + return CORE::lc($string); + } + +These examples (also for Turkish) make sure the input is in UTF-8, and then +call the corresponding official function, which will use the C and +C functions you have defined in the package. The C example +shows how you can add context-dependent casing. (For Turkish, there other +required functions: C, C, and C. These are very +similar to the ones given above.) =head2 Character Encodings for Input and Output diff --git a/t/op/turkish.t b/t/op/turkish.t new file mode 100644 index 0000000..08e2bac --- /dev/null +++ b/t/op/turkish.t @@ -0,0 +1,100 @@ +# Verifies that can implement Turkish casing as defined by Unicode 5.2. + +use Config; + +BEGIN { + chdir 't'; + @INC = '../lib'; + require './test.pl'; +} + +use subs qw(lc lcfirst uc ucfirst); + +sub uc($) { + my $string = shift; + utf8::upgrade($string); + return CORE::uc($string); +} + +sub ucfirst($) { + my $string = shift; + utf8::upgrade($string); + return CORE::ucfirst($string); +} + +sub lc($) { + my $string = shift; + utf8::upgrade($string); + + # Unless an I is before a dot_above, it turns into a dotless i. + $string =~ s/I (?! [^\p{ccc=0}\p{ccc=Above}]* \x{0307} )/\x{131}/gx; + + # But when the I is followed by a dot_above, remove the dot_above so + # the end result will be i. + $string =~ s/I ([^\p{ccc=0}\p{ccc=Above}]* ) \x{0307}/i$1/gx; + return CORE::lc($string); +} + +sub lcfirst($) { + my $string = shift; + utf8::upgrade($string); + + # Unless an I is before a dot_above, it turns into a dotless i. + $string =~ s/^I (?! [^\p{ccc=0}\p{ccc=Above}]* \x{0307} )/\x{131}/x; + + # But when the I is followed by a dot_above, remove the dot_above so + # the end result will be i. + $string =~ s/^I ([^\p{ccc=0}\p{ccc=Above}]* ) \x{0307}/i$1/x; + return CORE::lcfirst($string); +} + +plan tests => 22; + +my $map_directory = "../lib/unicore/To"; +my $upper = "$map_directory/Upper.pl"; +my $lower = "$map_directory/Lower.pl"; +my $title = "$map_directory/Title.pl"; + +sub ToUpper { + my $official = do $upper; + $utf8::ToSpecUpper{'i'} = "\x{0130}"; + return $official; +} + +sub ToTitle { + my $official = do $title; + $utf8::ToSpecTitle{'i'} = "\x{0130}"; + return $official; +} + +sub ToLower { + my $official = do $lower; + $utf8::ToSpecLower{"\xc4\xb0"} = "i"; + return $official; +} + +is(uc("\x{DF}\x{DF}"), "SSSS", "Verify that uc of non-overridden multi-char works"); +is(uc("aa"), "AA", "Verify that uc of non-overridden ASCII works"); +is(uc("\x{101}\x{101}"), "\x{100}\x{100}", "Verify that uc of non-overridden utf8 works"); +is(uc("ii"), "\x{130}\x{130}", "Verify uc('i') eq \\x{130}"); + +is(ucfirst("\x{DF}\x{DF}"), "Ss\x{DF}", "Verify that ucfirst of non-overridden multi-char works"); +is(ucfirst("\x{101}\x{101}"), "\x{100}\x{101}", "Verify that ucfirst of non-overridden utf8 works"); +is(ucfirst("aa"), "Aa", "Verify that ucfirst of non-overridden ASCII works"); +is(ucfirst("ii"), "\x{130}i", "Verify ucfirst('ii') eq \"\\x{130}i\""); + +is(lc("AA"), "aa", "Verify that lc of non-overridden ASCII works"); +is(lc("\x{C0}\x{C0}"), "\x{E0}\x{E0}", "Verify that lc of non-overridden latin1 works"); +is(lc("\x{0178}\x{0178}"), "\x{FF}\x{FF}", "Verify that lc of non-overridden utf8 works"); +is(lc("II"), "\x{131}\x{131}", "Verify that lc('I') eq \\x{131}"); +is(lc("IG\x{0307}IG\x{0307}"), "\x{131}g\x{0307}\x{131}g\x{0307}", "Verify that lc(\"I...\\x{0307}\") eq \"\\x{131}...\\x{0307}\""); +is(lc("I\x{0307}I\x{0307}"), "ii", "Verify that lc(\"I\\x{0307}\") removes the \\x{0307}, leaving 'i'"); +is(lc("\x{130}\x{130}"), "ii", "Verify that lc(\"\\x{130}\") eq 'i'"); + +is(lcfirst("AA"), "aA", "Verify that lcfirst of non-overridden ASCII works"); +is(lcfirst("\x{C0}\x{C0}"), "\x{E0}\x{C0}", "Verify that lcfirst of non-overridden latin1 works"); +is(lcfirst("\x{0178}\x{0178}"), "\x{FF}\x{0178}", "Verify that lcfirst of non-overridden utf8 works"); +is(lcfirst("I"), "\x{131}", "Verify that lcfirst('II') eq \"\\x{131}I\""); +is(lcfirst("IG\x{0307}"), "\x{131}G\x{0307}", "Verify that lcfirst(\"I...\\x{0307}\") eq \"\\x{131}...\\x{0307}\""); +is(lcfirst("I\x{0307}I\x{0307}"), "iI\x{0307}", "Verify that lcfirst(\"I\\x{0307}I\\x{0307}\") removes the first \\x{0307}, leaving 'iI\\x{0307}'"); +is(lcfirst("\x{130}\x{130}"), "i\x{130}", "Verify that lcfirst(\"\\x{130}\\x{130}\") eq \"i\\x{130}\"");