Implement multicharacter case mappings where a single
Jarkko Hietaniemi [Sun, 21 Oct 2001 16:12:08 +0000 (16:12 +0000)]
Unicode character can be mapped into several.

p4raw-id: //depot/perl@12546

16 files changed:
MANIFEST
embed.h
embed.pl
global.sym
lib/unicore/To/Lower.pl
lib/unicore/To/SpecLower.pl [deleted file]
lib/unicore/To/SpecTitle.pl [deleted file]
lib/unicore/To/SpecUpper.pl [deleted file]
lib/unicore/To/Title.pl
lib/unicore/To/Upper.pl
lib/unicore/mktables
pod/perlfunc.pod
pod/perlunicode.pod
proto.h
t/op/lc.t
utf8.c

index 486a2b3..d21175d 100644 (file)
--- a/MANIFEST
+++ b/MANIFEST
@@ -1604,9 +1604,6 @@ lib/unicore/Scripts.txt           Unicode character database
 lib/unicore/SpecCase.txt       Unicode character database
 lib/unicore/To/Digit.pl                Unicode character database
 lib/unicore/To/Lower.pl                Unicode character database
-lib/unicore/To/SpecLower.pl    Unicode character database
-lib/unicore/To/SpecTitle.pl    Unicode character database
-lib/unicore/To/SpecUpper.pl    Unicode character database
 lib/unicore/To/Title.pl                Unicode character database
 lib/unicore/To/Upper.pl                Unicode character database
 lib/unicore/UCD.html           Unicode character database
diff --git a/embed.h b/embed.h
index b591206..8c584a5 100644 (file)
--- a/embed.h
+++ b/embed.h
 #define swash_fetch            Perl_swash_fetch
 #define taint_env              Perl_taint_env
 #define taint_proper           Perl_taint_proper
+#define to_utf8_case           Perl_to_utf8_case
 #define to_utf8_lower          Perl_to_utf8_lower
 #define to_utf8_upper          Perl_to_utf8_upper
 #define to_utf8_title          Perl_to_utf8_title
 #define swash_fetch(a,b,c)     Perl_swash_fetch(aTHX_ a,b,c)
 #define taint_env()            Perl_taint_env(aTHX)
 #define taint_proper(a,b)      Perl_taint_proper(aTHX_ a,b)
+#define to_utf8_case(a,b,c,d,e,f)      Perl_to_utf8_case(aTHX_ a,b,c,d,e,f)
 #define to_utf8_lower(a,b,c)   Perl_to_utf8_lower(aTHX_ a,b,c)
 #define to_utf8_upper(a,b,c)   Perl_to_utf8_upper(aTHX_ a,b,c)
 #define to_utf8_title(a,b,c)   Perl_to_utf8_title(aTHX_ a,b,c)
index 8c3ba3c..6c20660 100755 (executable)
--- a/embed.pl
+++ b/embed.pl
@@ -1818,6 +1818,8 @@ Ap        |SV*    |swash_init     |char* pkg|char* name|SV* listsv \
 Ap     |UV     |swash_fetch    |SV *sv|U8 *ptr|bool do_utf8
 Ap     |void   |taint_env
 Ap     |void   |taint_proper   |const char* f|const char* s
+Ap     |UV     |to_utf8_case   |U8 *p|U8* ustrp|STRLEN *lenp \
+                               |SV **swash|char *normal|char *special
 Ap     |UV     |to_utf8_lower  |U8 *p|U8* ustrp|STRLEN *lenp
 Ap     |UV     |to_utf8_upper  |U8 *p|U8* ustrp|STRLEN *lenp
 Ap     |UV     |to_utf8_title  |U8 *p|U8* ustrp|STRLEN *lenp
index c5a9246..ede1f3d 100644 (file)
@@ -470,6 +470,7 @@ Perl_swash_init
 Perl_swash_fetch
 Perl_taint_env
 Perl_taint_proper
+Perl_to_utf8_case
 Perl_to_utf8_lower
 Perl_to_utf8_upper
 Perl_to_utf8_title
index 0fd4f8d..ce89c8e 100644 (file)
@@ -1,6 +1,112 @@
 # !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!! 
 # This file is built by mktables from e.g. Unicode.txt.
 # Any changes made here will be lost!
+
+%utf8::ToSpecLower = (
+'223' => "\x{00DF}",
+'329' => "\x{0149}",
+'496' => "\x{01F0}",
+'912' => "\x{0390}",
+'944' => "\x{03B0}",
+'1415' => "\x{0587}",
+'7830' => "\x{1E96}",
+'7831' => "\x{1E97}",
+'7832' => "\x{1E98}",
+'7833' => "\x{1E99}",
+'7834' => "\x{1E9A}",
+'8016' => "\x{1F50}",
+'8018' => "\x{1F52}",
+'8020' => "\x{1F54}",
+'8022' => "\x{1F56}",
+'8064' => "\x{1F80}",
+'8065' => "\x{1F81}",
+'8066' => "\x{1F82}",
+'8067' => "\x{1F83}",
+'8068' => "\x{1F84}",
+'8069' => "\x{1F85}",
+'8070' => "\x{1F86}",
+'8071' => "\x{1F87}",
+'8072' => "\x{1F80}",
+'8073' => "\x{1F81}",
+'8074' => "\x{1F82}",
+'8075' => "\x{1F83}",
+'8076' => "\x{1F84}",
+'8077' => "\x{1F85}",
+'8078' => "\x{1F86}",
+'8079' => "\x{1F87}",
+'8080' => "\x{1F90}",
+'8081' => "\x{1F91}",
+'8082' => "\x{1F92}",
+'8083' => "\x{1F93}",
+'8084' => "\x{1F94}",
+'8085' => "\x{1F95}",
+'8086' => "\x{1F96}",
+'8087' => "\x{1F97}",
+'8088' => "\x{1F90}",
+'8089' => "\x{1F91}",
+'8090' => "\x{1F92}",
+'8091' => "\x{1F93}",
+'8092' => "\x{1F94}",
+'8093' => "\x{1F95}",
+'8094' => "\x{1F96}",
+'8095' => "\x{1F97}",
+'8096' => "\x{1FA0}",
+'8097' => "\x{1FA1}",
+'8098' => "\x{1FA2}",
+'8099' => "\x{1FA3}",
+'8100' => "\x{1FA4}",
+'8101' => "\x{1FA5}",
+'8102' => "\x{1FA6}",
+'8103' => "\x{1FA7}",
+'8104' => "\x{1FA0}",
+'8105' => "\x{1FA1}",
+'8106' => "\x{1FA2}",
+'8107' => "\x{1FA3}",
+'8108' => "\x{1FA4}",
+'8109' => "\x{1FA5}",
+'8110' => "\x{1FA6}",
+'8111' => "\x{1FA7}",
+'8114' => "\x{1FB2}",
+'8115' => "\x{1FB3}",
+'8116' => "\x{1FB4}",
+'8118' => "\x{1FB6}",
+'8119' => "\x{1FB7}",
+'8124' => "\x{1FB3}",
+'8130' => "\x{1FC2}",
+'8131' => "\x{1FC3}",
+'8132' => "\x{1FC4}",
+'8134' => "\x{1FC6}",
+'8135' => "\x{1FC7}",
+'8140' => "\x{1FC3}",
+'8146' => "\x{1FD2}",
+'8147' => "\x{1FD3}",
+'8150' => "\x{1FD6}",
+'8151' => "\x{1FD7}",
+'8162' => "\x{1FE2}",
+'8163' => "\x{1FE3}",
+'8164' => "\x{1FE4}",
+'8166' => "\x{1FE6}",
+'8167' => "\x{1FE7}",
+'8178' => "\x{1FF2}",
+'8179' => "\x{1FF3}",
+'8180' => "\x{1FF4}",
+'8182' => "\x{1FF6}",
+'8183' => "\x{1FF7}",
+'8188' => "\x{1FF3}",
+'64256' => "\x{FB00}",
+'64257' => "\x{FB01}",
+'64258' => "\x{FB02}",
+'64259' => "\x{FB03}",
+'64260' => "\x{FB04}",
+'64261' => "\x{FB05}",
+'64262' => "\x{FB06}",
+'64275' => "\x{FB13}",
+'64276' => "\x{FB14}",
+'64277' => "\x{FB15}",
+'64278' => "\x{FB16}",
+'64279' => "\x{FB17}",
+);
+
 return <<'END';
 0041           0061
 0042           0062
diff --git a/lib/unicore/To/SpecLower.pl b/lib/unicore/To/SpecLower.pl
deleted file mode 100644 (file)
index 18c073b..0000000
+++ /dev/null
@@ -1,107 +0,0 @@
-# !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!! 
-# This file is built by mktables from e.g. Unicode.txt.
-# Any changes made here will be lost!
-return <<'END';
-00DF           00DF
-0149           0149
-01F0           01F0
-0390           0390
-03B0           03B0
-0587           0587
-1E96           1E96
-1E97           1E97
-1E98           1E98
-1E99           1E99
-1E9A           1E9A
-1F50           1F50
-1F52           1F52
-1F54           1F54
-1F56           1F56
-1F80           1F80
-1F81           1F81
-1F82           1F82
-1F83           1F83
-1F84           1F84
-1F85           1F85
-1F86           1F86
-1F87           1F87
-1F88           1F80
-1F89           1F81
-1F8A           1F82
-1F8B           1F83
-1F8C           1F84
-1F8D           1F85
-1F8E           1F86
-1F8F           1F87
-1F90           1F90
-1F91           1F91
-1F92           1F92
-1F93           1F93
-1F94           1F94
-1F95           1F95
-1F96           1F96
-1F97           1F97
-1F98           1F90
-1F99           1F91
-1F9A           1F92
-1F9B           1F93
-1F9C           1F94
-1F9D           1F95
-1F9E           1F96
-1F9F           1F97
-1FA0           1FA0
-1FA1           1FA1
-1FA2           1FA2
-1FA3           1FA3
-1FA4           1FA4
-1FA5           1FA5
-1FA6           1FA6
-1FA7           1FA7
-1FA8           1FA0
-1FA9           1FA1
-1FAA           1FA2
-1FAB           1FA3
-1FAC           1FA4
-1FAD           1FA5
-1FAE           1FA6
-1FAF           1FA7
-1FB2           1FB2
-1FB3           1FB3
-1FB4           1FB4
-1FB6           1FB6
-1FB7           1FB7
-1FBC           1FB3
-1FC2           1FC2
-1FC3           1FC3
-1FC4           1FC4
-1FC6           1FC6
-1FC7           1FC7
-1FCC           1FC3
-1FD2           1FD2
-1FD3           1FD3
-1FD6           1FD6
-1FD7           1FD7
-1FE2           1FE2
-1FE3           1FE3
-1FE4           1FE4
-1FE6           1FE6
-1FE7           1FE7
-1FF2           1FF2
-1FF3           1FF3
-1FF4           1FF4
-1FF6           1FF6
-1FF7           1FF7
-1FFC           1FF3
-FB00           FB00
-FB01           FB01
-FB02           FB02
-FB03           FB03
-FB04           FB04
-FB05           FB05
-FB06           FB06
-FB13           FB13
-FB14           FB14
-FB15           FB15
-FB16           FB16
-FB17           FB17
-END
diff --git a/lib/unicore/To/SpecTitle.pl b/lib/unicore/To/SpecTitle.pl
deleted file mode 100644 (file)
index c3e1911..0000000
+++ /dev/null
@@ -1,106 +0,0 @@
-# !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!! 
-# This file is built by mktables from e.g. Unicode.txt.
-# Any changes made here will be lost!
-return <<'END';
-00DF           0053 0073
-0149           02BC 004E
-01F0           004A 030C
-0390           0399 0308 0301
-03B0           03A5 0308 0301
-0587           0535 0582
-1E96           0048 0331
-1E97           0054 0308
-1E98           0057 030A
-1E99           0059 030A
-1E9A           0041 02BE
-1F50           03A5 0313
-1F52           03A5 0313 0300
-1F54           03A5 0313 0301
-1F56           03A5 0313 0342
-1F80           1F88
-1F81           1F89
-1F82           1F8A
-1F83           1F8B
-1F84           1F8C
-1F85           1F8D
-1F86           1F8E
-1F87           1F8F
-1F88           1F88
-1F89           1F89
-1F8A           1F8A
-1F8B           1F8B
-1F8C           1F8C
-1F8D           1F8D
-1F8E           1F8E
-1F8F           1F8F
-1F90           1F98
-1F91           1F99
-1F92           1F9A
-1F93           1F9B
-1F94           1F9C
-1F95           1F9D
-1F96           1F9E
-1F97           1F9F
-1F98           1F98
-1F99           1F99
-1F9A           1F9A
-1F9B           1F9B
-1F9C           1F9C
-1F9D           1F9D
-1F9E           1F9E
-1F9F           1F9F
-1FA0           1FA8
-1FA1           1FA9
-1FA2           1FAA
-1FA3           1FAB
-1FA4           1FAC
-1FA5           1FAD
-1FA6           1FAE
-1FA7           1FAF
-1FA8           1FA8
-1FA9           1FA9
-1FAA           1FAA
-1FAB           1FAB
-1FAC           1FAC
-1FAD           1FAD
-1FAE           1FAE
-1FAF           1FAF
-1FB2           1FBA 0345
-1FB3           1FBC
-1FB4           0386 0345
-1FB6           0391 0342
-1FB7           0391 0342 0345
-1FBC           1FBC
-1FC2           1FCA 0345
-1FC3           1FCC
-1FC4           0389 0345
-1FC6           0397 0342
-1FC7           0397 0342 0345
-1FCC           1FCC
-1FD2           0399 0308 0300
-1FD3           0399 0308 0301
-1FD6           0399 0342
-1FD7           0399 0308 0342
-1FE2           03A5 0308 0300
-1FE3           03A5 0308 0301
-1FE4           03A1 0313
-1FE6           03A5 0342
-1FE7           03A5 0308 0342
-1FF2           1FFA 0345
-1FF3           1FFC
-1FF4           038F 0345
-1FF6           03A9 0342
-1FF7           03A9 0342 0345
-1FFC           1FFC
-FB00           0046 0066
-FB01           0046 0069
-FB02           0046 006C
-FB03           0046 0066 0069
-FB04           0046 0066 006C
-FB05   FB06    0053 0074
-FB13           0544 0576
-FB14           0544 0565
-FB15           0544 056B
-FB16           054E 0576
-FB17           0544 056D
-END
diff --git a/lib/unicore/To/SpecUpper.pl b/lib/unicore/To/SpecUpper.pl
deleted file mode 100644 (file)
index e5af4b1..0000000
+++ /dev/null
@@ -1,106 +0,0 @@
-# !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!! 
-# This file is built by mktables from e.g. Unicode.txt.
-# Any changes made here will be lost!
-return <<'END';
-00DF           0053 0053
-0149           02BC 004E
-01F0           004A 030C
-0390           0399 0308 0301
-03B0           03A5 0308 0301
-0587           0535 0552
-1E96           0048 0331
-1E97           0054 0308
-1E98           0057 030A
-1E99           0059 030A
-1E9A           0041 02BE
-1F50           03A5 0313
-1F52           03A5 0313 0300
-1F54           03A5 0313 0301
-1F56           03A5 0313 0342
-1F80           1F08 0399
-1F81           1F09 0399
-1F82           1F0A 0399
-1F83           1F0B 0399
-1F84           1F0C 0399
-1F85           1F0D 0399
-1F86           1F0E 0399
-1F87           1F0F 0399
-1F88           1F08 0399
-1F89           1F09 0399
-1F8A           1F0A 0399
-1F8B           1F0B 0399
-1F8C           1F0C 0399
-1F8D           1F0D 0399
-1F8E           1F0E 0399
-1F8F           1F0F 0399
-1F90           1F28 0399
-1F91           1F29 0399
-1F92           1F2A 0399
-1F93           1F2B 0399
-1F94           1F2C 0399
-1F95           1F2D 0399
-1F96           1F2E 0399
-1F97           1F2F 0399
-1F98           1F28 0399
-1F99           1F29 0399
-1F9A           1F2A 0399
-1F9B           1F2B 0399
-1F9C           1F2C 0399
-1F9D           1F2D 0399
-1F9E           1F2E 0399
-1F9F           1F2F 0399
-1FA0           1F68 0399
-1FA1           1F69 0399
-1FA2           1F6A 0399
-1FA3           1F6B 0399
-1FA4           1F6C 0399
-1FA5           1F6D 0399
-1FA6           1F6E 0399
-1FA7           1F6F 0399
-1FA8           1F68 0399
-1FA9           1F69 0399
-1FAA           1F6A 0399
-1FAB           1F6B 0399
-1FAC           1F6C 0399
-1FAD           1F6D 0399
-1FAE           1F6E 0399
-1FAF           1F6F 0399
-1FB2           1FBA 0399
-1FB3           0391 0399
-1FB4           0386 0399
-1FB6           0391 0342
-1FB7           0391 0342 0399
-1FBC           0391 0399
-1FC2           1FCA 0399
-1FC3           0397 0399
-1FC4           0389 0399
-1FC6           0397 0342
-1FC7           0397 0342 0399
-1FCC           0397 0399
-1FD2           0399 0308 0300
-1FD3           0399 0308 0301
-1FD6           0399 0342
-1FD7           0399 0308 0342
-1FE2           03A5 0308 0300
-1FE3           03A5 0308 0301
-1FE4           03A1 0313
-1FE6           03A5 0342
-1FE7           03A5 0308 0342
-1FF2           1FFA 0399
-1FF3           03A9 0399
-1FF4           038F 0399
-1FF6           03A9 0342
-1FF7           03A9 0342 0399
-1FFC           03A9 0399
-FB00           0046 0046
-FB01           0046 0049
-FB02           0046 004C
-FB03           0046 0046 0049
-FB04           0046 0046 004C
-FB05   FB06    0053 0054
-FB13           0544 0546
-FB14           0544 0535
-FB15           0544 053B
-FB16           054E 0546
-FB17           0544 053D
-END
index 2fca353..3da9ca9 100644 (file)
@@ -1,6 +1,112 @@
 # !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!! 
 # This file is built by mktables from e.g. Unicode.txt.
 # Any changes made here will be lost!
+
+%utf8::ToSpecTitle = (
+'223' => "\x{0053}\x{0073}",
+'329' => "\x{02BC}\x{004E}",
+'496' => "\x{004A}\x{030C}",
+'912' => "\x{0399}\x{0308}\x{0301}",
+'944' => "\x{03A5}\x{0308}\x{0301}",
+'1415' => "\x{0535}\x{0582}",
+'7830' => "\x{0048}\x{0331}",
+'7831' => "\x{0054}\x{0308}",
+'7832' => "\x{0057}\x{030A}",
+'7833' => "\x{0059}\x{030A}",
+'7834' => "\x{0041}\x{02BE}",
+'8016' => "\x{03A5}\x{0313}",
+'8018' => "\x{03A5}\x{0313}\x{0300}",
+'8020' => "\x{03A5}\x{0313}\x{0301}",
+'8022' => "\x{03A5}\x{0313}\x{0342}",
+'8064' => "\x{1F88}",
+'8065' => "\x{1F89}",
+'8066' => "\x{1F8A}",
+'8067' => "\x{1F8B}",
+'8068' => "\x{1F8C}",
+'8069' => "\x{1F8D}",
+'8070' => "\x{1F8E}",
+'8071' => "\x{1F8F}",
+'8072' => "\x{1F88}",
+'8073' => "\x{1F89}",
+'8074' => "\x{1F8A}",
+'8075' => "\x{1F8B}",
+'8076' => "\x{1F8C}",
+'8077' => "\x{1F8D}",
+'8078' => "\x{1F8E}",
+'8079' => "\x{1F8F}",
+'8080' => "\x{1F98}",
+'8081' => "\x{1F99}",
+'8082' => "\x{1F9A}",
+'8083' => "\x{1F9B}",
+'8084' => "\x{1F9C}",
+'8085' => "\x{1F9D}",
+'8086' => "\x{1F9E}",
+'8087' => "\x{1F9F}",
+'8088' => "\x{1F98}",
+'8089' => "\x{1F99}",
+'8090' => "\x{1F9A}",
+'8091' => "\x{1F9B}",
+'8092' => "\x{1F9C}",
+'8093' => "\x{1F9D}",
+'8094' => "\x{1F9E}",
+'8095' => "\x{1F9F}",
+'8096' => "\x{1FA8}",
+'8097' => "\x{1FA9}",
+'8098' => "\x{1FAA}",
+'8099' => "\x{1FAB}",
+'8100' => "\x{1FAC}",
+'8101' => "\x{1FAD}",
+'8102' => "\x{1FAE}",
+'8103' => "\x{1FAF}",
+'8104' => "\x{1FA8}",
+'8105' => "\x{1FA9}",
+'8106' => "\x{1FAA}",
+'8107' => "\x{1FAB}",
+'8108' => "\x{1FAC}",
+'8109' => "\x{1FAD}",
+'8110' => "\x{1FAE}",
+'8111' => "\x{1FAF}",
+'8114' => "\x{1FBA}\x{0345}",
+'8115' => "\x{1FBC}",
+'8116' => "\x{0386}\x{0345}",
+'8118' => "\x{0391}\x{0342}",
+'8119' => "\x{0391}\x{0342}\x{0345}",
+'8124' => "\x{1FBC}",
+'8130' => "\x{1FCA}\x{0345}",
+'8131' => "\x{1FCC}",
+'8132' => "\x{0389}\x{0345}",
+'8134' => "\x{0397}\x{0342}",
+'8135' => "\x{0397}\x{0342}\x{0345}",
+'8140' => "\x{1FCC}",
+'8146' => "\x{0399}\x{0308}\x{0300}",
+'8147' => "\x{0399}\x{0308}\x{0301}",
+'8150' => "\x{0399}\x{0342}",
+'8151' => "\x{0399}\x{0308}\x{0342}",
+'8162' => "\x{03A5}\x{0308}\x{0300}",
+'8163' => "\x{03A5}\x{0308}\x{0301}",
+'8164' => "\x{03A1}\x{0313}",
+'8166' => "\x{03A5}\x{0342}",
+'8167' => "\x{03A5}\x{0308}\x{0342}",
+'8178' => "\x{1FFA}\x{0345}",
+'8179' => "\x{1FFC}",
+'8180' => "\x{038F}\x{0345}",
+'8182' => "\x{03A9}\x{0342}",
+'8183' => "\x{03A9}\x{0342}\x{0345}",
+'8188' => "\x{1FFC}",
+'64256' => "\x{0046}\x{0066}",
+'64257' => "\x{0046}\x{0069}",
+'64258' => "\x{0046}\x{006C}",
+'64259' => "\x{0046}\x{0066}\x{0069}",
+'64260' => "\x{0046}\x{0066}\x{006C}",
+'64261' => "\x{0053}\x{0074}",
+'64262' => "\x{0053}\x{0074}",
+'64275' => "\x{0544}\x{0576}",
+'64276' => "\x{0544}\x{0565}",
+'64277' => "\x{0544}\x{056B}",
+'64278' => "\x{054E}\x{0576}",
+'64279' => "\x{0544}\x{056D}",
+);
+
 return <<'END';
 0061           0041
 0062           0042
index bfdd4ea..a9c7a9f 100644 (file)
@@ -1,6 +1,112 @@
 # !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!! 
 # This file is built by mktables from e.g. Unicode.txt.
 # Any changes made here will be lost!
+
+%utf8::ToSpecUpper = (
+'223' => "\x{0053}\x{0053}",
+'329' => "\x{02BC}\x{004E}",
+'496' => "\x{004A}\x{030C}",
+'912' => "\x{0399}\x{0308}\x{0301}",
+'944' => "\x{03A5}\x{0308}\x{0301}",
+'1415' => "\x{0535}\x{0552}",
+'7830' => "\x{0048}\x{0331}",
+'7831' => "\x{0054}\x{0308}",
+'7832' => "\x{0057}\x{030A}",
+'7833' => "\x{0059}\x{030A}",
+'7834' => "\x{0041}\x{02BE}",
+'8016' => "\x{03A5}\x{0313}",
+'8018' => "\x{03A5}\x{0313}\x{0300}",
+'8020' => "\x{03A5}\x{0313}\x{0301}",
+'8022' => "\x{03A5}\x{0313}\x{0342}",
+'8064' => "\x{1F08}\x{0399}",
+'8065' => "\x{1F09}\x{0399}",
+'8066' => "\x{1F0A}\x{0399}",
+'8067' => "\x{1F0B}\x{0399}",
+'8068' => "\x{1F0C}\x{0399}",
+'8069' => "\x{1F0D}\x{0399}",
+'8070' => "\x{1F0E}\x{0399}",
+'8071' => "\x{1F0F}\x{0399}",
+'8072' => "\x{1F08}\x{0399}",
+'8073' => "\x{1F09}\x{0399}",
+'8074' => "\x{1F0A}\x{0399}",
+'8075' => "\x{1F0B}\x{0399}",
+'8076' => "\x{1F0C}\x{0399}",
+'8077' => "\x{1F0D}\x{0399}",
+'8078' => "\x{1F0E}\x{0399}",
+'8079' => "\x{1F0F}\x{0399}",
+'8080' => "\x{1F28}\x{0399}",
+'8081' => "\x{1F29}\x{0399}",
+'8082' => "\x{1F2A}\x{0399}",
+'8083' => "\x{1F2B}\x{0399}",
+'8084' => "\x{1F2C}\x{0399}",
+'8085' => "\x{1F2D}\x{0399}",
+'8086' => "\x{1F2E}\x{0399}",
+'8087' => "\x{1F2F}\x{0399}",
+'8088' => "\x{1F28}\x{0399}",
+'8089' => "\x{1F29}\x{0399}",
+'8090' => "\x{1F2A}\x{0399}",
+'8091' => "\x{1F2B}\x{0399}",
+'8092' => "\x{1F2C}\x{0399}",
+'8093' => "\x{1F2D}\x{0399}",
+'8094' => "\x{1F2E}\x{0399}",
+'8095' => "\x{1F2F}\x{0399}",
+'8096' => "\x{1F68}\x{0399}",
+'8097' => "\x{1F69}\x{0399}",
+'8098' => "\x{1F6A}\x{0399}",
+'8099' => "\x{1F6B}\x{0399}",
+'8100' => "\x{1F6C}\x{0399}",
+'8101' => "\x{1F6D}\x{0399}",
+'8102' => "\x{1F6E}\x{0399}",
+'8103' => "\x{1F6F}\x{0399}",
+'8104' => "\x{1F68}\x{0399}",
+'8105' => "\x{1F69}\x{0399}",
+'8106' => "\x{1F6A}\x{0399}",
+'8107' => "\x{1F6B}\x{0399}",
+'8108' => "\x{1F6C}\x{0399}",
+'8109' => "\x{1F6D}\x{0399}",
+'8110' => "\x{1F6E}\x{0399}",
+'8111' => "\x{1F6F}\x{0399}",
+'8114' => "\x{1FBA}\x{0399}",
+'8115' => "\x{0391}\x{0399}",
+'8116' => "\x{0386}\x{0399}",
+'8118' => "\x{0391}\x{0342}",
+'8119' => "\x{0391}\x{0342}\x{0399}",
+'8124' => "\x{0391}\x{0399}",
+'8130' => "\x{1FCA}\x{0399}",
+'8131' => "\x{0397}\x{0399}",
+'8132' => "\x{0389}\x{0399}",
+'8134' => "\x{0397}\x{0342}",
+'8135' => "\x{0397}\x{0342}\x{0399}",
+'8140' => "\x{0397}\x{0399}",
+'8146' => "\x{0399}\x{0308}\x{0300}",
+'8147' => "\x{0399}\x{0308}\x{0301}",
+'8150' => "\x{0399}\x{0342}",
+'8151' => "\x{0399}\x{0308}\x{0342}",
+'8162' => "\x{03A5}\x{0308}\x{0300}",
+'8163' => "\x{03A5}\x{0308}\x{0301}",
+'8164' => "\x{03A1}\x{0313}",
+'8166' => "\x{03A5}\x{0342}",
+'8167' => "\x{03A5}\x{0308}\x{0342}",
+'8178' => "\x{1FFA}\x{0399}",
+'8179' => "\x{03A9}\x{0399}",
+'8180' => "\x{038F}\x{0399}",
+'8182' => "\x{03A9}\x{0342}",
+'8183' => "\x{03A9}\x{0342}\x{0399}",
+'8188' => "\x{03A9}\x{0399}",
+'64256' => "\x{0046}\x{0046}",
+'64257' => "\x{0046}\x{0049}",
+'64258' => "\x{0046}\x{004C}",
+'64259' => "\x{0046}\x{0046}\x{0049}",
+'64260' => "\x{0046}\x{0046}\x{004C}",
+'64261' => "\x{0053}\x{0054}",
+'64262' => "\x{0053}\x{0054}",
+'64275' => "\x{0544}\x{0546}",
+'64276' => "\x{0544}\x{0535}",
+'64277' => "\x{0544}\x{053B}",
+'64278' => "\x{054E}\x{0546}",
+'64279' => "\x{0544}\x{053D}",
+);
+
 return <<'END';
 0061           0041
 0062           0042
index 3328f69..7d8912d 100644 (file)
@@ -745,16 +745,32 @@ if (open(my $SpecCase, "SpecCase.txt")) {
 }
 
 # Now write out the special cases properties in their code point order.
-# The To/Spec{Lower,Title,Upper}.pl are unused for now since the swash
-# routines do not do returning multiple characters.
+# Prepend them to the To/{Upper,Lower,Title}.pl.
 
 for my $case (qw(Lower Title Upper)) {
-    my @case;
-    for my $prop (sort { $a->[0] <=> $b->[0] } @{$Case{$case}}) {
-        my ($ix, $code, $to) = @$prop;
-        append(\@case, $code, $to);
+    my $NormalCase = do "To/$case.pl";
+    if (open(my $Case, ">To/$case.pl")) {
+       header($Case);
+       print $Case <<EOT;
+
+%utf8::ToSpec$case = (
+EOT
+        for my $prop (sort { $a->[0] <=> $b->[0] } @{$Case{$case}}) {
+           my ($ix, $code, $to) = @$prop;
+           my $tostr =
+               join "", map { sprintf "\\x{%s}", $_ } split ' ', $to;
+           print $Case qq['$ix' => "$tostr",\n];
+       }
+       print $Case <<EOT;
+);
+
+EOT
+       begin($Case);
+       print $Case $NormalCase;
+       end($Case);
+    } else {
+       die "$0: To/$case.txt: $!\n";
     }
-    flush(\@case, "To/Spec$case.pl");
 }
 
 # That's all, folks!
index 6d94459..ca48470 100644 (file)
@@ -2330,7 +2330,7 @@ C<redo> work.
 Returns an lowercased version of EXPR.  This is the internal function
 implementing the C<\L> escape in double-quoted strings.  Respects
 current LC_CTYPE locale if C<use locale> in force.  See L<perllocale>
-and L<perlunicode>.
+and L<perlunicode> for more details about locale and Unicode support.
 
 If EXPR is omitted, uses C<$_>.
 
@@ -2341,7 +2341,8 @@ If EXPR is omitted, uses C<$_>.
 Returns the value of EXPR with the first character lowercased.  This
 is the internal function implementing the C<\l> escape in
 double-quoted strings.  Respects current LC_CTYPE locale if C<use
-locale> in force.  See L<perllocale> and L<perlunicode>.
+locale> in force.  See L<perllocale> and L<perlunicode> for more
+details about locale and Unicode support.
 
 If EXPR is omitted, uses C<$_>.
 
@@ -5464,8 +5465,9 @@ otherwise.
 Returns an uppercased version of EXPR.  This is the internal function
 implementing the C<\U> escape in double-quoted strings.  Respects
 current LC_CTYPE locale if C<use locale> in force.  See L<perllocale>
-and L<perlunicode>.  It does not attempt to do titlecase mapping on
-initial letters.  See C<ucfirst> for that.
+and L<perlunicode> for more details about locale and Unicode support.
+It does not attempt to do titlecase mapping on initial letters.  See
+C<ucfirst> for that.
 
 If EXPR is omitted, uses C<$_>.
 
@@ -5476,7 +5478,8 @@ If EXPR is omitted, uses C<$_>.
 Returns the value of EXPR with the first character in uppercase
 (titlecase in Unicode).  This is the internal function implementing
 the C<\u> escape in double-quoted strings.  Respects current LC_CTYPE
-locale if C<use locale> in force.  See L<perllocale> and L<perlunicode>.
+locale if C<use locale> in force.  See L<perllocale> and L<perlunicode>
+for more details about locale and Unicode support.
 
 If EXPR is omitted, uses C<$_>.
 
index 4e7c936..9b4d2e3 100644 (file)
@@ -552,15 +552,37 @@ wide bit complement.
 
 =item *
 
-lc(), uc(), lcfirst(), and ucfirst() work only for some of the
-simplest cases, where the mapping goes from a single Unicode character
-to another single Unicode character, and where the mapping does not
-depend on surrounding characters, or on locales.  More complex cases,
-where for example one character maps into several, are not yet
-implemented.  See the Unicode Technical Report #21, Case Mappings,
-for more details.  The Unicode::UCD module (part of Perl since 5.8.0)
-casespec() and casefold() interfaces supply information about the more
-complex cases.
+lc(), uc(), lcfirst(), and ucfirst() work for the following cases:
+
+=over 8
+
+=item *
+
+the case mapping is from a single Unicode character to another
+single Unicode character
+
+=item *
+
+the case mapping is from a single Unicode character to more
+than one Unicode character
+
+=back
+
+What doesn't yet work are the followng cases:
+
+=over 8
+
+=item *
+
+the "final sigma" (Greek)
+
+=item *
+
+anything to with locales (Lithuanian, Turkish, Azeri)
+
+=back
+
+See the Unicode Technical Report #21, Case Mappings, for more details.
 
 =item *
 
diff --git a/proto.h b/proto.h
index ff3ac5f..b56817a 100644 (file)
--- a/proto.h
+++ b/proto.h
@@ -797,6 +797,7 @@ PERL_CALLCONV SV*   Perl_swash_init(pTHX_ char* pkg, char* name, SV* listsv, I32 m
 PERL_CALLCONV UV       Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8);
 PERL_CALLCONV void     Perl_taint_env(pTHX);
 PERL_CALLCONV void     Perl_taint_proper(pTHX_ const char* f, const char* s);
+PERL_CALLCONV UV       Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swash, char *normal, char *special);
 PERL_CALLCONV UV       Perl_to_utf8_lower(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp);
 PERL_CALLCONV UV       Perl_to_utf8_upper(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp);
 PERL_CALLCONV UV       Perl_to_utf8_title(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp);
index 2db3a8a..9333c6c 100644 (file)
--- a/t/op/lc.t
+++ b/t/op/lc.t
@@ -1,59 +1,93 @@
 #!./perl
 
-print "1..40\n";
+print "1..42\n";
+
+my $test = 1;
+
+sub ok {
+    if ($_[0]) {
+       if ($_[1]) {
+           print "ok $test - $_[1]\n";
+       } else {
+           print "ok $test\n";
+       }
+    } else {
+       if ($_[1]) {
+           print "not ok $test - $_[1]\n";
+       } else {
+           print "not ok $test\n";
+       }
+    }
+    $test++;
+}
 
 $a = "HELLO.* world";
 $b = "hello.* WORLD";
 
-print "ok 1\n"  if "\Q$a\E."      eq "HELLO\\.\\*\\ world.";
-print "ok 2\n"  if "\u$a"         eq "HELLO\.\* world";
-print "ok 3\n"  if "\l$a"         eq "hELLO\.\* world";
-print "ok 4\n"  if "\U$a"         eq "HELLO\.\* WORLD";
-print "ok 5\n"  if "\L$a"         eq "hello\.\* world";
-
-print "ok 6\n"  if quotemeta($a)  eq "HELLO\\.\\*\\ world";
-print "ok 7\n"  if ucfirst($a)    eq "HELLO\.\* world";
-print "ok 8\n"  if lcfirst($a)    eq "hELLO\.\* world";
-print "ok 9\n"  if uc($a)         eq "HELLO\.\* WORLD";
-print "ok 10\n" if lc($a)         eq "hello\.\* world";
-
-print "ok 11\n"  if "\Q$b\E."     eq "hello\\.\\*\\ WORLD.";
-print "ok 12\n"  if "\u$b"        eq "Hello\.\* WORLD";
-print "ok 13\n"  if "\l$b"        eq "hello\.\* WORLD";
-print "ok 14\n"  if "\U$b"        eq "HELLO\.\* WORLD";
-print "ok 15\n"  if "\L$b"        eq "hello\.\* world";
-
-print "ok 16\n"  if quotemeta($b) eq "hello\\.\\*\\ WORLD";
-print "ok 17\n"  if ucfirst($b)   eq "Hello\.\* WORLD";
-print "ok 18\n"  if lcfirst($b)   eq "hello\.\* WORLD";
-print "ok 19\n"  if uc($b)        eq "HELLO\.\* WORLD";
-print "ok 20\n"  if lc($b)        eq "hello\.\* world";
+ok("\Q$a\E."      eq "HELLO\\.\\*\\ world.", '\Q\E HELLO.* world');
+ok("\u$a"         eq "HELLO\.\* world",      '\u');
+ok("\l$a"         eq "hELLO\.\* world",      '\l');
+ok("\U$a"         eq "HELLO\.\* WORLD",      '\U');
+ok("\L$a"         eq "hello\.\* world",      '\L');
+
+ok(quotemeta($a)  eq "HELLO\\.\\*\\ world",  'quotemeta');
+ok(ucfirst($a)    eq "HELLO\.\* world",      'ucfirst');
+ok(lcfirst($a)    eq "hELLO\.\* world",      'lcfirst');
+ok(uc($a)         eq "HELLO\.\* WORLD",      'uc');
+ok(lc($a)         eq "hello\.\* world",      'lc');
+
+ok("\Q$b\E."      eq "hello\\.\\*\\ WORLD.", '\Q\E hello.* WORLD');
+ok("\u$b"         eq "Hello\.\* WORLD",      '\u');
+ok("\l$b"         eq "hello\.\* WORLD",      '\l');
+ok("\U$b"         eq "HELLO\.\* WORLD",      '\U');
+ok("\L$b"         eq "hello\.\* world",      '\L');
+
+ok(quotemeta($b)  eq "hello\\.\\*\\ WORLD",  'quotemeta');
+ok(ucfirst($b)    eq "Hello\.\* WORLD",      'ucfirst');
+ok(lcfirst($b)    eq "hello\.\* WORLD",      'lcfirst');
+ok(uc($b)         eq "HELLO\.\* WORLD",      'uc');
+ok(lc($b)         eq "hello\.\* world",      'lc');
+
+# \x{100} is LATIN CAPITAL LETTER A WITH MACRON; its bijective lowercase is
+# \x{100}, LATIN SMALL LETTER A WITH MACRON.
 
 $a = "\x{100}\x{101}\x{41}\x{61}";
 $b = "\x{101}\x{100}\x{61}\x{41}";
 
-print "ok 21\n" if "\Q$a\E."      eq "\x{100}\x{101}\x{41}\x{61}.";
-print "ok 22\n" if "\u$a"         eq "\x{100}\x{101}\x{41}\x{61}";
-print "ok 23\n" if "\l$a"         eq "\x{101}\x{101}\x{41}\x{61}";
-print "ok 24\n" if "\U$a"         eq "\x{100}\x{100}\x{41}\x{41}";
-print "ok 25\n" if "\L$a"         eq "\x{101}\x{101}\x{61}\x{61}";
-
-print "ok 26\n" if quotemeta($a)  eq "\x{100}\x{101}\x{41}\x{61}";
-print "ok 27\n" if ucfirst($a)    eq "\x{100}\x{101}\x{41}\x{61}";
-print "ok 28\n" if lcfirst($a)    eq "\x{101}\x{101}\x{41}\x{61}";
-print "ok 29\n" if uc($a)         eq "\x{100}\x{100}\x{41}\x{41}";
-print "ok 30\n" if lc($a)         eq "\x{101}\x{101}\x{61}\x{61}";
-
-print "ok 31\n" if "\Q$b\E."      eq "\x{101}\x{100}\x{61}\x{41}.";
-print "ok 32\n" if "\u$b"         eq "\x{100}\x{100}\x{61}\x{41}";
-print "ok 33\n" if "\l$b"         eq "\x{101}\x{100}\x{61}\x{41}";
-print "ok 34\n" if "\U$b"         eq "\x{100}\x{100}\x{41}\x{41}";
-print "ok 35\n" if "\L$b"         eq "\x{101}\x{101}\x{61}\x{61}";
-
-print "ok 36\n"  if quotemeta($b) eq "\x{101}\x{100}\x{61}\x{41}";
-print "ok 37\n"  if ucfirst($b)   eq "\x{100}\x{100}\x{61}\x{41}";
-print "ok 38\n"  if lcfirst($b)   eq "\x{101}\x{100}\x{61}\x{41}";
-print "ok 39\n"  if uc($b)        eq "\x{100}\x{100}\x{41}\x{41}";
-print "ok 40\n"  if lc($b)        eq "\x{101}\x{101}\x{61}\x{61}";
+ok("\Q$a\E."      eq "\x{100}\x{101}\x{41}\x{61}.", '\Q\E \x{100}\x{101}\x{41}\x{61}');
+ok("\u$a"         eq "\x{100}\x{101}\x{41}\x{61}",  '\u');
+ok("\l$a"         eq "\x{101}\x{101}\x{41}\x{61}",  '\l');
+ok("\U$a"         eq "\x{100}\x{100}\x{41}\x{41}",  '\U');
+ok("\L$a"         eq "\x{101}\x{101}\x{61}\x{61}",  '\L');
+
+ok(quotemeta($a)  eq "\x{100}\x{101}\x{41}\x{61}",  'quotemeta');
+ok(ucfirst($a)    eq "\x{100}\x{101}\x{41}\x{61}",  'ucfirst');
+ok(lcfirst($a)    eq "\x{101}\x{101}\x{41}\x{61}",  'lcfirst');
+ok(uc($a)         eq "\x{100}\x{100}\x{41}\x{41}",  'uc');
+ok(lc($a)         eq "\x{101}\x{101}\x{61}\x{61}",  'lc');
+
+ok("\Q$b\E."      eq "\x{101}\x{100}\x{61}\x{41}.", '\Q\E \x{101}\x{100}\x{61}\x{41}');
+ok("\u$b"         eq "\x{100}\x{100}\x{61}\x{41}",  '\u');
+ok("\l$b"         eq "\x{101}\x{100}\x{61}\x{41}",  '\l');
+ok("\U$b"         eq "\x{100}\x{100}\x{41}\x{41}",  '\U');
+ok("\L$b"         eq "\x{101}\x{101}\x{61}\x{61}",  '\L');
+
+ok(quotemeta($b)  eq "\x{101}\x{100}\x{61}\x{41}",  'quotemeta');
+ok(ucfirst($b)    eq "\x{100}\x{100}\x{61}\x{41}",  'ucfirst');
+ok(lcfirst($b)    eq "\x{101}\x{100}\x{61}\x{41}",  'lcfirst');
+ok(uc($b)         eq "\x{100}\x{100}\x{41}\x{41}",  'uc');
+ok(lc($b)         eq "\x{101}\x{101}\x{61}\x{61}",  'lc');
+
+# \x{DF} is LATIN SMALL LETTER SHARP S, its uppercase is SS or \x{53}\x{53};
+# \x{149} is LATIN SMALL LETTER N PRECEDED BY APOSTROPHE, its uppercase is
+# \x{2BC}\x{E4} or MODIFIER LETTER APOSTROPHE and N.
+
+ok("\U\x{DF}ab\x{149}cd" eq "\x{53}\x{53}AB\x{2BC}\x{4E}CD",
+   "multicharacter uppercase");
+
+# The \x{DF} is its own lowercase, ditto for \x{149}.
+# There are no single character -> multiple characters lowercase mappings.
 
+ok("\L\x{DF}AB\x{149}CD" eq "\x{DF}ab\x{149}cd",
+   "multicharacter lowercase");
 
diff --git a/utf8.c b/utf8.c
index 0c09469..4a3fe1d 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -1181,45 +1181,63 @@ Perl_is_utf8_mark(pTHX_ U8 *p)
 }
 
 UV
-Perl_to_utf8_upper(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
+Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp,char *normal, char *special)
 {
     UV uv;
 
-    if (!PL_utf8_toupper)
-       PL_utf8_toupper = swash_init("utf8", "ToUpper", &PL_sv_undef, 4, 0);
-    uv = swash_fetch(PL_utf8_toupper, p, TRUE);
-    uv = uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p, 0);
+    if (!*swashp)
+        *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
+    uv = swash_fetch(*swashp, p, TRUE);
+    if (uv)
+        uv = UNI_TO_NATIVE(uv);
+    else {
+        HV *hv;
+        SV *keysv;
+        HE *he;
+
+        uv = utf8_to_uvchr(p, 0);
+
+        if ((hv    = get_hv(special, FALSE)) &&
+            (keysv = sv_2mortal(Perl_newSVpvf(aTHX_ "%"UVuf, uv))) &&
+            (he    = hv_fetch_ent(hv, keysv, FALSE, 0))) {
+             SV *val = HeVAL(he);
+             char *s = SvPV(val, *lenp);
+             U8 c = *(U8*)s;
+             if (*lenp > 1 || UNI_IS_INVARIANT(c))
+                  Copy(s, ustrp, *lenp, U8);
+             else {
+                  /* something in the 0x80..0xFF range */
+                  ustrp[0] = UTF8_EIGHT_BIT_HI(c);
+                  ustrp[1] = UTF8_EIGHT_BIT_LO(c);
+                  *lenp = 2;
+             }
+             return 0;
+        }
+    }
     *lenp = UNISKIP(uv);
     uvuni_to_utf8(ustrp, uv);
     return uv;
 }
 
 UV
-Perl_to_utf8_title(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
+Perl_to_utf8_upper(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
 {
-    UV uv;
+    return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
+                            &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
+}
 
-    if (!PL_utf8_totitle)
-       PL_utf8_totitle = swash_init("utf8", "ToTitle", &PL_sv_undef, 4, 0);
-    uv = swash_fetch(PL_utf8_totitle, p, TRUE);
-    uv = uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p, 0);
-    *lenp = UNISKIP(uv);
-    uvuni_to_utf8(ustrp, uv);
-    return uv;
+UV
+Perl_to_utf8_title(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
+{
+    return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
+                            &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
 }
 
 UV
 Perl_to_utf8_lower(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
 {
-    UV uv;
-
-    if (!PL_utf8_tolower)
-       PL_utf8_tolower = swash_init("utf8", "ToLower", &PL_sv_undef, 4, 0);
-    uv = swash_fetch(PL_utf8_tolower, p, TRUE);
-    uv = uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p, 0);
-    *lenp = UNISKIP(uv);
-    uvuni_to_utf8(ustrp, uv);
-    return uv;
+    return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
+                            &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
 }
 
 /* a "swash" is a swatch hash */