Unicode character can be mapped into several.
p4raw-id: //depot/perl@12546
lib/unicore/SpecCase.txt Unicode character database
lib/unicore/To/Digit.pl Unicode character database
lib/unicore/To/Lower.pl Unicode character database
-lib/unicore/To/SpecLower.pl Unicode character database
-lib/unicore/To/SpecTitle.pl Unicode character database
-lib/unicore/To/SpecUpper.pl Unicode character database
lib/unicore/To/Title.pl Unicode character database
lib/unicore/To/Upper.pl Unicode character database
lib/unicore/UCD.html Unicode character database
#define swash_fetch Perl_swash_fetch
#define taint_env Perl_taint_env
#define taint_proper Perl_taint_proper
+#define to_utf8_case Perl_to_utf8_case
#define to_utf8_lower Perl_to_utf8_lower
#define to_utf8_upper Perl_to_utf8_upper
#define to_utf8_title Perl_to_utf8_title
#define swash_fetch(a,b,c) Perl_swash_fetch(aTHX_ a,b,c)
#define taint_env() Perl_taint_env(aTHX)
#define taint_proper(a,b) Perl_taint_proper(aTHX_ a,b)
+#define to_utf8_case(a,b,c,d,e,f) Perl_to_utf8_case(aTHX_ a,b,c,d,e,f)
#define to_utf8_lower(a,b,c) Perl_to_utf8_lower(aTHX_ a,b,c)
#define to_utf8_upper(a,b,c) Perl_to_utf8_upper(aTHX_ a,b,c)
#define to_utf8_title(a,b,c) Perl_to_utf8_title(aTHX_ a,b,c)
Ap |UV |swash_fetch |SV *sv|U8 *ptr|bool do_utf8
Ap |void |taint_env
Ap |void |taint_proper |const char* f|const char* s
+Ap |UV |to_utf8_case |U8 *p|U8* ustrp|STRLEN *lenp \
+ |SV **swash|char *normal|char *special
Ap |UV |to_utf8_lower |U8 *p|U8* ustrp|STRLEN *lenp
Ap |UV |to_utf8_upper |U8 *p|U8* ustrp|STRLEN *lenp
Ap |UV |to_utf8_title |U8 *p|U8* ustrp|STRLEN *lenp
Perl_swash_fetch
Perl_taint_env
Perl_taint_proper
+Perl_to_utf8_case
Perl_to_utf8_lower
Perl_to_utf8_upper
Perl_to_utf8_title
# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
# This file is built by mktables from e.g. Unicode.txt.
# Any changes made here will be lost!
+
+%utf8::ToSpecLower = (
+'223' => "\x{00DF}",
+'329' => "\x{0149}",
+'496' => "\x{01F0}",
+'912' => "\x{0390}",
+'944' => "\x{03B0}",
+'1415' => "\x{0587}",
+'7830' => "\x{1E96}",
+'7831' => "\x{1E97}",
+'7832' => "\x{1E98}",
+'7833' => "\x{1E99}",
+'7834' => "\x{1E9A}",
+'8016' => "\x{1F50}",
+'8018' => "\x{1F52}",
+'8020' => "\x{1F54}",
+'8022' => "\x{1F56}",
+'8064' => "\x{1F80}",
+'8065' => "\x{1F81}",
+'8066' => "\x{1F82}",
+'8067' => "\x{1F83}",
+'8068' => "\x{1F84}",
+'8069' => "\x{1F85}",
+'8070' => "\x{1F86}",
+'8071' => "\x{1F87}",
+'8072' => "\x{1F80}",
+'8073' => "\x{1F81}",
+'8074' => "\x{1F82}",
+'8075' => "\x{1F83}",
+'8076' => "\x{1F84}",
+'8077' => "\x{1F85}",
+'8078' => "\x{1F86}",
+'8079' => "\x{1F87}",
+'8080' => "\x{1F90}",
+'8081' => "\x{1F91}",
+'8082' => "\x{1F92}",
+'8083' => "\x{1F93}",
+'8084' => "\x{1F94}",
+'8085' => "\x{1F95}",
+'8086' => "\x{1F96}",
+'8087' => "\x{1F97}",
+'8088' => "\x{1F90}",
+'8089' => "\x{1F91}",
+'8090' => "\x{1F92}",
+'8091' => "\x{1F93}",
+'8092' => "\x{1F94}",
+'8093' => "\x{1F95}",
+'8094' => "\x{1F96}",
+'8095' => "\x{1F97}",
+'8096' => "\x{1FA0}",
+'8097' => "\x{1FA1}",
+'8098' => "\x{1FA2}",
+'8099' => "\x{1FA3}",
+'8100' => "\x{1FA4}",
+'8101' => "\x{1FA5}",
+'8102' => "\x{1FA6}",
+'8103' => "\x{1FA7}",
+'8104' => "\x{1FA0}",
+'8105' => "\x{1FA1}",
+'8106' => "\x{1FA2}",
+'8107' => "\x{1FA3}",
+'8108' => "\x{1FA4}",
+'8109' => "\x{1FA5}",
+'8110' => "\x{1FA6}",
+'8111' => "\x{1FA7}",
+'8114' => "\x{1FB2}",
+'8115' => "\x{1FB3}",
+'8116' => "\x{1FB4}",
+'8118' => "\x{1FB6}",
+'8119' => "\x{1FB7}",
+'8124' => "\x{1FB3}",
+'8130' => "\x{1FC2}",
+'8131' => "\x{1FC3}",
+'8132' => "\x{1FC4}",
+'8134' => "\x{1FC6}",
+'8135' => "\x{1FC7}",
+'8140' => "\x{1FC3}",
+'8146' => "\x{1FD2}",
+'8147' => "\x{1FD3}",
+'8150' => "\x{1FD6}",
+'8151' => "\x{1FD7}",
+'8162' => "\x{1FE2}",
+'8163' => "\x{1FE3}",
+'8164' => "\x{1FE4}",
+'8166' => "\x{1FE6}",
+'8167' => "\x{1FE7}",
+'8178' => "\x{1FF2}",
+'8179' => "\x{1FF3}",
+'8180' => "\x{1FF4}",
+'8182' => "\x{1FF6}",
+'8183' => "\x{1FF7}",
+'8188' => "\x{1FF3}",
+'64256' => "\x{FB00}",
+'64257' => "\x{FB01}",
+'64258' => "\x{FB02}",
+'64259' => "\x{FB03}",
+'64260' => "\x{FB04}",
+'64261' => "\x{FB05}",
+'64262' => "\x{FB06}",
+'64275' => "\x{FB13}",
+'64276' => "\x{FB14}",
+'64277' => "\x{FB15}",
+'64278' => "\x{FB16}",
+'64279' => "\x{FB17}",
+);
+
return <<'END';
0041 0061
0042 0062
+++ /dev/null
-# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
-# This file is built by mktables from e.g. Unicode.txt.
-# Any changes made here will be lost!
-return <<'END';
-00DF 00DF
-0149 0149
-01F0 01F0
-0390 0390
-03B0 03B0
-0587 0587
-1E96 1E96
-1E97 1E97
-1E98 1E98
-1E99 1E99
-1E9A 1E9A
-1F50 1F50
-1F52 1F52
-1F54 1F54
-1F56 1F56
-1F80 1F80
-1F81 1F81
-1F82 1F82
-1F83 1F83
-1F84 1F84
-1F85 1F85
-1F86 1F86
-1F87 1F87
-1F88 1F80
-1F89 1F81
-1F8A 1F82
-1F8B 1F83
-1F8C 1F84
-1F8D 1F85
-1F8E 1F86
-1F8F 1F87
-1F90 1F90
-1F91 1F91
-1F92 1F92
-1F93 1F93
-1F94 1F94
-1F95 1F95
-1F96 1F96
-1F97 1F97
-1F98 1F90
-1F99 1F91
-1F9A 1F92
-1F9B 1F93
-1F9C 1F94
-1F9D 1F95
-1F9E 1F96
-1F9F 1F97
-1FA0 1FA0
-1FA1 1FA1
-1FA2 1FA2
-1FA3 1FA3
-1FA4 1FA4
-1FA5 1FA5
-1FA6 1FA6
-1FA7 1FA7
-1FA8 1FA0
-1FA9 1FA1
-1FAA 1FA2
-1FAB 1FA3
-1FAC 1FA4
-1FAD 1FA5
-1FAE 1FA6
-1FAF 1FA7
-1FB2 1FB2
-1FB3 1FB3
-1FB4 1FB4
-1FB6 1FB6
-1FB7 1FB7
-1FBC 1FB3
-1FC2 1FC2
-1FC3 1FC3
-1FC4 1FC4
-1FC6 1FC6
-1FC7 1FC7
-1FCC 1FC3
-1FD2 1FD2
-1FD3 1FD3
-1FD6 1FD6
-1FD7 1FD7
-1FE2 1FE2
-1FE3 1FE3
-1FE4 1FE4
-1FE6 1FE6
-1FE7 1FE7
-1FF2 1FF2
-1FF3 1FF3
-1FF4 1FF4
-1FF6 1FF6
-1FF7 1FF7
-1FFC 1FF3
-FB00 FB00
-FB01 FB01
-FB02 FB02
-FB03 FB03
-FB04 FB04
-FB05 FB05
-FB06 FB06
-FB13 FB13
-FB14 FB14
-FB15 FB15
-FB16 FB16
-FB17 FB17
-END
+++ /dev/null
-# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
-# This file is built by mktables from e.g. Unicode.txt.
-# Any changes made here will be lost!
-return <<'END';
-00DF 0053 0073
-0149 02BC 004E
-01F0 004A 030C
-0390 0399 0308 0301
-03B0 03A5 0308 0301
-0587 0535 0582
-1E96 0048 0331
-1E97 0054 0308
-1E98 0057 030A
-1E99 0059 030A
-1E9A 0041 02BE
-1F50 03A5 0313
-1F52 03A5 0313 0300
-1F54 03A5 0313 0301
-1F56 03A5 0313 0342
-1F80 1F88
-1F81 1F89
-1F82 1F8A
-1F83 1F8B
-1F84 1F8C
-1F85 1F8D
-1F86 1F8E
-1F87 1F8F
-1F88 1F88
-1F89 1F89
-1F8A 1F8A
-1F8B 1F8B
-1F8C 1F8C
-1F8D 1F8D
-1F8E 1F8E
-1F8F 1F8F
-1F90 1F98
-1F91 1F99
-1F92 1F9A
-1F93 1F9B
-1F94 1F9C
-1F95 1F9D
-1F96 1F9E
-1F97 1F9F
-1F98 1F98
-1F99 1F99
-1F9A 1F9A
-1F9B 1F9B
-1F9C 1F9C
-1F9D 1F9D
-1F9E 1F9E
-1F9F 1F9F
-1FA0 1FA8
-1FA1 1FA9
-1FA2 1FAA
-1FA3 1FAB
-1FA4 1FAC
-1FA5 1FAD
-1FA6 1FAE
-1FA7 1FAF
-1FA8 1FA8
-1FA9 1FA9
-1FAA 1FAA
-1FAB 1FAB
-1FAC 1FAC
-1FAD 1FAD
-1FAE 1FAE
-1FAF 1FAF
-1FB2 1FBA 0345
-1FB3 1FBC
-1FB4 0386 0345
-1FB6 0391 0342
-1FB7 0391 0342 0345
-1FBC 1FBC
-1FC2 1FCA 0345
-1FC3 1FCC
-1FC4 0389 0345
-1FC6 0397 0342
-1FC7 0397 0342 0345
-1FCC 1FCC
-1FD2 0399 0308 0300
-1FD3 0399 0308 0301
-1FD6 0399 0342
-1FD7 0399 0308 0342
-1FE2 03A5 0308 0300
-1FE3 03A5 0308 0301
-1FE4 03A1 0313
-1FE6 03A5 0342
-1FE7 03A5 0308 0342
-1FF2 1FFA 0345
-1FF3 1FFC
-1FF4 038F 0345
-1FF6 03A9 0342
-1FF7 03A9 0342 0345
-1FFC 1FFC
-FB00 0046 0066
-FB01 0046 0069
-FB02 0046 006C
-FB03 0046 0066 0069
-FB04 0046 0066 006C
-FB05 FB06 0053 0074
-FB13 0544 0576
-FB14 0544 0565
-FB15 0544 056B
-FB16 054E 0576
-FB17 0544 056D
-END
+++ /dev/null
-# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
-# This file is built by mktables from e.g. Unicode.txt.
-# Any changes made here will be lost!
-return <<'END';
-00DF 0053 0053
-0149 02BC 004E
-01F0 004A 030C
-0390 0399 0308 0301
-03B0 03A5 0308 0301
-0587 0535 0552
-1E96 0048 0331
-1E97 0054 0308
-1E98 0057 030A
-1E99 0059 030A
-1E9A 0041 02BE
-1F50 03A5 0313
-1F52 03A5 0313 0300
-1F54 03A5 0313 0301
-1F56 03A5 0313 0342
-1F80 1F08 0399
-1F81 1F09 0399
-1F82 1F0A 0399
-1F83 1F0B 0399
-1F84 1F0C 0399
-1F85 1F0D 0399
-1F86 1F0E 0399
-1F87 1F0F 0399
-1F88 1F08 0399
-1F89 1F09 0399
-1F8A 1F0A 0399
-1F8B 1F0B 0399
-1F8C 1F0C 0399
-1F8D 1F0D 0399
-1F8E 1F0E 0399
-1F8F 1F0F 0399
-1F90 1F28 0399
-1F91 1F29 0399
-1F92 1F2A 0399
-1F93 1F2B 0399
-1F94 1F2C 0399
-1F95 1F2D 0399
-1F96 1F2E 0399
-1F97 1F2F 0399
-1F98 1F28 0399
-1F99 1F29 0399
-1F9A 1F2A 0399
-1F9B 1F2B 0399
-1F9C 1F2C 0399
-1F9D 1F2D 0399
-1F9E 1F2E 0399
-1F9F 1F2F 0399
-1FA0 1F68 0399
-1FA1 1F69 0399
-1FA2 1F6A 0399
-1FA3 1F6B 0399
-1FA4 1F6C 0399
-1FA5 1F6D 0399
-1FA6 1F6E 0399
-1FA7 1F6F 0399
-1FA8 1F68 0399
-1FA9 1F69 0399
-1FAA 1F6A 0399
-1FAB 1F6B 0399
-1FAC 1F6C 0399
-1FAD 1F6D 0399
-1FAE 1F6E 0399
-1FAF 1F6F 0399
-1FB2 1FBA 0399
-1FB3 0391 0399
-1FB4 0386 0399
-1FB6 0391 0342
-1FB7 0391 0342 0399
-1FBC 0391 0399
-1FC2 1FCA 0399
-1FC3 0397 0399
-1FC4 0389 0399
-1FC6 0397 0342
-1FC7 0397 0342 0399
-1FCC 0397 0399
-1FD2 0399 0308 0300
-1FD3 0399 0308 0301
-1FD6 0399 0342
-1FD7 0399 0308 0342
-1FE2 03A5 0308 0300
-1FE3 03A5 0308 0301
-1FE4 03A1 0313
-1FE6 03A5 0342
-1FE7 03A5 0308 0342
-1FF2 1FFA 0399
-1FF3 03A9 0399
-1FF4 038F 0399
-1FF6 03A9 0342
-1FF7 03A9 0342 0399
-1FFC 03A9 0399
-FB00 0046 0046
-FB01 0046 0049
-FB02 0046 004C
-FB03 0046 0046 0049
-FB04 0046 0046 004C
-FB05 FB06 0053 0054
-FB13 0544 0546
-FB14 0544 0535
-FB15 0544 053B
-FB16 054E 0546
-FB17 0544 053D
-END
# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
# This file is built by mktables from e.g. Unicode.txt.
# Any changes made here will be lost!
+
+%utf8::ToSpecTitle = (
+'223' => "\x{0053}\x{0073}",
+'329' => "\x{02BC}\x{004E}",
+'496' => "\x{004A}\x{030C}",
+'912' => "\x{0399}\x{0308}\x{0301}",
+'944' => "\x{03A5}\x{0308}\x{0301}",
+'1415' => "\x{0535}\x{0582}",
+'7830' => "\x{0048}\x{0331}",
+'7831' => "\x{0054}\x{0308}",
+'7832' => "\x{0057}\x{030A}",
+'7833' => "\x{0059}\x{030A}",
+'7834' => "\x{0041}\x{02BE}",
+'8016' => "\x{03A5}\x{0313}",
+'8018' => "\x{03A5}\x{0313}\x{0300}",
+'8020' => "\x{03A5}\x{0313}\x{0301}",
+'8022' => "\x{03A5}\x{0313}\x{0342}",
+'8064' => "\x{1F88}",
+'8065' => "\x{1F89}",
+'8066' => "\x{1F8A}",
+'8067' => "\x{1F8B}",
+'8068' => "\x{1F8C}",
+'8069' => "\x{1F8D}",
+'8070' => "\x{1F8E}",
+'8071' => "\x{1F8F}",
+'8072' => "\x{1F88}",
+'8073' => "\x{1F89}",
+'8074' => "\x{1F8A}",
+'8075' => "\x{1F8B}",
+'8076' => "\x{1F8C}",
+'8077' => "\x{1F8D}",
+'8078' => "\x{1F8E}",
+'8079' => "\x{1F8F}",
+'8080' => "\x{1F98}",
+'8081' => "\x{1F99}",
+'8082' => "\x{1F9A}",
+'8083' => "\x{1F9B}",
+'8084' => "\x{1F9C}",
+'8085' => "\x{1F9D}",
+'8086' => "\x{1F9E}",
+'8087' => "\x{1F9F}",
+'8088' => "\x{1F98}",
+'8089' => "\x{1F99}",
+'8090' => "\x{1F9A}",
+'8091' => "\x{1F9B}",
+'8092' => "\x{1F9C}",
+'8093' => "\x{1F9D}",
+'8094' => "\x{1F9E}",
+'8095' => "\x{1F9F}",
+'8096' => "\x{1FA8}",
+'8097' => "\x{1FA9}",
+'8098' => "\x{1FAA}",
+'8099' => "\x{1FAB}",
+'8100' => "\x{1FAC}",
+'8101' => "\x{1FAD}",
+'8102' => "\x{1FAE}",
+'8103' => "\x{1FAF}",
+'8104' => "\x{1FA8}",
+'8105' => "\x{1FA9}",
+'8106' => "\x{1FAA}",
+'8107' => "\x{1FAB}",
+'8108' => "\x{1FAC}",
+'8109' => "\x{1FAD}",
+'8110' => "\x{1FAE}",
+'8111' => "\x{1FAF}",
+'8114' => "\x{1FBA}\x{0345}",
+'8115' => "\x{1FBC}",
+'8116' => "\x{0386}\x{0345}",
+'8118' => "\x{0391}\x{0342}",
+'8119' => "\x{0391}\x{0342}\x{0345}",
+'8124' => "\x{1FBC}",
+'8130' => "\x{1FCA}\x{0345}",
+'8131' => "\x{1FCC}",
+'8132' => "\x{0389}\x{0345}",
+'8134' => "\x{0397}\x{0342}",
+'8135' => "\x{0397}\x{0342}\x{0345}",
+'8140' => "\x{1FCC}",
+'8146' => "\x{0399}\x{0308}\x{0300}",
+'8147' => "\x{0399}\x{0308}\x{0301}",
+'8150' => "\x{0399}\x{0342}",
+'8151' => "\x{0399}\x{0308}\x{0342}",
+'8162' => "\x{03A5}\x{0308}\x{0300}",
+'8163' => "\x{03A5}\x{0308}\x{0301}",
+'8164' => "\x{03A1}\x{0313}",
+'8166' => "\x{03A5}\x{0342}",
+'8167' => "\x{03A5}\x{0308}\x{0342}",
+'8178' => "\x{1FFA}\x{0345}",
+'8179' => "\x{1FFC}",
+'8180' => "\x{038F}\x{0345}",
+'8182' => "\x{03A9}\x{0342}",
+'8183' => "\x{03A9}\x{0342}\x{0345}",
+'8188' => "\x{1FFC}",
+'64256' => "\x{0046}\x{0066}",
+'64257' => "\x{0046}\x{0069}",
+'64258' => "\x{0046}\x{006C}",
+'64259' => "\x{0046}\x{0066}\x{0069}",
+'64260' => "\x{0046}\x{0066}\x{006C}",
+'64261' => "\x{0053}\x{0074}",
+'64262' => "\x{0053}\x{0074}",
+'64275' => "\x{0544}\x{0576}",
+'64276' => "\x{0544}\x{0565}",
+'64277' => "\x{0544}\x{056B}",
+'64278' => "\x{054E}\x{0576}",
+'64279' => "\x{0544}\x{056D}",
+);
+
return <<'END';
0061 0041
0062 0042
# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
# This file is built by mktables from e.g. Unicode.txt.
# Any changes made here will be lost!
+
+%utf8::ToSpecUpper = (
+'223' => "\x{0053}\x{0053}",
+'329' => "\x{02BC}\x{004E}",
+'496' => "\x{004A}\x{030C}",
+'912' => "\x{0399}\x{0308}\x{0301}",
+'944' => "\x{03A5}\x{0308}\x{0301}",
+'1415' => "\x{0535}\x{0552}",
+'7830' => "\x{0048}\x{0331}",
+'7831' => "\x{0054}\x{0308}",
+'7832' => "\x{0057}\x{030A}",
+'7833' => "\x{0059}\x{030A}",
+'7834' => "\x{0041}\x{02BE}",
+'8016' => "\x{03A5}\x{0313}",
+'8018' => "\x{03A5}\x{0313}\x{0300}",
+'8020' => "\x{03A5}\x{0313}\x{0301}",
+'8022' => "\x{03A5}\x{0313}\x{0342}",
+'8064' => "\x{1F08}\x{0399}",
+'8065' => "\x{1F09}\x{0399}",
+'8066' => "\x{1F0A}\x{0399}",
+'8067' => "\x{1F0B}\x{0399}",
+'8068' => "\x{1F0C}\x{0399}",
+'8069' => "\x{1F0D}\x{0399}",
+'8070' => "\x{1F0E}\x{0399}",
+'8071' => "\x{1F0F}\x{0399}",
+'8072' => "\x{1F08}\x{0399}",
+'8073' => "\x{1F09}\x{0399}",
+'8074' => "\x{1F0A}\x{0399}",
+'8075' => "\x{1F0B}\x{0399}",
+'8076' => "\x{1F0C}\x{0399}",
+'8077' => "\x{1F0D}\x{0399}",
+'8078' => "\x{1F0E}\x{0399}",
+'8079' => "\x{1F0F}\x{0399}",
+'8080' => "\x{1F28}\x{0399}",
+'8081' => "\x{1F29}\x{0399}",
+'8082' => "\x{1F2A}\x{0399}",
+'8083' => "\x{1F2B}\x{0399}",
+'8084' => "\x{1F2C}\x{0399}",
+'8085' => "\x{1F2D}\x{0399}",
+'8086' => "\x{1F2E}\x{0399}",
+'8087' => "\x{1F2F}\x{0399}",
+'8088' => "\x{1F28}\x{0399}",
+'8089' => "\x{1F29}\x{0399}",
+'8090' => "\x{1F2A}\x{0399}",
+'8091' => "\x{1F2B}\x{0399}",
+'8092' => "\x{1F2C}\x{0399}",
+'8093' => "\x{1F2D}\x{0399}",
+'8094' => "\x{1F2E}\x{0399}",
+'8095' => "\x{1F2F}\x{0399}",
+'8096' => "\x{1F68}\x{0399}",
+'8097' => "\x{1F69}\x{0399}",
+'8098' => "\x{1F6A}\x{0399}",
+'8099' => "\x{1F6B}\x{0399}",
+'8100' => "\x{1F6C}\x{0399}",
+'8101' => "\x{1F6D}\x{0399}",
+'8102' => "\x{1F6E}\x{0399}",
+'8103' => "\x{1F6F}\x{0399}",
+'8104' => "\x{1F68}\x{0399}",
+'8105' => "\x{1F69}\x{0399}",
+'8106' => "\x{1F6A}\x{0399}",
+'8107' => "\x{1F6B}\x{0399}",
+'8108' => "\x{1F6C}\x{0399}",
+'8109' => "\x{1F6D}\x{0399}",
+'8110' => "\x{1F6E}\x{0399}",
+'8111' => "\x{1F6F}\x{0399}",
+'8114' => "\x{1FBA}\x{0399}",
+'8115' => "\x{0391}\x{0399}",
+'8116' => "\x{0386}\x{0399}",
+'8118' => "\x{0391}\x{0342}",
+'8119' => "\x{0391}\x{0342}\x{0399}",
+'8124' => "\x{0391}\x{0399}",
+'8130' => "\x{1FCA}\x{0399}",
+'8131' => "\x{0397}\x{0399}",
+'8132' => "\x{0389}\x{0399}",
+'8134' => "\x{0397}\x{0342}",
+'8135' => "\x{0397}\x{0342}\x{0399}",
+'8140' => "\x{0397}\x{0399}",
+'8146' => "\x{0399}\x{0308}\x{0300}",
+'8147' => "\x{0399}\x{0308}\x{0301}",
+'8150' => "\x{0399}\x{0342}",
+'8151' => "\x{0399}\x{0308}\x{0342}",
+'8162' => "\x{03A5}\x{0308}\x{0300}",
+'8163' => "\x{03A5}\x{0308}\x{0301}",
+'8164' => "\x{03A1}\x{0313}",
+'8166' => "\x{03A5}\x{0342}",
+'8167' => "\x{03A5}\x{0308}\x{0342}",
+'8178' => "\x{1FFA}\x{0399}",
+'8179' => "\x{03A9}\x{0399}",
+'8180' => "\x{038F}\x{0399}",
+'8182' => "\x{03A9}\x{0342}",
+'8183' => "\x{03A9}\x{0342}\x{0399}",
+'8188' => "\x{03A9}\x{0399}",
+'64256' => "\x{0046}\x{0046}",
+'64257' => "\x{0046}\x{0049}",
+'64258' => "\x{0046}\x{004C}",
+'64259' => "\x{0046}\x{0046}\x{0049}",
+'64260' => "\x{0046}\x{0046}\x{004C}",
+'64261' => "\x{0053}\x{0054}",
+'64262' => "\x{0053}\x{0054}",
+'64275' => "\x{0544}\x{0546}",
+'64276' => "\x{0544}\x{0535}",
+'64277' => "\x{0544}\x{053B}",
+'64278' => "\x{054E}\x{0546}",
+'64279' => "\x{0544}\x{053D}",
+);
+
return <<'END';
0061 0041
0062 0042
}
# Now write out the special cases properties in their code point order.
-# The To/Spec{Lower,Title,Upper}.pl are unused for now since the swash
-# routines do not do returning multiple characters.
+# Prepend them to the To/{Upper,Lower,Title}.pl.
for my $case (qw(Lower Title Upper)) {
- my @case;
- for my $prop (sort { $a->[0] <=> $b->[0] } @{$Case{$case}}) {
- my ($ix, $code, $to) = @$prop;
- append(\@case, $code, $to);
+ my $NormalCase = do "To/$case.pl";
+ if (open(my $Case, ">To/$case.pl")) {
+ header($Case);
+ print $Case <<EOT;
+
+%utf8::ToSpec$case = (
+EOT
+ for my $prop (sort { $a->[0] <=> $b->[0] } @{$Case{$case}}) {
+ my ($ix, $code, $to) = @$prop;
+ my $tostr =
+ join "", map { sprintf "\\x{%s}", $_ } split ' ', $to;
+ print $Case qq['$ix' => "$tostr",\n];
+ }
+ print $Case <<EOT;
+);
+
+EOT
+ begin($Case);
+ print $Case $NormalCase;
+ end($Case);
+ } else {
+ die "$0: To/$case.txt: $!\n";
}
- flush(\@case, "To/Spec$case.pl");
}
# That's all, folks!
Returns an lowercased version of EXPR. This is the internal function
implementing the C<\L> escape in double-quoted strings. Respects
current LC_CTYPE locale if C<use locale> in force. See L<perllocale>
-and L<perlunicode>.
+and L<perlunicode> for more details about locale and Unicode support.
If EXPR is omitted, uses C<$_>.
Returns the value of EXPR with the first character lowercased. This
is the internal function implementing the C<\l> escape in
double-quoted strings. Respects current LC_CTYPE locale if C<use
-locale> in force. See L<perllocale> and L<perlunicode>.
+locale> in force. See L<perllocale> and L<perlunicode> for more
+details about locale and Unicode support.
If EXPR is omitted, uses C<$_>.
Returns an uppercased version of EXPR. This is the internal function
implementing the C<\U> escape in double-quoted strings. Respects
current LC_CTYPE locale if C<use locale> in force. See L<perllocale>
-and L<perlunicode>. It does not attempt to do titlecase mapping on
-initial letters. See C<ucfirst> for that.
+and L<perlunicode> for more details about locale and Unicode support.
+It does not attempt to do titlecase mapping on initial letters. See
+C<ucfirst> for that.
If EXPR is omitted, uses C<$_>.
Returns the value of EXPR with the first character in uppercase
(titlecase in Unicode). This is the internal function implementing
the C<\u> escape in double-quoted strings. Respects current LC_CTYPE
-locale if C<use locale> in force. See L<perllocale> and L<perlunicode>.
+locale if C<use locale> in force. See L<perllocale> and L<perlunicode>
+for more details about locale and Unicode support.
If EXPR is omitted, uses C<$_>.
=item *
-lc(), uc(), lcfirst(), and ucfirst() work only for some of the
-simplest cases, where the mapping goes from a single Unicode character
-to another single Unicode character, and where the mapping does not
-depend on surrounding characters, or on locales. More complex cases,
-where for example one character maps into several, are not yet
-implemented. See the Unicode Technical Report #21, Case Mappings,
-for more details. The Unicode::UCD module (part of Perl since 5.8.0)
-casespec() and casefold() interfaces supply information about the more
-complex cases.
+lc(), uc(), lcfirst(), and ucfirst() work for the following cases:
+
+=over 8
+
+=item *
+
+the case mapping is from a single Unicode character to another
+single Unicode character
+
+=item *
+
+the case mapping is from a single Unicode character to more
+than one Unicode character
+
+=back
+
+What doesn't yet work are the followng cases:
+
+=over 8
+
+=item *
+
+the "final sigma" (Greek)
+
+=item *
+
+anything to with locales (Lithuanian, Turkish, Azeri)
+
+=back
+
+See the Unicode Technical Report #21, Case Mappings, for more details.
=item *
PERL_CALLCONV UV Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8);
PERL_CALLCONV void Perl_taint_env(pTHX);
PERL_CALLCONV void Perl_taint_proper(pTHX_ const char* f, const char* s);
+PERL_CALLCONV UV Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swash, char *normal, char *special);
PERL_CALLCONV UV Perl_to_utf8_lower(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp);
PERL_CALLCONV UV Perl_to_utf8_upper(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp);
PERL_CALLCONV UV Perl_to_utf8_title(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp);
#!./perl
-print "1..40\n";
+print "1..42\n";
+
+my $test = 1;
+
+sub ok {
+ if ($_[0]) {
+ if ($_[1]) {
+ print "ok $test - $_[1]\n";
+ } else {
+ print "ok $test\n";
+ }
+ } else {
+ if ($_[1]) {
+ print "not ok $test - $_[1]\n";
+ } else {
+ print "not ok $test\n";
+ }
+ }
+ $test++;
+}
$a = "HELLO.* world";
$b = "hello.* WORLD";
-print "ok 1\n" if "\Q$a\E." eq "HELLO\\.\\*\\ world.";
-print "ok 2\n" if "\u$a" eq "HELLO\.\* world";
-print "ok 3\n" if "\l$a" eq "hELLO\.\* world";
-print "ok 4\n" if "\U$a" eq "HELLO\.\* WORLD";
-print "ok 5\n" if "\L$a" eq "hello\.\* world";
-
-print "ok 6\n" if quotemeta($a) eq "HELLO\\.\\*\\ world";
-print "ok 7\n" if ucfirst($a) eq "HELLO\.\* world";
-print "ok 8\n" if lcfirst($a) eq "hELLO\.\* world";
-print "ok 9\n" if uc($a) eq "HELLO\.\* WORLD";
-print "ok 10\n" if lc($a) eq "hello\.\* world";
-
-print "ok 11\n" if "\Q$b\E." eq "hello\\.\\*\\ WORLD.";
-print "ok 12\n" if "\u$b" eq "Hello\.\* WORLD";
-print "ok 13\n" if "\l$b" eq "hello\.\* WORLD";
-print "ok 14\n" if "\U$b" eq "HELLO\.\* WORLD";
-print "ok 15\n" if "\L$b" eq "hello\.\* world";
-
-print "ok 16\n" if quotemeta($b) eq "hello\\.\\*\\ WORLD";
-print "ok 17\n" if ucfirst($b) eq "Hello\.\* WORLD";
-print "ok 18\n" if lcfirst($b) eq "hello\.\* WORLD";
-print "ok 19\n" if uc($b) eq "HELLO\.\* WORLD";
-print "ok 20\n" if lc($b) eq "hello\.\* world";
+ok("\Q$a\E." eq "HELLO\\.\\*\\ world.", '\Q\E HELLO.* world');
+ok("\u$a" eq "HELLO\.\* world", '\u');
+ok("\l$a" eq "hELLO\.\* world", '\l');
+ok("\U$a" eq "HELLO\.\* WORLD", '\U');
+ok("\L$a" eq "hello\.\* world", '\L');
+
+ok(quotemeta($a) eq "HELLO\\.\\*\\ world", 'quotemeta');
+ok(ucfirst($a) eq "HELLO\.\* world", 'ucfirst');
+ok(lcfirst($a) eq "hELLO\.\* world", 'lcfirst');
+ok(uc($a) eq "HELLO\.\* WORLD", 'uc');
+ok(lc($a) eq "hello\.\* world", 'lc');
+
+ok("\Q$b\E." eq "hello\\.\\*\\ WORLD.", '\Q\E hello.* WORLD');
+ok("\u$b" eq "Hello\.\* WORLD", '\u');
+ok("\l$b" eq "hello\.\* WORLD", '\l');
+ok("\U$b" eq "HELLO\.\* WORLD", '\U');
+ok("\L$b" eq "hello\.\* world", '\L');
+
+ok(quotemeta($b) eq "hello\\.\\*\\ WORLD", 'quotemeta');
+ok(ucfirst($b) eq "Hello\.\* WORLD", 'ucfirst');
+ok(lcfirst($b) eq "hello\.\* WORLD", 'lcfirst');
+ok(uc($b) eq "HELLO\.\* WORLD", 'uc');
+ok(lc($b) eq "hello\.\* world", 'lc');
+
+# \x{100} is LATIN CAPITAL LETTER A WITH MACRON; its bijective lowercase is
+# \x{100}, LATIN SMALL LETTER A WITH MACRON.
$a = "\x{100}\x{101}\x{41}\x{61}";
$b = "\x{101}\x{100}\x{61}\x{41}";
-print "ok 21\n" if "\Q$a\E." eq "\x{100}\x{101}\x{41}\x{61}.";
-print "ok 22\n" if "\u$a" eq "\x{100}\x{101}\x{41}\x{61}";
-print "ok 23\n" if "\l$a" eq "\x{101}\x{101}\x{41}\x{61}";
-print "ok 24\n" if "\U$a" eq "\x{100}\x{100}\x{41}\x{41}";
-print "ok 25\n" if "\L$a" eq "\x{101}\x{101}\x{61}\x{61}";
-
-print "ok 26\n" if quotemeta($a) eq "\x{100}\x{101}\x{41}\x{61}";
-print "ok 27\n" if ucfirst($a) eq "\x{100}\x{101}\x{41}\x{61}";
-print "ok 28\n" if lcfirst($a) eq "\x{101}\x{101}\x{41}\x{61}";
-print "ok 29\n" if uc($a) eq "\x{100}\x{100}\x{41}\x{41}";
-print "ok 30\n" if lc($a) eq "\x{101}\x{101}\x{61}\x{61}";
-
-print "ok 31\n" if "\Q$b\E." eq "\x{101}\x{100}\x{61}\x{41}.";
-print "ok 32\n" if "\u$b" eq "\x{100}\x{100}\x{61}\x{41}";
-print "ok 33\n" if "\l$b" eq "\x{101}\x{100}\x{61}\x{41}";
-print "ok 34\n" if "\U$b" eq "\x{100}\x{100}\x{41}\x{41}";
-print "ok 35\n" if "\L$b" eq "\x{101}\x{101}\x{61}\x{61}";
-
-print "ok 36\n" if quotemeta($b) eq "\x{101}\x{100}\x{61}\x{41}";
-print "ok 37\n" if ucfirst($b) eq "\x{100}\x{100}\x{61}\x{41}";
-print "ok 38\n" if lcfirst($b) eq "\x{101}\x{100}\x{61}\x{41}";
-print "ok 39\n" if uc($b) eq "\x{100}\x{100}\x{41}\x{41}";
-print "ok 40\n" if lc($b) eq "\x{101}\x{101}\x{61}\x{61}";
+ok("\Q$a\E." eq "\x{100}\x{101}\x{41}\x{61}.", '\Q\E \x{100}\x{101}\x{41}\x{61}');
+ok("\u$a" eq "\x{100}\x{101}\x{41}\x{61}", '\u');
+ok("\l$a" eq "\x{101}\x{101}\x{41}\x{61}", '\l');
+ok("\U$a" eq "\x{100}\x{100}\x{41}\x{41}", '\U');
+ok("\L$a" eq "\x{101}\x{101}\x{61}\x{61}", '\L');
+
+ok(quotemeta($a) eq "\x{100}\x{101}\x{41}\x{61}", 'quotemeta');
+ok(ucfirst($a) eq "\x{100}\x{101}\x{41}\x{61}", 'ucfirst');
+ok(lcfirst($a) eq "\x{101}\x{101}\x{41}\x{61}", 'lcfirst');
+ok(uc($a) eq "\x{100}\x{100}\x{41}\x{41}", 'uc');
+ok(lc($a) eq "\x{101}\x{101}\x{61}\x{61}", 'lc');
+
+ok("\Q$b\E." eq "\x{101}\x{100}\x{61}\x{41}.", '\Q\E \x{101}\x{100}\x{61}\x{41}');
+ok("\u$b" eq "\x{100}\x{100}\x{61}\x{41}", '\u');
+ok("\l$b" eq "\x{101}\x{100}\x{61}\x{41}", '\l');
+ok("\U$b" eq "\x{100}\x{100}\x{41}\x{41}", '\U');
+ok("\L$b" eq "\x{101}\x{101}\x{61}\x{61}", '\L');
+
+ok(quotemeta($b) eq "\x{101}\x{100}\x{61}\x{41}", 'quotemeta');
+ok(ucfirst($b) eq "\x{100}\x{100}\x{61}\x{41}", 'ucfirst');
+ok(lcfirst($b) eq "\x{101}\x{100}\x{61}\x{41}", 'lcfirst');
+ok(uc($b) eq "\x{100}\x{100}\x{41}\x{41}", 'uc');
+ok(lc($b) eq "\x{101}\x{101}\x{61}\x{61}", 'lc');
+
+# \x{DF} is LATIN SMALL LETTER SHARP S, its uppercase is SS or \x{53}\x{53};
+# \x{149} is LATIN SMALL LETTER N PRECEDED BY APOSTROPHE, its uppercase is
+# \x{2BC}\x{E4} or MODIFIER LETTER APOSTROPHE and N.
+
+ok("\U\x{DF}ab\x{149}cd" eq "\x{53}\x{53}AB\x{2BC}\x{4E}CD",
+ "multicharacter uppercase");
+
+# The \x{DF} is its own lowercase, ditto for \x{149}.
+# There are no single character -> multiple characters lowercase mappings.
+ok("\L\x{DF}AB\x{149}CD" eq "\x{DF}ab\x{149}cd",
+ "multicharacter lowercase");
}
UV
-Perl_to_utf8_upper(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
+Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp,char *normal, char *special)
{
UV uv;
- if (!PL_utf8_toupper)
- PL_utf8_toupper = swash_init("utf8", "ToUpper", &PL_sv_undef, 4, 0);
- uv = swash_fetch(PL_utf8_toupper, p, TRUE);
- uv = uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p, 0);
+ if (!*swashp)
+ *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
+ uv = swash_fetch(*swashp, p, TRUE);
+ if (uv)
+ uv = UNI_TO_NATIVE(uv);
+ else {
+ HV *hv;
+ SV *keysv;
+ HE *he;
+
+ uv = utf8_to_uvchr(p, 0);
+
+ if ((hv = get_hv(special, FALSE)) &&
+ (keysv = sv_2mortal(Perl_newSVpvf(aTHX_ "%"UVuf, uv))) &&
+ (he = hv_fetch_ent(hv, keysv, FALSE, 0))) {
+ SV *val = HeVAL(he);
+ char *s = SvPV(val, *lenp);
+ U8 c = *(U8*)s;
+ if (*lenp > 1 || UNI_IS_INVARIANT(c))
+ Copy(s, ustrp, *lenp, U8);
+ else {
+ /* something in the 0x80..0xFF range */
+ ustrp[0] = UTF8_EIGHT_BIT_HI(c);
+ ustrp[1] = UTF8_EIGHT_BIT_LO(c);
+ *lenp = 2;
+ }
+ return 0;
+ }
+ }
*lenp = UNISKIP(uv);
uvuni_to_utf8(ustrp, uv);
return uv;
}
UV
-Perl_to_utf8_title(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
+Perl_to_utf8_upper(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
{
- UV uv;
+ return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
+ &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
+}
- if (!PL_utf8_totitle)
- PL_utf8_totitle = swash_init("utf8", "ToTitle", &PL_sv_undef, 4, 0);
- uv = swash_fetch(PL_utf8_totitle, p, TRUE);
- uv = uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p, 0);
- *lenp = UNISKIP(uv);
- uvuni_to_utf8(ustrp, uv);
- return uv;
+UV
+Perl_to_utf8_title(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
+{
+ return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
+ &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
}
UV
Perl_to_utf8_lower(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
{
- UV uv;
-
- if (!PL_utf8_tolower)
- PL_utf8_tolower = swash_init("utf8", "ToLower", &PL_sv_undef, 4, 0);
- uv = swash_fetch(PL_utf8_tolower, p, TRUE);
- uv = uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p, 0);
- *lenp = UNISKIP(uv);
- uvuni_to_utf8(ustrp, uv);
- return uv;
+ return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
+ &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
}
/* a "swash" is a swatch hash */