Unicode: add the case folding table.
Jarkko Hietaniemi [Fri, 26 Oct 2001 14:12:04 +0000 (14:12 +0000)]
p4raw-id: //depot/perl@12689

MANIFEST
lib/unicore/To/Fold.pl [new file with mode: 0644]
lib/unicore/mktables

index 2e3e9af..99e6773 100644 (file)
--- a/MANIFEST
+++ b/MANIFEST
@@ -1606,6 +1606,7 @@ lib/unicore/Scripts.pl            Unicode character database
 lib/unicore/Scripts.txt                Unicode character database
 lib/unicore/SpecCase.txt       Unicode character database
 lib/unicore/To/Digit.pl                Unicode character database
+lib/unicore/To/Fold.pl         Unicode character database
 lib/unicore/To/Lower.pl                Unicode character database
 lib/unicore/To/Title.pl                Unicode character database
 lib/unicore/To/Upper.pl                Unicode character database
diff --git a/lib/unicore/To/Fold.pl b/lib/unicore/To/Fold.pl
new file mode 100644 (file)
index 0000000..5a24150
--- /dev/null
@@ -0,0 +1,830 @@
+# !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!! 
+# This file is built by mktables from e.g. Unicode.txt.
+# Any changes made here will be lost!
+
+%utf8::ToSpecFold = (
+'223' => "\x{0073}\x{0073}",
+'304' => "\x{0069}",
+'305' => "\x{0069}",
+'329' => "\x{02BC}\x{006E}",
+'496' => "\x{006A}\x{030C}",
+'912' => "\x{03B9}\x{0308}\x{0301}",
+'944' => "\x{03C5}\x{0308}\x{0301}",
+'1415' => "\x{0565}\x{0582}",
+'7830' => "\x{0068}\x{0331}",
+'7831' => "\x{0074}\x{0308}",
+'7832' => "\x{0077}\x{030A}",
+'7833' => "\x{0079}\x{030A}",
+'7834' => "\x{0061}\x{02BE}",
+'8016' => "\x{03C5}\x{0313}",
+'8018' => "\x{03C5}\x{0313}\x{0300}",
+'8020' => "\x{03C5}\x{0313}\x{0301}",
+'8022' => "\x{03C5}\x{0313}\x{0342}",
+'8064' => "\x{1F00}\x{03B9}",
+'8065' => "\x{1F01}\x{03B9}",
+'8066' => "\x{1F02}\x{03B9}",
+'8067' => "\x{1F03}\x{03B9}",
+'8068' => "\x{1F04}\x{03B9}",
+'8069' => "\x{1F05}\x{03B9}",
+'8070' => "\x{1F06}\x{03B9}",
+'8071' => "\x{1F07}\x{03B9}",
+'8072' => "\x{1F00}\x{03B9}",
+'8073' => "\x{1F01}\x{03B9}",
+'8074' => "\x{1F02}\x{03B9}",
+'8075' => "\x{1F03}\x{03B9}",
+'8076' => "\x{1F04}\x{03B9}",
+'8077' => "\x{1F05}\x{03B9}",
+'8078' => "\x{1F06}\x{03B9}",
+'8079' => "\x{1F07}\x{03B9}",
+'8080' => "\x{1F20}\x{03B9}",
+'8081' => "\x{1F21}\x{03B9}",
+'8082' => "\x{1F22}\x{03B9}",
+'8083' => "\x{1F23}\x{03B9}",
+'8084' => "\x{1F24}\x{03B9}",
+'8085' => "\x{1F25}\x{03B9}",
+'8086' => "\x{1F26}\x{03B9}",
+'8087' => "\x{1F27}\x{03B9}",
+'8088' => "\x{1F20}\x{03B9}",
+'8089' => "\x{1F21}\x{03B9}",
+'8090' => "\x{1F22}\x{03B9}",
+'8091' => "\x{1F23}\x{03B9}",
+'8092' => "\x{1F24}\x{03B9}",
+'8093' => "\x{1F25}\x{03B9}",
+'8094' => "\x{1F26}\x{03B9}",
+'8095' => "\x{1F27}\x{03B9}",
+'8096' => "\x{1F60}\x{03B9}",
+'8097' => "\x{1F61}\x{03B9}",
+'8098' => "\x{1F62}\x{03B9}",
+'8099' => "\x{1F63}\x{03B9}",
+'8100' => "\x{1F64}\x{03B9}",
+'8101' => "\x{1F65}\x{03B9}",
+'8102' => "\x{1F66}\x{03B9}",
+'8103' => "\x{1F67}\x{03B9}",
+'8104' => "\x{1F60}\x{03B9}",
+'8105' => "\x{1F61}\x{03B9}",
+'8106' => "\x{1F62}\x{03B9}",
+'8107' => "\x{1F63}\x{03B9}",
+'8108' => "\x{1F64}\x{03B9}",
+'8109' => "\x{1F65}\x{03B9}",
+'8110' => "\x{1F66}\x{03B9}",
+'8111' => "\x{1F67}\x{03B9}",
+'8114' => "\x{1F70}\x{03B9}",
+'8115' => "\x{03B1}\x{03B9}",
+'8116' => "\x{03AC}\x{03B9}",
+'8118' => "\x{03B1}\x{0342}",
+'8119' => "\x{03B1}\x{0342}\x{03B9}",
+'8124' => "\x{03B1}\x{03B9}",
+'8130' => "\x{1F74}\x{03B9}",
+'8131' => "\x{03B7}\x{03B9}",
+'8132' => "\x{03AE}\x{03B9}",
+'8134' => "\x{03B7}\x{0342}",
+'8135' => "\x{03B7}\x{0342}\x{03B9}",
+'8140' => "\x{03B7}\x{03B9}",
+'8146' => "\x{03B9}\x{0308}\x{0300}",
+'8147' => "\x{03B9}\x{0308}\x{0301}",
+'8150' => "\x{03B9}\x{0342}",
+'8151' => "\x{03B9}\x{0308}\x{0342}",
+'8162' => "\x{03C5}\x{0308}\x{0300}",
+'8163' => "\x{03C5}\x{0308}\x{0301}",
+'8164' => "\x{03C1}\x{0313}",
+'8166' => "\x{03C5}\x{0342}",
+'8167' => "\x{03C5}\x{0308}\x{0342}",
+'8178' => "\x{1F7C}\x{03B9}",
+'8179' => "\x{03C9}\x{03B9}",
+'8180' => "\x{03CE}\x{03B9}",
+'8182' => "\x{03C9}\x{0342}",
+'8183' => "\x{03C9}\x{0342}\x{03B9}",
+'8188' => "\x{03C9}\x{03B9}",
+'64256' => "\x{0066}\x{0066}",
+'64257' => "\x{0066}\x{0069}",
+'64258' => "\x{0066}\x{006C}",
+'64259' => "\x{0066}\x{0066}\x{0069}",
+'64260' => "\x{0066}\x{0066}\x{006C}",
+'64261' => "\x{0073}\x{0074}",
+'64262' => "\x{0073}\x{0074}",
+'64275' => "\x{0574}\x{0576}",
+'64276' => "\x{0574}\x{0565}",
+'64277' => "\x{0574}\x{056B}",
+'64278' => "\x{057E}\x{0576}",
+'64279' => "\x{0574}\x{056D}",
+);
+
+return <<'END';
+0041           0061
+0042           0062
+0043           0063
+0044           0064
+0045           0065
+0046           0066
+0047           0067
+0048           0068
+0049           0069
+004A           006A
+004B           006B
+004C           006C
+004D           006D
+004E           006E
+004F           006F
+0050           0070
+0051           0071
+0052           0072
+0053           0073
+0054           0074
+0055           0075
+0056           0076
+0057           0077
+0058           0078
+0059           0079
+005A           007A
+00B5           03BC
+00C0           00E0
+00C1           00E1
+00C2           00E2
+00C3           00E3
+00C4           00E4
+00C5           00E5
+00C6           00E6
+00C7           00E7
+00C8           00E8
+00C9           00E9
+00CA           00EA
+00CB           00EB
+00CC           00EC
+00CD           00ED
+00CE           00EE
+00CF           00EF
+00D0           00F0
+00D1           00F1
+00D2           00F2
+00D3           00F3
+00D4           00F4
+00D5           00F5
+00D6           00F6
+00D8           00F8
+00D9           00F9
+00DA           00FA
+00DB           00FB
+00DC           00FC
+00DD           00FD
+00DE           00FE
+0100           0101
+0102           0103
+0104           0105
+0106           0107
+0108           0109
+010A           010B
+010C           010D
+010E           010F
+0110           0111
+0112           0113
+0114           0115
+0116           0117
+0118           0119
+011A           011B
+011C           011D
+011E           011F
+0120           0121
+0122           0123
+0124           0125
+0126           0127
+0128           0129
+012A           012B
+012C           012D
+012E           012F
+0132           0133
+0134           0135
+0136           0137
+0139           013A
+013B           013C
+013D           013E
+013F           0140
+0141           0142
+0143           0144
+0145           0146
+0147           0148
+014A           014B
+014C           014D
+014E           014F
+0150           0151
+0152           0153
+0154           0155
+0156           0157
+0158           0159
+015A           015B
+015C           015D
+015E           015F
+0160           0161
+0162           0163
+0164           0165
+0166           0167
+0168           0169
+016A           016B
+016C           016D
+016E           016F
+0170           0171
+0172           0173
+0174           0175
+0176           0177
+0178           00FF
+0179           017A
+017B           017C
+017D           017E
+017F           0073
+0181           0253
+0182           0183
+0184           0185
+0186           0254
+0187           0188
+0189           0256
+018A           0257
+018B           018C
+018E           01DD
+018F           0259
+0190           025B
+0191           0192
+0193           0260
+0194           0263
+0196           0269
+0197           0268
+0198           0199
+019C           026F
+019D           0272
+019F           0275
+01A0           01A1
+01A2           01A3
+01A4           01A5
+01A6           0280
+01A7           01A8
+01A9           0283
+01AC           01AD
+01AE           0288
+01AF           01B0
+01B1           028A
+01B2           028B
+01B3           01B4
+01B5           01B6
+01B7           0292
+01B8           01B9
+01BC           01BD
+01C4   01C5    01C6
+01C7   01C8    01C9
+01CA   01CB    01CC
+01CD           01CE
+01CF           01D0
+01D1           01D2
+01D3           01D4
+01D5           01D6
+01D7           01D8
+01D9           01DA
+01DB           01DC
+01DE           01DF
+01E0           01E1
+01E2           01E3
+01E4           01E5
+01E6           01E7
+01E8           01E9
+01EA           01EB
+01EC           01ED
+01EE           01EF
+01F1   01F2    01F3
+01F4           01F5
+01F6           0195
+01F7           01BF
+01F8           01F9
+01FA           01FB
+01FC           01FD
+01FE           01FF
+0200           0201
+0202           0203
+0204           0205
+0206           0207
+0208           0209
+020A           020B
+020C           020D
+020E           020F
+0210           0211
+0212           0213
+0214           0215
+0216           0217
+0218           0219
+021A           021B
+021C           021D
+021E           021F
+0222           0223
+0224           0225
+0226           0227
+0228           0229
+022A           022B
+022C           022D
+022E           022F
+0230           0231
+0232           0233
+0345           03B9
+0386           03AC
+0388           03AD
+0389           03AE
+038A           03AF
+038C           03CC
+038E           03CD
+038F           03CE
+0391           03B1
+0392           03B2
+0393           03B3
+0394           03B4
+0395           03B5
+0396           03B6
+0397           03B7
+0398           03B8
+0399           03B9
+039A           03BA
+039B           03BB
+039C           03BC
+039D           03BD
+039E           03BE
+039F           03BF
+03A0           03C0
+03A1           03C1
+03A3           03C3
+03A4           03C4
+03A5           03C5
+03A6           03C6
+03A7           03C7
+03A8           03C8
+03A9           03C9
+03AA           03CA
+03AB           03CB
+03C2           03C3
+03D0           03B2
+03D1           03B8
+03D5           03C6
+03D6           03C0
+03DA           03DB
+03DC           03DD
+03DE           03DF
+03E0           03E1
+03E2           03E3
+03E4           03E5
+03E6           03E7
+03E8           03E9
+03EA           03EB
+03EC           03ED
+03EE           03EF
+03F0           03BA
+03F1           03C1
+03F2           03C3
+03F4           03B8
+03F5           03B5
+0400           0450
+0401           0451
+0402           0452
+0403           0453
+0404           0454
+0405           0455
+0406           0456
+0407           0457
+0408           0458
+0409           0459
+040A           045A
+040B           045B
+040C           045C
+040D           045D
+040E           045E
+040F           045F
+0410           0430
+0411           0431
+0412           0432
+0413           0433
+0414           0434
+0415           0435
+0416           0436
+0417           0437
+0418           0438
+0419           0439
+041A           043A
+041B           043B
+041C           043C
+041D           043D
+041E           043E
+041F           043F
+0420           0440
+0421           0441
+0422           0442
+0423           0443
+0424           0444
+0425           0445
+0426           0446
+0427           0447
+0428           0448
+0429           0449
+042A           044A
+042B           044B
+042C           044C
+042D           044D
+042E           044E
+042F           044F
+0460           0461
+0462           0463
+0464           0465
+0466           0467
+0468           0469
+046A           046B
+046C           046D
+046E           046F
+0470           0471
+0472           0473
+0474           0475
+0476           0477
+0478           0479
+047A           047B
+047C           047D
+047E           047F
+0480           0481
+048C           048D
+048E           048F
+0490           0491
+0492           0493
+0494           0495
+0496           0497
+0498           0499
+049A           049B
+049C           049D
+049E           049F
+04A0           04A1
+04A2           04A3
+04A4           04A5
+04A6           04A7
+04A8           04A9
+04AA           04AB
+04AC           04AD
+04AE           04AF
+04B0           04B1
+04B2           04B3
+04B4           04B5
+04B6           04B7
+04B8           04B9
+04BA           04BB
+04BC           04BD
+04BE           04BF
+04C1           04C2
+04C3           04C4
+04C7           04C8
+04CB           04CC
+04D0           04D1
+04D2           04D3
+04D4           04D5
+04D6           04D7
+04D8           04D9
+04DA           04DB
+04DC           04DD
+04DE           04DF
+04E0           04E1
+04E2           04E3
+04E4           04E5
+04E6           04E7
+04E8           04E9
+04EA           04EB
+04EC           04ED
+04EE           04EF
+04F0           04F1
+04F2           04F3
+04F4           04F5
+04F8           04F9
+0531           0561
+0532           0562
+0533           0563
+0534           0564
+0535           0565
+0536           0566
+0537           0567
+0538           0568
+0539           0569
+053A           056A
+053B           056B
+053C           056C
+053D           056D
+053E           056E
+053F           056F
+0540           0570
+0541           0571
+0542           0572
+0543           0573
+0544           0574
+0545           0575
+0546           0576
+0547           0577
+0548           0578
+0549           0579
+054A           057A
+054B           057B
+054C           057C
+054D           057D
+054E           057E
+054F           057F
+0550           0580
+0551           0581
+0552           0582
+0553           0583
+0554           0584
+0555           0585
+0556           0586
+1E00           1E01
+1E02           1E03
+1E04           1E05
+1E06           1E07
+1E08           1E09
+1E0A           1E0B
+1E0C           1E0D
+1E0E           1E0F
+1E10           1E11
+1E12           1E13
+1E14           1E15
+1E16           1E17
+1E18           1E19
+1E1A           1E1B
+1E1C           1E1D
+1E1E           1E1F
+1E20           1E21
+1E22           1E23
+1E24           1E25
+1E26           1E27
+1E28           1E29
+1E2A           1E2B
+1E2C           1E2D
+1E2E           1E2F
+1E30           1E31
+1E32           1E33
+1E34           1E35
+1E36           1E37
+1E38           1E39
+1E3A           1E3B
+1E3C           1E3D
+1E3E           1E3F
+1E40           1E41
+1E42           1E43
+1E44           1E45
+1E46           1E47
+1E48           1E49
+1E4A           1E4B
+1E4C           1E4D
+1E4E           1E4F
+1E50           1E51
+1E52           1E53
+1E54           1E55
+1E56           1E57
+1E58           1E59
+1E5A           1E5B
+1E5C           1E5D
+1E5E           1E5F
+1E60           1E61
+1E62           1E63
+1E64           1E65
+1E66           1E67
+1E68           1E69
+1E6A           1E6B
+1E6C           1E6D
+1E6E           1E6F
+1E70           1E71
+1E72           1E73
+1E74           1E75
+1E76           1E77
+1E78           1E79
+1E7A           1E7B
+1E7C           1E7D
+1E7E           1E7F
+1E80           1E81
+1E82           1E83
+1E84           1E85
+1E86           1E87
+1E88           1E89
+1E8A           1E8B
+1E8C           1E8D
+1E8E           1E8F
+1E90           1E91
+1E92           1E93
+1E94           1E95
+1E9B           1E61
+1EA0           1EA1
+1EA2           1EA3
+1EA4           1EA5
+1EA6           1EA7
+1EA8           1EA9
+1EAA           1EAB
+1EAC           1EAD
+1EAE           1EAF
+1EB0           1EB1
+1EB2           1EB3
+1EB4           1EB5
+1EB6           1EB7
+1EB8           1EB9
+1EBA           1EBB
+1EBC           1EBD
+1EBE           1EBF
+1EC0           1EC1
+1EC2           1EC3
+1EC4           1EC5
+1EC6           1EC7
+1EC8           1EC9
+1ECA           1ECB
+1ECC           1ECD
+1ECE           1ECF
+1ED0           1ED1
+1ED2           1ED3
+1ED4           1ED5
+1ED6           1ED7
+1ED8           1ED9
+1EDA           1EDB
+1EDC           1EDD
+1EDE           1EDF
+1EE0           1EE1
+1EE2           1EE3
+1EE4           1EE5
+1EE6           1EE7
+1EE8           1EE9
+1EEA           1EEB
+1EEC           1EED
+1EEE           1EEF
+1EF0           1EF1
+1EF2           1EF3
+1EF4           1EF5
+1EF6           1EF7
+1EF8           1EF9
+1F08           1F00
+1F09           1F01
+1F0A           1F02
+1F0B           1F03
+1F0C           1F04
+1F0D           1F05
+1F0E           1F06
+1F0F           1F07
+1F18           1F10
+1F19           1F11
+1F1A           1F12
+1F1B           1F13
+1F1C           1F14
+1F1D           1F15
+1F28           1F20
+1F29           1F21
+1F2A           1F22
+1F2B           1F23
+1F2C           1F24
+1F2D           1F25
+1F2E           1F26
+1F2F           1F27
+1F38           1F30
+1F39           1F31
+1F3A           1F32
+1F3B           1F33
+1F3C           1F34
+1F3D           1F35
+1F3E           1F36
+1F3F           1F37
+1F48           1F40
+1F49           1F41
+1F4A           1F42
+1F4B           1F43
+1F4C           1F44
+1F4D           1F45
+1F59           1F51
+1F5B           1F53
+1F5D           1F55
+1F5F           1F57
+1F68           1F60
+1F69           1F61
+1F6A           1F62
+1F6B           1F63
+1F6C           1F64
+1F6D           1F65
+1F6E           1F66
+1F6F           1F67
+1FB8           1FB0
+1FB9           1FB1
+1FBA           1F70
+1FBB           1F71
+1FBE           03B9
+1FC8           1F72
+1FC9           1F73
+1FCA           1F74
+1FCB           1F75
+1FD8           1FD0
+1FD9           1FD1
+1FDA           1F76
+1FDB           1F77
+1FE8           1FE0
+1FE9           1FE1
+1FEA           1F7A
+1FEB           1F7B
+1FEC           1FE5
+1FF8           1F78
+1FF9           1F79
+1FFA           1F7C
+1FFB           1F7D
+2126           03C9
+212A           006B
+212B           00E5
+2160           2170
+2161           2171
+2162           2172
+2163           2173
+2164           2174
+2165           2175
+2166           2176
+2167           2177
+2168           2178
+2169           2179
+216A           217A
+216B           217B
+216C           217C
+216D           217D
+216E           217E
+216F           217F
+24B6           24D0
+24B7           24D1
+24B8           24D2
+24B9           24D3
+24BA           24D4
+24BB           24D5
+24BC           24D6
+24BD           24D7
+24BE           24D8
+24BF           24D9
+24C0           24DA
+24C1           24DB
+24C2           24DC
+24C3           24DD
+24C4           24DE
+24C5           24DF
+24C6           24E0
+24C7           24E1
+24C8           24E2
+24C9           24E3
+24CA           24E4
+24CB           24E5
+24CC           24E6
+24CD           24E7
+24CE           24E8
+24CF           24E9
+FF21           FF41
+FF22           FF42
+FF23           FF43
+FF24           FF44
+FF25           FF45
+FF26           FF46
+FF27           FF47
+FF28           FF48
+FF29           FF49
+FF2A           FF4A
+FF2B           FF4B
+FF2C           FF4C
+FF2D           FF4D
+FF2E           FF4E
+FF2F           FF4F
+FF30           FF50
+FF31           FF51
+FF32           FF52
+FF33           FF53
+FF34           FF54
+FF35           FF55
+FF36           FF56
+FF37           FF57
+FF38           FF58
+FF39           FF59
+FF3A           FF5A
+10400          10428
+10401          10429
+10402          1042A
+10403          1042B
+10404          1042C
+10405          1042D
+10406          1042E
+10407          1042F
+10408          10430
+10409          10431
+1040A          10432
+1040B          10433
+1040C          10434
+1040D          10435
+1040E          10436
+1040F          10437
+10410          10438
+10411          10439
+10412          1043A
+10413          1043B
+10414          1043C
+10415          1043D
+10416          1043E
+10417          1043F
+10418          10440
+10419          10441
+1041A          10442
+1041B          10443
+1041C          10444
+1041D          10445
+1041E          10446
+1041F          10447
+10420          10448
+10421          10449
+10422          1044A
+10423          1044B
+10424          1044C
+10425          1044D
+END
index 7d8912d..ea04974 100644 (file)
@@ -748,7 +748,7 @@ if (open(my $SpecCase, "SpecCase.txt")) {
 # Prepend them to the To/{Upper,Lower,Title}.pl.
 
 for my $case (qw(Lower Title Upper)) {
-    my $NormalCase = do "To/$case.pl";
+    my $NormalCase = do "To/$case.pl" || die "$0: To/$case.pl: $!\n";
     if (open(my $Case, ">To/$case.pl")) {
        header($Case);
        print $Case <<EOT;
@@ -773,5 +773,59 @@ EOT
     }
 }
 
+#
+# Read in the case foldings (full folding: statuses C + F + I of CaseFold.txt).
+# C (one-to-one) entries go into the To/Fold.pl table via append()/flush();
+# F and I entries are emitted as %utf8::ToSpecFold, keyed by decimal code point.
+#

+if (open(my $CaseFold, "<", "CaseFold.txt")) {
+    my @Fold;
+    my %Fold;
+
+    while (<$CaseFold>) {
+       next unless /^([0-9A-Fa-f]+)\s*;\s*([CFI])\s*;\s*([0-9A-Fa-f]+(?: [0-9A-Fa-f]+)*)\s*;/;
+
+       my ($code, $status, $fold) = ($1, $2, $3);
+
+       if ($status eq 'C') { # Common: one-to-one folding
+           append(\@Fold, $code, $fold);
+       } else { # F: full, or I: special case, dotted I / dotless i -> plain i
+           $Fold{hex($code)} = $fold;
+       }
+    }
+
+    flush(\@Fold, "To/Fold.pl");
+
+    # Prepend the special foldings to the common foldings: read To/Fold.pl
+    # back with do FILE (on failure $@ holds a compile error, $! a read
+    # error), then rewrite it with the %utf8::ToSpecFold hash placed in front.
+
+    my $CommonFold = do "To/Fold.pl" or die "$0: To/Fold.pl: ", $@ || $!, "\n";
+    if (open(my $Fold, ">", "To/Fold.pl")) {
+       header($Fold);
+       print $Fold <<EOT;
+
+%utf8::ToSpecFold = (
+EOT
+        for my $code (sort { $a <=> $b } keys %Fold) {
+           my $foldstr = # each hex code point becomes one \x{...} escape
+               join "", map { sprintf "\\x{%s}", $_ } split ' ', $Fold{$code};
+           print $Fold qq['$code' => "$foldstr",\n];
+       }
+       print $Fold <<EOT;
+);
+
+EOT
+       begin($Fold);
+       print $Fold $CommonFold;
+       end($Fold);
+    } else {
+       die "$0: To/Fold.pl: $!\n";
+    }
+} else {
+    die "$0: CaseFold.txt: $!\n";
+}
+
 # That's all, folks!