Commit | Line | Data |
ac5ea531 |
1 | |
4a2e806c |
# Bail out of the whole test file (TAP "skip all") when pack('U', 0x41)
# does not produce the character "A".
BEGIN {
    if (pack('U', 0x41) ne "A") {
        my $skip_line = "1..0 # Unicode::Normalize " .
                        "cannot stringify a Unicode code point\n";
        print $skip_line;
        exit 0;
    }
}
9 | |
6c941e0c |
# When PERL_CORE is set in the environment, move into the t/ directory
# (if there is one) and replace @INC with the single relative lib path.
BEGIN {
    if ($ENV{PERL_CORE}) {
        if (-d 't') {
            chdir('t');
        }
        if ($^O eq 'MacOS') {
            @INC = qw(::lib);
        }
        else {
            @INC = qw(../lib);
        }
    }
}
16 | |
ac5ea531 |
17 | ######################### |
18 | |
19 | use Test; |
20 | use strict; |
21 | use warnings; |
2b8d773d |
22 | BEGIN { plan tests => 211 }; |
ac5ea531 |
23 | use Unicode::Normalize qw(:all); |
24 | ok(1); # If we made it this far, we're ok. |
25 | |
fe067ad9 |
# Thin wrapper: forwards its whole argument list to
# Unicode::Normalize::pack_U and returns whatever that returns.
sub _pack_U {
    return Unicode::Normalize::pack_U(@_);
}

# Takes one string of whitespace-separated hexadecimal numbers, converts
# each field with hex(), and hands the resulting numbers to _pack_U().
sub hexU {
    my ($hex_fields) = @_;
    my @numbers = map { hex $_ } split ' ', $hex_fields;
    return _pack_U(@numbers);
}
6c941e0c |
28 | |
ac5ea531 |
29 | ######################### |
30 | |
fe067ad9 |
# getCombinClass(): each pair is [ code point, expected return value ].
for my $case (
    [    0,   0 ],
    [   41,   0 ],
    [   65,   0 ],
    [  768, 230 ],
    [ 1809,  36 ],
) {
    my ($code_point, $expected_class) = @$case;
    ok(getCombinClass($code_point), $expected_class);
}
36 | |
# getCanon(): each pair is [ code point, expected result ], where the
# expected result is either a list of code points (packed with _pack_U
# before comparison) or undef when getCanon() should return undef.
for my $case (
    [ 0x0000, undef ],
    [ 0x0029, undef ],
    [ 0x0041, undef ],
    [ 0x00C0, [ 0x0041, 0x0300 ] ],
    [ 0x00EF, [ 0x0069, 0x0308 ] ],
    [ 0x304C, [ 0x304B, 0x3099 ] ],
    [ 0x1EA4, [ 0x0041, 0x0302, 0x0301 ] ],
    [ 0x1F82, [ 0x03B1, 0x0313, 0x0300, 0x0345 ] ],
    [ 0x1FAF, [ 0x03A9, 0x0314, 0x0342, 0x0345 ] ],
    [ 0xAC00, [ 0x1100, 0x1161 ] ],
    [ 0xAE00, [ 0x1100, 0x1173, 0x11AF ] ],
    [ 0x212C, undef ],
    [ 0x3243, undef ],
    [ 0xFA2D, [ 0x9DB4 ] ],
) {
    my ($code_point, $parts) = @$case;
    my $expected = defined $parts ? _pack_U(@$parts) : undef;
    ok(getCanon($code_point), $expected);
}
51 | |
# getCompat(): each pair is [ code point, expected result ], where the
# expected result is either a list of code points (packed with _pack_U
# before comparison) or undef when getCompat() should return undef.
for my $case (
    [ 0x0000, undef ],
    [ 0x0029, undef ],
    [ 0x0041, undef ],
    [ 0x00C0, [ 0x0041, 0x0300 ] ],
    [ 0x00EF, [ 0x0069, 0x0308 ] ],
    [ 0x304C, [ 0x304B, 0x3099 ] ],
    [ 0x1EA4, [ 0x0041, 0x0302, 0x0301 ] ],
    [ 0x1F82, [ 0x03B1, 0x0313, 0x0300, 0x0345 ] ],
    [ 0x1FAF, [ 0x03A9, 0x0314, 0x0342, 0x0345 ] ],
    [ 0x212C, [ 0x0042 ] ],
    [ 0x3243, [ 0x0028, 0x81F3, 0x0029 ] ],
    [ 0xAC00, [ 0x1100, 0x1161 ] ],
    [ 0xAE00, [ 0x1100, 0x1173, 0x11AF ] ],
    [ 0xFA2D, [ 0x9DB4 ] ],
) {
    my ($code_point, $parts) = @$case;
    my $expected = defined $parts ? _pack_U(@$parts) : undef;
    ok(getCompat($code_point), $expected);
}
66 | |
# getComposite(): each triple is [ first, second, expected result ];
# undef marks pairs for which getComposite() should return undef.
for my $case (
    [ 0,      0,      undef  ],
    [ 0,      0x29,   undef  ],
    [ 0x29,   0,      undef  ],
    [ 0x29,   0x29,   undef  ],
    [ 0,      0x41,   undef  ],
    [ 0x41,   0,      undef  ],
    [ 0x41,   0x41,   undef  ],
    [ 12,     0x0300, undef  ],
    [ 0x0055, 0xFF00, undef  ],
    [ 0x0041, 0x0300, 0x00C0 ],
    [ 0x0055, 0x0300, 0x00D9 ],
    [ 0x0112, 0x0300, 0x1E14 ],
    [ 0x1100, 0x1161, 0xAC00 ],
    [ 0x1100, 0x1173, 0xADF8 ],
    [ 0x1100, 0x11AF, undef  ],
    [ 0x1173, 0x11AF, undef  ],
    [ 0xAC00, 0x11A7, undef  ],
    [ 0xAC00, 0x11A8, 0xAC01 ],
    [ 0xADF8, 0x11AF, 0xAE00 ],
) {
    my ($first, $second, $expected) = @$case;
    ok(getComposite($first, $second), $expected);
}
86 | |
# Builds an 11-character fingerprint for the code point $uv.  Each position
# corresponds to one Unicode::Normalize predicate: the upper-case letter is
# used when the predicate is true, the lower-case letter when it is false.
sub uprops {
    my ($uv) = @_;

    # [ predicate, letter emitted when the predicate is true ]
    my @probes = (
        [ \&isExclusion,   'X' ],
        [ \&isSingleton,   'S' ],
        [ \&isNonStDecomp, 'N' ],    # Non-Starter Decomposition
        [ \&isComp_Ex,     'F' ],    # Full exclusion (X + S + N)
        [ \&isComp2nd,     'B' ],    # B = M = Y
        [ \&isNFD_NO,      'D' ],
        [ \&isNFC_MAYBE,   'M' ],    # Maybe
        [ \&isNFC_NO,      'C' ],
        [ \&isNFKD_NO,     'K' ],
        [ \&isNFKC_MAYBE,  'Y' ],    # maYbe
        [ \&isNFKC_NO,     'G' ],
    );

    my @letters;
    for my $probe (@probes) {
        my ($predicate, $letter) = @$probe;
        push @letters, ($predicate->($uv) ? $letter : lc $letter);
    }
    return join '', @letters;
}
103 | |
39f4556f |
# Expected uprops() fingerprint for a selection of code points.
for my $case (
    [ 0x0000, 'xsnfbdmckyg' ],    # NULL
    [ 0x0029, 'xsnfbdmckyg' ],    # RIGHT PARENTHESIS
    [ 0x0041, 'xsnfbdmckyg' ],    # LATIN CAPITAL LETTER A
    [ 0x00A0, 'xsnfbdmcKyG' ],    # NO-BREAK SPACE
    [ 0x00C0, 'xsnfbDmcKyg' ],    # LATIN CAPITAL LETTER A WITH GRAVE
    [ 0x0300, 'xsnfBdMckYg' ],    # COMBINING GRAVE ACCENT
    [ 0x0344, 'xsNFbDmCKyG' ],    # COMBINING GREEK DIALYTIKA TONOS
    [ 0x0387, 'xSnFbDmCKyG' ],    # GREEK ANO TELEIA
    [ 0x0958, 'XsnFbDmCKyG' ],    # DEVANAGARI LETTER QA
    [ 0x0F43, 'XsnFbDmCKyG' ],    # TIBETAN LETTER GHA
    [ 0x1100, 'xsnfbdmckyg' ],    # HANGUL CHOSEONG KIYEOK
    [ 0x1161, 'xsnfBdMckYg' ],    # HANGUL JUNGSEONG A
    [ 0x11AF, 'xsnfBdMckYg' ],    # HANGUL JONGSEONG RIEUL
    [ 0x212B, 'xSnFbDmCKyG' ],    # ANGSTROM SIGN
    [ 0xAC00, 'xsnfbDmcKyg' ],    # HANGUL SYLLABLE GA
    [ 0xF900, 'xSnFbDmCKyG' ],    # CJK COMPATIBILITY IDEOGRAPH-F900
    [ 0xFB4E, 'XsnFbDmCKyG' ],    # HEBREW LETTER PE WITH RAFE
    [ 0xFF71, 'xsnfbdmcKyG' ],    # HALFWIDTH KATAKANA LETTER A
) {
    my ($code_point, $fingerprint) = @$case;
    ok(uprops($code_point), $fingerprint);
}
122 | |
# Strings expected to come back unchanged, first with the one-argument
# call and then with a true second argument.
for my $literal ("", "A") {
    ok(decompose($literal), $literal);
}
for my $literal ("", "A") {
    ok(decompose($literal, 1), $literal);
}

# One-argument decompose(): [ input, expected ] as hexU() strings.
for my $case (
    [ "1E14 AC01", "0045 0304 0300 1100 1161 11A8" ],
    [ "AC00 AE00", "1100 1161 1100 1173 11AF"      ],
    [ "304C FF76", "304B 3099 FF76"                ],
) {
    my ($input, $expected) = map { hexU($_) } @$case;
    ok(decompose($input), $expected);
}

# decompose() with a true second argument: [ input, expected ].
for my $case (
    [ "1E14 AC01", "0045 0304 0300 1100 1161 11A8" ],
    [ "AC00 AE00", "1100 1161 1100 1173 11AF"      ],
    [ "304C FF76", "304B 3099 30AB"                ],
) {
    my ($input, $expected) = map { hexU($_) } @$case;
    ok(decompose($input, 1), $expected);
}

# don't modify the source
my $sDec = "\x{FA19}";
ok(decompose($sDec), "\x{795E}");
ok($sDec, "\x{FA19}");
140 | |
# Strings expected to come back unchanged from reorder().
for my $literal ("", "A") {
    ok(reorder($literal), $literal);
}

# [ input, expected ] as hexU() strings.
for my $case (
    [ "0041 0300 0315 0313 031b 0061",
      "0041 031b 0300 0313 0315 0061" ],
    [ "00C1 0300 0315 0313 031b 0061 309A 3099",
      "00C1 031b 0300 0313 0315 0061 309A 3099" ],
) {
    my ($input, $expected) = map { hexU($_) } @$case;
    ok(reorder($input), $expected);
}

# don't modify the source
my $sReord = "\x{3000}\x{300}\x{31b}";
ok(reorder($sReord), "\x{3000}\x{31b}\x{300}");
ok($sReord, "\x{3000}\x{300}\x{31b}");
152 | |
# Strings expected to come back unchanged from compose().
for my $literal ("", "A") {
    ok(compose($literal), $literal);
}

# [ input, expected ] as hexU() strings.
for my $case (
    [ "0061 0300",      "00E0"           ],
    [ "0061 0300 031B", "00E0 031B"      ],
    [ "0061 0300 0315", "00E0 0315"      ],
    [ "0061 0300 0313", "00E0 0313"      ],
    [ "0061 031B 0300", "00E0 031B"      ],
    [ "0061 0315 0300", "0061 0315 0300" ],
    [ "0061 0313 0300", "0061 0313 0300" ],
) {
    my ($input, $expected) = map { hexU($_) } @$case;
    ok(compose($input), $expected);
}

# don't modify the source
my $sCom = "\x{304B}\x{3099}";
ok(compose($sCom), "\x{304C}");
ok($sCom, "\x{304B}\x{3099}");
167 | |
# Strings expected to come back unchanged from composeContiguous().
for my $literal ("", "A") {
    ok(composeContiguous($literal), $literal);
}

# [ input, expected ] as hexU() strings.
for my $case (
    [ "0061 0300",      "00E0"           ],
    [ "0061 0300 031B", "00E0 031B"      ],
    [ "0061 0300 0315", "00E0 0315"      ],
    [ "0061 0300 0313", "00E0 0313"      ],
    [ "0061 031B 0300", "0061 031B 0300" ],
    [ "0061 0315 0300", "0061 0315 0300" ],
    [ "0061 0313 0300", "0061 0313 0300" ],
) {
    my ($input, $expected) = map { hexU($_) } @$case;
    ok(composeContiguous($input), $expected);
}

# don't modify the source
my $sCtg = "\x{30DB}\x{309A}";
ok(composeContiguous($sCtg), "\x{30DD}");
ok($sCtg, "\x{30DB}\x{309A}");
8f118dcd |
182 | |
# Maps a tri-state value to a label: undefined => "MAYBE",
# defined and true => "YES", defined and false => "NO".
sub answer {
    my ($verdict) = @_;
    return "MAYBE" unless defined $verdict;
    return $verdict ? "YES" : "NO";
}
184 | |
fe067ad9 |
# The empty string is expected to be reported as normalized by every
# dedicated checker and by check() with each long form name.
for my $checker (\&checkNFD, \&checkNFC, \&checkNFKD, \&checkNFKC) {
    ok(answer($checker->("")), "YES");
}
for my $form (qw(NFD NFC NFKD NFKC)) {
    ok(answer(check($form, "")), "YES");
}

# U+0000 to U+007F are prenormalized in all the normalization forms.
{
    my $ascii = "AZaz\t12!#`";
    for my $checker (\&checkNFD, \&checkNFC, \&checkNFKD, \&checkNFKC) {
        ok(answer($checker->($ascii)), "YES");
    }
    for my $form (qw(D C KD KC)) {
        ok(answer(check($form, $ascii)), "YES");
    }
}

# Output of NFD() must itself pass checkNFD().
ok(answer(checkNFD(NFD(_pack_U(0xC1, 0x1100, 0x1173, 0x11AF)))), "YES");

# [ checker, input as hexU() string, expected answer() label ]
{
    my $check_nfc  = sub { check("NFC",  $_[0]) };
    my $check_nfkc = sub { check("NFKC", $_[0]) };

    for my $case (
        [ \&checkNFD,  "20 C1 1100 1173 11AF", "NO"    ],
        [ \&checkNFC,  "20 C1 1173 11AF",      "MAYBE" ],
        [ \&checkNFC,  "20 C1 AE00 1100",      "YES"   ],
        [ \&checkNFC,  "20 C1 AE00 1100 0300", "MAYBE" ],
        [ \&checkNFC,  "212B 1100 0300",       "NO"    ],
        [ \&checkNFC,  "1100 0300 212B",       "NO"    ],
        [ \&checkNFC,  "0041 0327 030A",       "MAYBE" ],    # A+cedilla+ring
        [ \&checkNFC,  "0041 030A 0327",       "NO"    ],    # A+ring+cedilla
        [ \&checkNFC,  "20 C1 FF71 2025",      "YES"   ],
        [ $check_nfc,  "20 C1 212B 300",       "NO"    ],
        [ \&checkNFKD, "20 C1 FF71 2025",      "NO"    ],
        [ \&checkNFKC, "20 C1 AE00 2025",      "NO"    ],
        [ \&checkNFKC, "212B 1100 0300",       "NO"    ],
        [ \&checkNFKC, "1100 0300 212B",       "NO"    ],
        [ \&checkNFKC, "0041 0327 030A",       "MAYBE" ],    # A+cedilla+ring
        [ \&checkNFKC, "0041 030A 0327",       "NO"    ],    # A+ring+cedilla
        [ $check_nfkc, "20 C1 212B 300",       "NO"    ],
    ) {
        my ($checker, $hex_fields, $label) = @$case;
        ok(answer($checker->(hexU($hex_fields))), $label);
    }
}
f027f502 |
222 | |
# Run one successful match so that $1 holds the digits and $2 the rest;
# every check below reads those capture variables directly.
223 | "012ABC" =~ /(\d+)(\w+)/; |
fe067ad9
# Pass the capture variables straight to NFC and compare with the literals.
224 | ok("012" eq NFC $1 && "ABC" eq NFC $2); |
f027f502
225 | |
fe067ad9
# Same captures handed to normalize() using the short form name 'C'.
226 | ok(normalize('C', $1), "012"); |
227 | ok(normalize('C', $2), "ABC"); |
f027f502
228 | |
fe067ad9
# Same captures handed to normalize() using the long form name 'NFC'.
# No new match is performed in between: these checks rely on $1 and $2
# still holding the values from the match at the top of this group.
229 | ok(normalize('NFC', $1), "012"); |
230 | ok(normalize('NFC', $2), "ABC"); |
f027f502
231 | # s/^NF// in normalize() must not prevent using $1, $&, etc. |
232 | |
fe067ad9 |
# a string with initial zero should be treated like a number

# LATIN CAPITAL LETTER A WITH GRAVE
ok(getCombinClass("0192"), 0);
for my $decomposer (\&getCanon, \&getCompat) {
    ok($decomposer->("0192"), _pack_U(0x41, 0x300));
}
ok(getComposite("065", "0768"), 192);
for my $predicate (\&isNFD_NO, \&isNFKD_NO) {
    ok($predicate->("0192"));
}

# DEVANAGARI LETTER QA
for my $predicate (\&isExclusion, \&isComp_Ex, \&isNFC_NO,
                   \&isNFKC_NO,   \&isNFD_NO,  \&isNFKD_NO) {
    ok($predicate->("02392"));
}

# ANGSTROM SIGN
for my $predicate (\&isSingleton, \&isComp_Ex, \&isNFC_NO,
                   \&isNFKC_NO,   \&isNFD_NO,  \&isNFKD_NO) {
    ok($predicate->("08491"));
}

# COMBINING GREEK DIALYTIKA TONOS
for my $predicate (\&isNonStDecomp, \&isComp_Ex, \&isNFC_NO,
                   \&isNFKC_NO,     \&isNFD_NO,  \&isNFKD_NO) {
    ok($predicate->("0836"));
}

# COMBINING GRAVE ACCENT
ok(getCombinClass("0768"), 230);
for my $predicate (\&isComp2nd, \&isNFC_MAYBE, \&isNFKC_MAYBE) {
    ok($predicate->("0768"));
}

# HANGUL SYLLABLE GA
ok(getCombinClass("044032"), 0);
for my $decomposer (\&getCanon, \&getCompat) {
    ok($decomposer->("044032"), _pack_U(0x1100, 0x1161));
}
ok(getComposite("04352", "04449"), 0xAC00);
278 | |
# string with 22 combining characters: (0x300..0x315)
my $str_cc22 = _pack_U(0x3041, 0x300..0x315, 0x3042);

# string with 40 combining characters of the same class: (0x300..0x313)x2
my $str_cc40 = _pack_U(0x3041, 0x300..0x313, 0x300..0x313, 0x3042);

# Every listed operation is expected to return each string unchanged.
for my $sample ($str_cc22, $str_cc40) {
    for my $operation (
        \&decompose, \&reorder, \&compose, \&composeContiguous,
        \&NFD, \&NFC, \&NFKD, \&NFKC, \&FCD, \&FCC,
    ) {
        ok($operation->($sample), $sample);
    }
}
304 | |
# decompose() on repeated input: the expected output is the per-unit
# expectation repeated the same number of times.
my $precomp = hexU("304C 304E 3050 3052 3054");
my $combseq = hexU("304B 3099 304D 3099 304F 3099 3051 3099 3053 3099");
for my $times (5, 10, 20) {
    ok(decompose($precomp x $times), $combseq x $times);
}

my $hangsyl = hexU("AC00 B098 B2E4 B77C B9C8");
my $jamoseq = hexU("1100 1161 1102 1161 1103 1161 1105 1161 1106 1161");
for my $times (5, 10, 20) {
    ok(decompose($hangsyl x $times), $jamoseq x $times);
}

# A decomposable prefix followed by 1, 5 and 10 copies of a run that is
# expected to pass through unchanged.
my $notcomp = hexU("304B 304D 304F 3051 3053");
for my $times (1, 5, 10) {
    my $tail = $notcomp x $times;
    ok(decompose($precomp . $tail), $combseq . $tail);
}
321 | |
322 | |