Upgrade to Unicode-Normalize-1.00
[p5sagit/p5-mst-13.2.git] / ext / Unicode / Normalize / t / func.t
CommitLineData
ac5ea531 1
BEGIN {
    # If pack('U') cannot produce the string "A" from code point 0x41,
    # announce an empty plan ("1..0") and leave before anything else runs.
    if (pack('U', 0x41) ne "A") {
        print "1..0 # Unicode::Normalize cannot stringify a Unicode code point\n";
        exit 0;
    }
}
9
BEGIN {
    # Only when PERL_CORE is set in the environment: step into ./t if that
    # directory exists, then replace @INC with a single relative lib path
    # (written in MacOS path syntax when $^O says so).
    if ($ENV{PERL_CORE}) {
        -d 't' and chdir('t');
        @INC = ($^O eq 'MacOS') ? ('::lib') : ('../lib');
    }
}
16
ac5ea531 17#########################
18
19use Test;
20use strict;
21use warnings;
fe067ad9 22BEGIN { plan tests => 202 };
ac5ea531 23use Unicode::Normalize qw(:all);
24ok(1); # If we made it this far, we're ok.
25
# Forward every argument, unchanged, to Unicode::Normalize::pack_U.
sub _pack_U {
    return Unicode::Normalize::pack_U(@_);
}

# Take one string of whitespace-separated hexadecimal numbers, convert each
# with hex(), and hand the resulting numbers to _pack_U.
sub hexU {
    my $hex_list    = shift;
    my @code_points = map { hex } split ' ', $hex_list;
    return _pack_U(@code_points);
}
6c941e0c 28
ac5ea531 29#########################
30
# getCombinClass(): each row is [ code point, expected return value ].
for my $case (
    [    0,   0 ],
    [   41,   0 ],
    [   65,   0 ],
    [  768, 230 ],
    [ 1809,  36 ],
) {
    my ($code_point, $class) = @$case;
    ok(getCombinClass($code_point), $class);
}
36
# getCanon(): each row is [ code point, expected ].  The expected column is
# either undef or a list of code points that is packed with _pack_U before
# being compared.
for my $case (
    [ 0x0000, undef ],
    [ 0x0029, undef ],
    [ 0x0041, undef ],
    [ 0x00C0, [ 0x0041, 0x0300 ] ],
    [ 0x00EF, [ 0x0069, 0x0308 ] ],
    [ 0x304C, [ 0x304B, 0x3099 ] ],
    [ 0x1EA4, [ 0x0041, 0x0302, 0x0301 ] ],
    [ 0x1F82, [ 0x03B1, 0x0313, 0x0300, 0x0345 ] ],
    [ 0x1FAF, [ 0x03A9, 0x0314, 0x0342, 0x0345 ] ],
    [ 0xAC00, [ 0x1100, 0x1161 ] ],
    [ 0xAE00, [ 0x1100, 0x1173, 0x11AF ] ],
    [ 0x212C, undef ],
    [ 0x3243, undef ],
    [ 0xFA2D, [ 0x9DB4 ] ],
) {
    my ($code_point, $sequence) = @$case;
    ok(getCanon($code_point), defined $sequence ? _pack_U(@$sequence) : undef);
}
51
# getCompat(): each row is [ code point, expected ].  The expected column is
# either undef or a list of code points that is packed with _pack_U before
# being compared.
for my $case (
    [ 0x0000, undef ],
    [ 0x0029, undef ],
    [ 0x0041, undef ],
    [ 0x00C0, [ 0x0041, 0x0300 ] ],
    [ 0x00EF, [ 0x0069, 0x0308 ] ],
    [ 0x304C, [ 0x304B, 0x3099 ] ],
    [ 0x1EA4, [ 0x0041, 0x0302, 0x0301 ] ],
    [ 0x1F82, [ 0x03B1, 0x0313, 0x0300, 0x0345 ] ],
    [ 0x1FAF, [ 0x03A9, 0x0314, 0x0342, 0x0345 ] ],
    [ 0x212C, [ 0x0042 ] ],
    [ 0x3243, [ 0x0028, 0x81F3, 0x0029 ] ],
    [ 0xAC00, [ 0x1100, 0x1161 ] ],
    [ 0xAE00, [ 0x1100, 0x1173, 0x11AF ] ],
    [ 0xFA2D, [ 0x9DB4 ] ],
) {
    my ($code_point, $sequence) = @$case;
    ok(getCompat($code_point), defined $sequence ? _pack_U(@$sequence) : undef);
}
66
# getComposite(): each row is [ first code point, second code point,
# expected return value ]; undef means no composite is expected.
for my $case (
    [ 0x0000, 0x0000, undef  ],
    [ 0x0000, 0x0029, undef  ],
    [ 0x0029, 0x0000, undef  ],
    [ 0x0029, 0x0029, undef  ],
    [ 0x0000, 0x0041, undef  ],
    [ 0x0041, 0x0000, undef  ],
    [ 0x0041, 0x0041, undef  ],
    [     12, 0x0300, undef  ],
    [ 0x0055, 0xFF00, undef  ],
    [ 0x0041, 0x0300, 0x00C0 ],
    [ 0x0055, 0x0300, 0x00D9 ],
    [ 0x0112, 0x0300, 0x1E14 ],
    [ 0x1100, 0x1161, 0xAC00 ],
    [ 0x1100, 0x1173, 0xADF8 ],
    [ 0x1100, 0x11AF, undef  ],
    [ 0x1173, 0x11AF, undef  ],
    [ 0xAC00, 0x11A7, undef  ],
    [ 0xAC00, 0x11A8, 0xAC01 ],
    [ 0xADF8, 0x11AF, 0xAE00 ],
) {
    my ($first, $second, $composite) = @$case;
    # Pass two plain scalars; do not flatten an array into the call.
    ok(getComposite($first, $second), $composite);
}
86
# Build an 11-letter summary of a code point's properties.  Each predicate
# below contributes one letter, in this order: upper case when the predicate
# returns true for the code point, lower case otherwise.
sub uprops {
    my $code_point = shift;

    my @flags = (
        [ \&isExclusion,   'x' ],
        [ \&isSingleton,   's' ],
        [ \&isNonStDecomp, 'n' ],    # Non-Starter Decomposition
        [ \&isComp_Ex,     'f' ],    # Full exclusion (X + S + N)
        [ \&isComp2nd,     'b' ],    # B = M = Y
        [ \&isNFD_NO,      'd' ],
        [ \&isNFC_MAYBE,   'm' ],    # Maybe
        [ \&isNFC_NO,      'c' ],
        [ \&isNFKD_NO,     'k' ],
        [ \&isNFKC_MAYBE,  'y' ],    # maYbe
        [ \&isNFKC_NO,     'g' ],
    );

    return join '', map {
        my ($predicate, $letter) = @$_;
        $predicate->($code_point) ? uc($letter) : $letter;
    } @flags;
}
103
# uprops(): each row is [ code point, expected property string ].
for my $case (
    [ 0x0000, 'xsnfbdmckyg' ],
    [ 0x0029, 'xsnfbdmckyg' ],
    [ 0x0041, 'xsnfbdmckyg' ],
    [ 0x00A0, 'xsnfbdmcKyG' ],    # NO-BREAK SPACE
    [ 0x00C0, 'xsnfbDmcKyg' ],    # LATIN CAPITAL LETTER A WITH GRAVE
    [ 0x0300, 'xsnfBdMckYg' ],    # COMBINING GRAVE ACCENT
    [ 0x0344, 'xsNFbDmCKyG' ],    # COMBINING GREEK DIALYTIKA TONOS
    [ 0x0387, 'xSnFbDmCKyG' ],    # GREEK ANO TELEIA
    [ 0x0958, 'XsnFbDmCKyG' ],    # DEVANAGARI LETTER QA
    [ 0x0F43, 'XsnFbDmCKyG' ],    # TIBETAN LETTER GHA
    [ 0x1100, 'xsnfbdmckyg' ],    # HANGUL CHOSEONG KIYEOK
    [ 0x1161, 'xsnfBdMckYg' ],    # HANGUL JUNGSEONG A
    [ 0x11AF, 'xsnfBdMckYg' ],    # HANGUL JONGSEONG RIEUL
    [ 0x212B, 'xSnFbDmCKyG' ],    # ANGSTROM SIGN
    [ 0xAC00, 'xsnfbDmcKyg' ],    # HANGUL SYLLABLE GA
    [ 0xF900, 'xSnFbDmCKyG' ],    # CJK COMPATIBILITY IDEOGRAPH-F900
    [ 0xFB4E, 'XsnFbDmCKyG' ],    # HEBREW LETTER PE WITH RAFE
    [ 0xFF71, 'xsnfbdmcKyG' ],    # HALFWIDTH KATAKANA LETTER A
) {
    my ($code_point, $properties) = @$case;
    ok(uprops($code_point), $properties);
}
122
# decompose(): the empty string and plain "A" come back unchanged, first
# without and then with the second argument set to 1.
for my $text ("", "A") {
    ok(decompose($text), $text);
}
for my $text ("", "A") {
    ok(decompose($text, 1), $text);
}

{
    # Each row: [ source, expected from decompose(source),
    #             expected from decompose(source, 1) ], all as hexU strings.
    my @cases = (
        [ "1E14 AC01", "0045 0304 0300 1100 1161 11A8", "0045 0304 0300 1100 1161 11A8" ],
        [ "AC00 AE00", "1100 1161 1100 1173 11AF",      "1100 1161 1100 1173 11AF"      ],
        [ "304C FF76", "304B 3099 FF76",                "304B 3099 30AB"                ],
    );

    # All one-argument calls run first, then all two-argument calls, so the
    # test numbering stays the same.
    for my $case (@cases) {
        ok(decompose(hexU($case->[0])), hexU($case->[1]));
    }
    for my $case (@cases) {
        ok(decompose(hexU($case->[0]), 1), hexU($case->[2]));
    }
}

# don't modify the source
{
    my $source = "\x{FA19}";
    ok(decompose($source), "\x{795E}");
    ok($source, "\x{FA19}");
}
140
# reorder(): the empty string and plain "A" come back unchanged.
for my $text ("", "A") {
    ok(reorder($text), $text);
}

# Each row: [ input, expected output ], both as hexU strings.
for my $case (
    [ "0041 0300 0315 0313 031b 0061",
      "0041 031b 0300 0313 0315 0061" ],
    [ "00C1 0300 0315 0313 031b 0061 309A 3099",
      "00C1 031b 0300 0313 0315 0061 309A 3099" ],
) {
    my ($input, $expected) = @$case;
    ok(reorder(hexU($input)), hexU($expected));
}

# don't modify the source
{
    my $source = "\x{3000}\x{300}\x{31b}";
    ok(reorder($source), "\x{3000}\x{31b}\x{300}");
    ok($source, "\x{3000}\x{300}\x{31b}");
}
152
# compose(): the empty string and plain "A" come back unchanged.
for my $text ("", "A") {
    ok(compose($text), $text);
}

# Each row: [ input, expected output ], both as hexU strings.
for my $case (
    [ "0061 0300",      "00E0"           ],
    [ "0061 0300 031B", "00E0 031B"      ],
    [ "0061 0300 0315", "00E0 0315"      ],
    [ "0061 0300 0313", "00E0 0313"      ],
    [ "0061 031B 0300", "00E0 031B"      ],
    [ "0061 0315 0300", "0061 0315 0300" ],
    [ "0061 0313 0300", "0061 0313 0300" ],
) {
    my ($input, $expected) = @$case;
    ok(compose(hexU($input)), hexU($expected));
}

# don't modify the source
{
    my $source = "\x{304B}\x{3099}";
    ok(compose($source), "\x{304C}");
    ok($source, "\x{304B}\x{3099}");
}
167
# composeContiguous(): the empty string and plain "A" come back unchanged.
for my $text ("", "A") {
    ok(composeContiguous($text), $text);
}

# Each row: [ input, expected output ], both as hexU strings.
for my $case (
    [ "0061 0300",      "00E0"           ],
    [ "0061 0300 031B", "00E0 031B"      ],
    [ "0061 0300 0315", "00E0 0315"      ],
    [ "0061 0300 0313", "00E0 0313"      ],
    [ "0061 031B 0300", "0061 031B 0300" ],
    [ "0061 0315 0300", "0061 0315 0300" ],
    [ "0061 0313 0300", "0061 0313 0300" ],
) {
    my ($input, $expected) = @$case;
    ok(composeContiguous(hexU($input)), hexU($expected));
}

# don't modify the source
{
    my $source = "\x{30DB}\x{309A}";
    ok(composeContiguous($source), "\x{30DD}");
    ok($source, "\x{30DB}\x{309A}");
}
8f118dcd 182
# Map a three-valued result onto a label: undef (or no argument) gives
# "MAYBE", any true value gives "YES", any defined false value gives "NO".
sub answer {
    my ($verdict) = @_;
    return "MAYBE" unless defined $verdict;
    return $verdict ? "YES" : "NO";
}
184
{
    my @checkers = (\&checkNFD, \&checkNFC, \&checkNFKD, \&checkNFKC);

    # The empty string gets "YES" from every checkNF* function and from
    # check() under the long form names.
    for my $checker (@checkers) {
        ok(answer($checker->("")), "YES");
    }
    for my $form (qw(NFD NFC NFKD NFKC)) {
        ok(answer(check($form, "")), "YES");
    }

    # U+0000 to U+007F are prenormalized in all the normalization forms.
    my $ascii = "AZaz\t12!#`";
    for my $checker (@checkers) {
        ok(answer($checker->($ascii)), "YES");
    }
    for my $form (qw(D C KD KC)) {
        ok(answer(check($form, $ascii)), "YES");
    }
}

{
    # Each row: [ checker, input string, expected label ].
    my $check_as_NFC = sub { check("NFC", $_[0]) };
    my @cases = (
        [ \&checkNFD,    scalar(NFD(_pack_U(0xC1, 0x1100, 0x1173, 0x11AF))),       "YES"   ],
        [ \&checkNFD,    scalar(_pack_U(0x20, 0xC1, 0x1100, 0x1173, 0x11AF)),      "NO"    ],
        [ \&checkNFC,    scalar(_pack_U(0x20, 0xC1, 0x1173, 0x11AF)),              "MAYBE" ],
        [ \&checkNFC,    scalar(_pack_U(0x20, 0xC1, 0xAE00, 0x1100)),              "YES"   ],
        [ \&checkNFC,    scalar(_pack_U(0x20, 0xC1, 0xAE00, 0x1100, 0x300)),       "MAYBE" ],
        [ \&checkNFC,    scalar(_pack_U(0x20, 0xC1, 0xFF71, 0x2025)),              "YES"   ],
        [ $check_as_NFC, scalar(_pack_U(0x20, 0xC1, 0x212B, 0x300)),               "NO"    ],
        [ \&checkNFKD,   scalar(_pack_U(0x20, 0xC1, 0xFF71, 0x2025)),              "NO"    ],
        [ \&checkNFKC,   scalar(_pack_U(0x20, 0xC1, 0xAE00, 0x2025)),              "NO"    ],
    );
    for my $case (@cases) {
        my ($checker, $input, $label) = @$case;
        ok(answer($checker->($input)), $label);
    }
}
f027f502 213
# Fill $1 and $2, then pass the capture variables themselves (not copies)
# to the normalization functions.
"012ABC" =~ /(\d+)(\w+)/;
ok(NFC($1) eq "012" && NFC($2) eq "ABC");

# s/^NF// in normalize() must not prevent using $1, $&, etc.
for my $form ('C', 'NFC') {
    ok(normalize($form, $1), "012");
    ok(normalize($form, $2), "ABC");
}
223
# a string with initial zero should be treated like a number
{
    # The four *_NO predicates that several groups below share.
    my @no_predicates = (\&isNFC_NO, \&isNFKC_NO, \&isNFD_NO, \&isNFKD_NO);

    # LATIN CAPITAL LETTER A WITH GRAVE
    ok(getCombinClass("0192"), 0);
    for my $decomposer (\&getCanon, \&getCompat) {
        ok($decomposer->("0192"), _pack_U(0x41, 0x300));
    }
    ok(getComposite("065", "0768"), 192);
    for my $predicate (\&isNFD_NO, \&isNFKD_NO) {
        ok($predicate->("0192"));
    }

    # DEVANAGARI LETTER QA
    for my $predicate (\&isExclusion, \&isComp_Ex, @no_predicates) {
        ok($predicate->("02392"));
    }

    # ANGSTROM SIGN
    for my $predicate (\&isSingleton, \&isComp_Ex, @no_predicates) {
        ok($predicate->("08491"));
    }

    # COMBINING GREEK DIALYTIKA TONOS
    for my $predicate (\&isNonStDecomp, \&isComp_Ex, @no_predicates) {
        ok($predicate->("0836"));
    }

    # COMBINING GRAVE ACCENT
    ok(getCombinClass("0768"), 230);
    for my $predicate (\&isComp2nd, \&isNFC_MAYBE, \&isNFKC_MAYBE) {
        ok($predicate->("0768"));
    }

    # HANGUL SYLLABLE GA
    ok(getCombinClass("044032"), 0);
    for my $decomposer (\&getCanon, \&getCompat) {
        ok($decomposer->("044032"), _pack_U(0x1100, 0x1161));
    }
    ok(getComposite("04352", "04449"), 0xAC00);
}
269
# Two strings, each U+3041, a run of combining characters, then U+3042:
#   - 22 combining characters: (0x300..0x315)
#   - 40 combining characters of the same class: (0x300..0x313)x2
# Every function listed below is expected to return each string unchanged.
for my $marks (
    [ 0x300 .. 0x315 ],
    [ 0x300 .. 0x313, 0x300 .. 0x313 ],
) {
    my $string = _pack_U(0x3041, @$marks, 0x3042);

    for my $function (
        \&decompose, \&reorder, \&compose, \&composeContiguous,
        \&NFD, \&NFC, \&NFKD, \&NFKC, \&FCD, \&FCC,
    ) {
        ok($function->($string), $string);
    }
}
295
# decompose() on repeated input: the precomposed strings are repeated 5, 10
# and 20 times and compared with the expected sequence repeated equally often.
my $precomp = hexU("304C 304E 3050 3052 3054");
my $combseq = hexU("304B 3099 304D 3099 304F 3099 3051 3099 3053 3099");
for my $times (5, 10, 20) {
    ok(decompose($precomp x $times), $combseq x $times);
}

my $hangsyl = hexU("AC00 B098 B2E4 B77C B9C8");
my $jamoseq = hexU("1100 1161 1102 1161 1103 1161 1105 1161 1106 1161");
for my $times (5, 10, 20) {
    ok(decompose($hangsyl x $times), $jamoseq x $times);
}

# One precomposed chunk followed by 1, 5 or 10 copies of a string that is
# expected to pass through decompose() unchanged.
my $notcomp = hexU("304B 304D 304F 3051 3053");
for my $times (1, 5, 10) {
    ok(decompose($precomp . ($notcomp x $times)), $combseq . ($notcomp x $times));
}
312
313