4 print "1..0 # Skip: EBCDIC\n";
9 @INC = "::lib" if $^O eq 'MacOS'; # module parses @INC itself
10 require Config; import Config;
11 if ($Config{'extensions'} !~ /\bStorable\b/) {
12 print "1..0 # Skip: Storable was not built; Unicode::UCD uses Storable\n";
21 BEGIN { plan tests => 256 };
23 use Unicode::UCD 'charinfo';
27 $charinfo = charinfo(0); # Null is often problematic, so test it.
29 is($charinfo->{code}, '0000', '<control>');
30 is($charinfo->{name}, '<control>');
31 is($charinfo->{category}, 'Cc');
32 is($charinfo->{combining}, '0');
33 is($charinfo->{bidi}, 'BN');
34 is($charinfo->{decomposition}, '');
35 is($charinfo->{decimal}, '');
36 is($charinfo->{digit}, '');
37 is($charinfo->{numeric}, '');
38 is($charinfo->{mirrored}, 'N');
39 is($charinfo->{unicode10}, 'NULL');
40 is($charinfo->{comment}, '');
41 is($charinfo->{upper}, '');
42 is($charinfo->{lower}, '');
43 is($charinfo->{title}, '');
44 is($charinfo->{block}, 'Basic Latin');
45 is($charinfo->{script}, 'Common');
47 $charinfo = charinfo(0x41);
49 is($charinfo->{code}, '0041', 'LATIN CAPITAL LETTER A');
50 is($charinfo->{name}, 'LATIN CAPITAL LETTER A');
51 is($charinfo->{category}, 'Lu');
52 is($charinfo->{combining}, '0');
53 is($charinfo->{bidi}, 'L');
54 is($charinfo->{decomposition}, '');
55 is($charinfo->{decimal}, '');
56 is($charinfo->{digit}, '');
57 is($charinfo->{numeric}, '');
58 is($charinfo->{mirrored}, 'N');
59 is($charinfo->{unicode10}, '');
60 is($charinfo->{comment}, '');
61 is($charinfo->{upper}, '');
62 is($charinfo->{lower}, '0061');
63 is($charinfo->{title}, '');
64 is($charinfo->{block}, 'Basic Latin');
65 is($charinfo->{script}, 'Latin');
67 $charinfo = charinfo(0x100);
69 is($charinfo->{code}, '0100', 'LATIN CAPITAL LETTER A WITH MACRON');
70 is($charinfo->{name}, 'LATIN CAPITAL LETTER A WITH MACRON');
71 is($charinfo->{category}, 'Lu');
72 is($charinfo->{combining}, '0');
73 is($charinfo->{bidi}, 'L');
74 is($charinfo->{decomposition}, '0041 0304');
75 is($charinfo->{decimal}, '');
76 is($charinfo->{digit}, '');
77 is($charinfo->{numeric}, '');
78 is($charinfo->{mirrored}, 'N');
79 is($charinfo->{unicode10}, 'LATIN CAPITAL LETTER A MACRON');
80 is($charinfo->{comment}, '');
81 is($charinfo->{upper}, '');
82 is($charinfo->{lower}, '0101');
83 is($charinfo->{title}, '');
84 is($charinfo->{block}, 'Latin Extended-A');
85 is($charinfo->{script}, 'Latin');
87 # 0x0590 is in the Hebrew block but unused.
89 $charinfo = charinfo(0x590);
91 is($charinfo->{code}, undef, '0x0590 - unused Hebrew');
92 is($charinfo->{name}, undef);
93 is($charinfo->{category}, undef);
94 is($charinfo->{combining}, undef);
95 is($charinfo->{bidi}, undef);
96 is($charinfo->{decomposition}, undef);
97 is($charinfo->{decimal}, undef);
98 is($charinfo->{digit}, undef);
99 is($charinfo->{numeric}, undef);
100 is($charinfo->{mirrored}, undef);
101 is($charinfo->{unicode10}, undef);
102 is($charinfo->{comment}, undef);
103 is($charinfo->{upper}, undef);
104 is($charinfo->{lower}, undef);
105 is($charinfo->{title}, undef);
106 is($charinfo->{block}, undef);
107 is($charinfo->{script}, undef);
109 # 0x05d0 is in the Hebrew block and used.
111 $charinfo = charinfo(0x5d0);
113 is($charinfo->{code}, '05D0', '05D0 - used Hebrew');
114 is($charinfo->{name}, 'HEBREW LETTER ALEF');
115 is($charinfo->{category}, 'Lo');
116 is($charinfo->{combining}, '0');
117 is($charinfo->{bidi}, 'R');
118 is($charinfo->{decomposition}, '');
119 is($charinfo->{decimal}, '');
120 is($charinfo->{digit}, '');
121 is($charinfo->{numeric}, '');
122 is($charinfo->{mirrored}, 'N');
123 is($charinfo->{unicode10}, '');
124 is($charinfo->{comment}, '');
125 is($charinfo->{upper}, '');
126 is($charinfo->{lower}, '');
127 is($charinfo->{title}, '');
128 is($charinfo->{block}, 'Hebrew');
129 is($charinfo->{script}, 'Hebrew');
131 # An open syllable in Hangul.
133 $charinfo = charinfo(0xAC00);
135 is($charinfo->{code}, 'AC00', 'HANGUL SYLLABLE-AC00');
136 is($charinfo->{name}, 'HANGUL SYLLABLE-AC00');
137 is($charinfo->{category}, 'Lo');
138 is($charinfo->{combining}, '0');
139 is($charinfo->{bidi}, 'L');
140 is($charinfo->{decomposition}, undef);
141 is($charinfo->{decimal}, '');
142 is($charinfo->{digit}, '');
143 is($charinfo->{numeric}, '');
144 is($charinfo->{mirrored}, 'N');
145 is($charinfo->{unicode10}, '');
146 is($charinfo->{comment}, '');
147 is($charinfo->{upper}, '');
148 is($charinfo->{lower}, '');
149 is($charinfo->{title}, '');
150 is($charinfo->{block}, 'Hangul Syllables');
151 is($charinfo->{script}, 'Hangul');
153 # A closed syllable in Hangul.
155 $charinfo = charinfo(0xAE00);
157 is($charinfo->{code}, 'AE00', 'HANGUL SYLLABLE-AE00');
158 is($charinfo->{name}, 'HANGUL SYLLABLE-AE00');
159 is($charinfo->{category}, 'Lo');
160 is($charinfo->{combining}, '0');
161 is($charinfo->{bidi}, 'L');
162 is($charinfo->{decomposition}, undef);
163 is($charinfo->{decimal}, '');
164 is($charinfo->{digit}, '');
165 is($charinfo->{numeric}, '');
166 is($charinfo->{mirrored}, 'N');
167 is($charinfo->{unicode10}, '');
168 is($charinfo->{comment}, '');
169 is($charinfo->{upper}, '');
170 is($charinfo->{lower}, '');
171 is($charinfo->{title}, '');
172 is($charinfo->{block}, 'Hangul Syllables');
173 is($charinfo->{script}, 'Hangul');
175 $charinfo = charinfo(0x1D400);
177 is($charinfo->{code}, '1D400', 'MATHEMATICAL BOLD CAPITAL A');
178 is($charinfo->{name}, 'MATHEMATICAL BOLD CAPITAL A');
179 is($charinfo->{category}, 'Lu');
180 is($charinfo->{combining}, '0');
181 is($charinfo->{bidi}, 'L');
182 is($charinfo->{decomposition}, '<font> 0041');
183 is($charinfo->{decimal}, '');
184 is($charinfo->{digit}, '');
185 is($charinfo->{numeric}, '');
186 is($charinfo->{mirrored}, 'N');
187 is($charinfo->{unicode10}, '');
188 is($charinfo->{comment}, '');
189 is($charinfo->{upper}, '');
190 is($charinfo->{lower}, '');
191 is($charinfo->{title}, '');
192 is($charinfo->{block}, 'Mathematical Alphanumeric Symbols');
193 is($charinfo->{script}, 'Common');
195 $charinfo = charinfo(0x9FBA); #Bug 58428
197 is($charinfo->{code}, '9FBA', 'U+9FBA');
198 is($charinfo->{name}, 'CJK UNIFIED IDEOGRAPH-9FBA');
199 is($charinfo->{category}, 'Lo');
200 is($charinfo->{combining}, '0');
201 is($charinfo->{bidi}, 'L');
202 is($charinfo->{decomposition}, '');
203 is($charinfo->{decimal}, '');
204 is($charinfo->{digit}, '');
205 is($charinfo->{numeric}, '');
206 is($charinfo->{mirrored}, 'N');
207 is($charinfo->{unicode10}, '');
208 is($charinfo->{comment}, '');
209 is($charinfo->{upper}, '');
210 is($charinfo->{lower}, '');
211 is($charinfo->{title}, '');
212 is($charinfo->{block}, 'CJK Unified Ideographs');
213 is($charinfo->{script}, 'Han');
215 use Unicode::UCD qw(charblock charscript);
217 # 0x0590 is in the Hebrew block but unused.
219 is(charblock(0x590), 'Hebrew', '0x0590 - Hebrew unused charblock');
220 is(charscript(0x590), undef, '0x0590 - Hebrew unused charscript');
222 $charinfo = charinfo(0xbe);
224 is($charinfo->{code}, '00BE', 'VULGAR FRACTION THREE QUARTERS');
225 is($charinfo->{name}, 'VULGAR FRACTION THREE QUARTERS');
226 is($charinfo->{category}, 'No');
227 is($charinfo->{combining}, '0');
228 is($charinfo->{bidi}, 'ON');
229 is($charinfo->{decomposition}, '<fraction> 0033 2044 0034');
230 is($charinfo->{decimal}, '');
231 is($charinfo->{digit}, '');
232 is($charinfo->{numeric}, '3/4');
233 is($charinfo->{mirrored}, 'N');
234 is($charinfo->{unicode10}, 'FRACTION THREE QUARTERS');
235 is($charinfo->{comment}, '');
236 is($charinfo->{upper}, '');
237 is($charinfo->{lower}, '');
238 is($charinfo->{title}, '');
239 is($charinfo->{block}, 'Latin-1 Supplement');
240 is($charinfo->{script}, 'Common');
242 use Unicode::UCD qw(charblocks charscripts);
244 my $charblocks = charblocks();
246 ok(exists $charblocks->{Thai}, 'Thai charblock exists');
247 is($charblocks->{Thai}->[0]->[0], hex('0e00'));
248 ok(!exists $charblocks->{PigLatin}, 'PigLatin charblock does not exist');
250 my $charscripts = charscripts();
252 ok(exists $charscripts->{Armenian}, 'Armenian charscript exists');
253 is($charscripts->{Armenian}->[0]->[0], hex('0531'));
254 ok(!exists $charscripts->{PigLatin}, 'PigLatin charscript does not exist');
258 $charscript = charscript("12ab");
259 is($charscript, 'Ethiopic', 'Ethiopic charscript');
261 $charscript = charscript("0x12ab");
262 is($charscript, 'Ethiopic');
264 $charscript = charscript("U+12ab");
265 is($charscript, 'Ethiopic');
269 $ranges = charscript('Ogham');
270 is($ranges->[1]->[0], hex('1681'), 'Ogham charscript');
271 is($ranges->[1]->[1], hex('169a'));
273 use Unicode::UCD qw(charinrange);
275 $ranges = charscript('Cherokee');
276 ok(!charinrange($ranges, "139f"), 'Cherokee charscript');
277 ok( charinrange($ranges, "13a0"));
278 ok( charinrange($ranges, "13f4"));
279 ok(!charinrange($ranges, "13f5"));
281 use Unicode::UCD qw(general_categories);
283 my $gc = general_categories();
285 ok(exists $gc->{L}, 'has L');
286 is($gc->{L}, 'Letter', 'L is Letter');
287 is($gc->{Lu}, 'UppercaseLetter', 'Lu is UppercaseLetter');
289 use Unicode::UCD qw(bidi_types);
291 my $bt = bidi_types();
293 ok(exists $bt->{L}, 'has L');
294 is($bt->{L}, 'Left-to-Right', 'L is Left-to-Right');
295 is($bt->{AL}, 'Right-to-Left Arabic', 'AL is Right-to-Left Arabic');
297 # If this fails, then maybe one should look at the Unicode changes to see
298 # what else might need to be updated.
299 is(Unicode::UCD::UnicodeVersion, '5.2.0', 'UnicodeVersion');
301 use Unicode::UCD qw(compexcl);
303 ok(!compexcl(0x0100), 'compexcl');
304 ok( compexcl(0x0958));
306 use Unicode::UCD qw(casefold);
310 $casefold = casefold(0x41);
312 is($casefold->{code}, '0041', 'casefold 0x41 code');
313 is($casefold->{status}, 'C', 'casefold 0x41 status');
314 is($casefold->{mapping}, '0061', 'casefold 0x41 mapping');
315 is($casefold->{full}, '0061', 'casefold 0x41 full');
316 is($casefold->{simple}, '0061', 'casefold 0x41 simple');
317 is($casefold->{turkic}, "", 'casefold 0x41 turkic');
319 $casefold = casefold(0xdf);
321 is($casefold->{code}, '00DF', 'casefold 0xDF code');
322 is($casefold->{status}, 'F', 'casefold 0xDF status');
323 is($casefold->{mapping}, '0073 0073', 'casefold 0xDF mapping');
324 is($casefold->{full}, '0073 0073', 'casefold 0xDF full');
325 is($casefold->{simple}, "", 'casefold 0xDF simple');
326 is($casefold->{turkic}, "", 'casefold 0xDF turkic');
328 # Do different tests depending on if version <= 3.1, or not.
329 (my $version = Unicode::UCD::UnicodeVersion) =~ /^(\d+)\.(\d+)/;
330 if (defined $1 && ($1 <= 2 || $1 == 3 && defined $2 && $2 <= 1)) {
331 $casefold = casefold(0x130);
333 is($casefold->{code}, '0130', 'casefold 0x130 code');
334 is($casefold->{status}, 'I' , 'casefold 0x130 status');
335 is($casefold->{mapping}, '0069', 'casefold 0x130 mapping');
336 is($casefold->{full}, '0069', 'casefold 0x130 full');
337 is($casefold->{simple}, "0069", 'casefold 0x130 simple');
338 is($casefold->{turkic}, "0069", 'casefold 0x130 turkic');
340 $casefold = casefold(0x131);
342 is($casefold->{code}, '0131', 'casefold 0x131 code');
343 is($casefold->{status}, 'I' , 'casefold 0x131 status');
344 is($casefold->{mapping}, '0069', 'casefold 0x131 mapping');
345 is($casefold->{full}, '0069', 'casefold 0x131 full');
346 is($casefold->{simple}, "0069", 'casefold 0x131 simple');
347 is($casefold->{turkic}, "0069", 'casefold 0x131 turkic');
349 $casefold = casefold(0x49);
351 is($casefold->{code}, '0049', 'casefold 0x49 code');
352 is($casefold->{status}, 'C' , 'casefold 0x49 status');
353 is($casefold->{mapping}, '0069', 'casefold 0x49 mapping');
354 is($casefold->{full}, '0069', 'casefold 0x49 full');
355 is($casefold->{simple}, "0069", 'casefold 0x49 simple');
356 is($casefold->{turkic}, "0131", 'casefold 0x49 turkic');
358 $casefold = casefold(0x130);
360 is($casefold->{code}, '0130', 'casefold 0x130 code');
361 is($casefold->{status}, 'F' , 'casefold 0x130 status');
362 is($casefold->{mapping}, '0069 0307', 'casefold 0x130 mapping');
363 is($casefold->{full}, '0069 0307', 'casefold 0x130 full');
364 is($casefold->{simple}, "", 'casefold 0x130 simple');
365 is($casefold->{turkic}, "0069", 'casefold 0x130 turkic');
368 $casefold = casefold(0x1F88);
370 is($casefold->{code}, '1F88', 'casefold 0x1F88 code');
371 is($casefold->{status}, 'S' , 'casefold 0x1F88 status');
372 is($casefold->{mapping}, '1F80', 'casefold 0x1F88 mapping');
373 is($casefold->{full}, '1F00 03B9', 'casefold 0x1F88 full');
374 is($casefold->{simple}, '1F80', 'casefold 0x1F88 simple');
375 is($casefold->{turkic}, "", 'casefold 0x1F88 turkic');
379 use Unicode::UCD qw(casespec);
385 $casespec = casespec(0xdf);
387 ok($casespec->{code} eq '00DF' &&
388 $casespec->{lower} eq '00DF' &&
389 $casespec->{title} eq '0053 0073' &&
390 $casespec->{upper} eq '0053 0053' &&
391 !defined $casespec->{condition}, 'casespec 0xDF');
393 $casespec = casespec(0x307);
395 ok($casespec->{az}->{code} eq '0307' &&
396 !defined $casespec->{az}->{lower} &&
397 $casespec->{az}->{title} eq '0307' &&
398 $casespec->{az}->{upper} eq '0307' &&
399 $casespec->{az}->{condition} eq 'az After_I',
402 # perl #7305 UnicodeCD::compexcl is weird
404 for (1) {my $a=compexcl $_}
405 ok(1, 'compexcl read-only $_: perl #7305');
406 map {compexcl $_} %{{1=>2}};
407 ok(1, 'compexcl read-only hash: perl #7305');
409 is(Unicode::UCD::_getcode('123'), 123, "_getcode(123)");
410 is(Unicode::UCD::_getcode('0123'), 0x123, "_getcode(0123)");
411 is(Unicode::UCD::_getcode('0x123'), 0x123, "_getcode(0x123)");
412 is(Unicode::UCD::_getcode('0X123'), 0x123, "_getcode(0X123)");
413 is(Unicode::UCD::_getcode('U+123'), 0x123, "_getcode(U+123)");
414 is(Unicode::UCD::_getcode('u+123'), 0x123, "_getcode(u+123)");
415 is(Unicode::UCD::_getcode('U+1234'), 0x1234, "_getcode(U+1234)");
416 is(Unicode::UCD::_getcode('U+12345'), 0x12345, "_getcode(U+12345)");
417 is(Unicode::UCD::_getcode('123x'), undef, "_getcode(123x)");
418 is(Unicode::UCD::_getcode('x123'), undef, "_getcode(x123)");
419 is(Unicode::UCD::_getcode('0x123x'), undef, "_getcode(x123)");
420 is(Unicode::UCD::_getcode('U+123x'), undef, "_getcode(x123)");
423 my $r1 = charscript('Latin');
425 is($n1, 42, "number of ranges in Latin script (Unicode 5.1.0)");
426 shift @$r1 while @$r1;
427 my $r2 = charscript('Latin');
428 is(@$r2, $n1, "modifying results should not mess up internal caches");
432 is(charinfo(0xdeadbeef), undef, "[perl #23273] warnings in Unicode::UCD");
435 use Unicode::UCD qw(namedseq);
437 is(namedseq("KATAKANA LETTER AINU P"), "\x{31F7}\x{309A}", "namedseq");
438 is(namedseq("KATAKANA LETTER AINU Q"), undef);
439 is(namedseq(), undef);
440 is(namedseq(qw(foo bar)), undef);
441 my @ns = namedseq("KATAKANA LETTER AINU P");
446 is($ns{"KATAKANA LETTER AINU P"}, "\x{31F7}\x{309A}");