add tests for version::is_strict() and version::is_lax()
[p5sagit/p5-mst-13.2.git] / lib / Unicode / UCD.t
CommitLineData
#!perl -w
# Test script for Unicode::UCD.  The BEGIN block decides whether the script
# can run at all and points @INC at the library directory before any module
# is loaded.
BEGIN {
    # Skip everything when 'A' is not code 65 (reported as EBCDIC).
    unless (ord("A") == 65) {
        print "1..0 # Skip: EBCDIC\n";
        exit 0;
    }
    if (-d 't') {
        chdir 't';
    }
    @INC = $^O eq 'MacOS' ? "::lib" : '../lib';    # module parses @INC itself
    require Config;
    Config->import;
    # Skip as well when Storable is not among the built extensions.
    unless ($Config{'extensions'} =~ /\bStorable\b/) {
        print "1..0 # Skip: Storable was not built; Unicode::UCD uses Storable\n";
        exit 0;
    }
}

use strict;
use Unicode::UCD;
use Test::More;

# The fixed count doubles as a check that no assertion is silently skipped.
BEGIN { plan tests => 256 };

use Unicode::UCD 'charinfo';

my $charinfo;

$charinfo = charinfo(0);    # Null is often problematic, so test it.

# Expected value of each charinfo() field, in the order they are checked.
# Every assertion is named after its field so a failure identifies it.
for my $check (
    [ code          => '0000' ],
    [ name          => '<control>' ],
    [ category      => 'Cc' ],
    [ combining     => '0' ],
    [ bidi          => 'BN' ],
    [ decomposition => '' ],
    [ decimal       => '' ],
    [ digit         => '' ],
    [ numeric       => '' ],
    [ mirrored      => 'N' ],
    [ unicode10     => 'NULL' ],
    [ comment       => '' ],
    [ upper         => '' ],
    [ lower         => '' ],
    [ title         => '' ],
    [ block         => 'Basic Latin' ],
    [ script        => 'Common' ],
) {
    my ($field, $want) = @$check;
    is($charinfo->{$field}, $want, "<control>: $field");
}

$charinfo = charinfo(0x41);

# Expected value of each charinfo() field, in the order they are checked.
# Every assertion is named after its field so a failure identifies it.
for my $check (
    [ code          => '0041' ],
    [ name          => 'LATIN CAPITAL LETTER A' ],
    [ category      => 'Lu' ],
    [ combining     => '0' ],
    [ bidi          => 'L' ],
    [ decomposition => '' ],
    [ decimal       => '' ],
    [ digit         => '' ],
    [ numeric       => '' ],
    [ mirrored      => 'N' ],
    [ unicode10     => '' ],
    [ comment       => '' ],
    [ upper         => '' ],
    [ lower         => '0061' ],
    [ title         => '' ],
    [ block         => 'Basic Latin' ],
    [ script        => 'Latin' ],
) {
    my ($field, $want) = @$check;
    is($charinfo->{$field}, $want, "LATIN CAPITAL LETTER A: $field");
}

$charinfo = charinfo(0x100);

# Expected value of each charinfo() field, in the order they are checked.
# Every assertion is named after its field so a failure identifies it.
for my $check (
    [ code          => '0100' ],
    [ name          => 'LATIN CAPITAL LETTER A WITH MACRON' ],
    [ category      => 'Lu' ],
    [ combining     => '0' ],
    [ bidi          => 'L' ],
    [ decomposition => '0041 0304' ],
    [ decimal       => '' ],
    [ digit         => '' ],
    [ numeric       => '' ],
    [ mirrored      => 'N' ],
    [ unicode10     => 'LATIN CAPITAL LETTER A MACRON' ],
    [ comment       => '' ],
    [ upper         => '' ],
    [ lower         => '0101' ],
    [ title         => '' ],
    [ block         => 'Latin Extended-A' ],
    [ script        => 'Latin' ],
) {
    my ($field, $want) = @$check;
    is($charinfo->{$field}, $want, "LATIN CAPITAL LETTER A WITH MACRON: $field");
}

# 0x0590 is in the Hebrew block but unused.

$charinfo = charinfo(0x590);

# Every field is expected to be undef for this code point.  Each assertion
# is named after its field so a failure identifies it.
for my $field (qw(
    code name category combining bidi decomposition decimal digit numeric
    mirrored unicode10 comment upper lower title block script
)) {
    is($charinfo->{$field}, undef, "0x0590 - unused Hebrew: $field");
}

# 0x05d0 is in the Hebrew block and used.

$charinfo = charinfo(0x5d0);

# Expected value of each charinfo() field, in the order they are checked.
# Every assertion is named after its field so a failure identifies it.
for my $check (
    [ code          => '05D0' ],
    [ name          => 'HEBREW LETTER ALEF' ],
    [ category      => 'Lo' ],
    [ combining     => '0' ],
    [ bidi          => 'R' ],
    [ decomposition => '' ],
    [ decimal       => '' ],
    [ digit         => '' ],
    [ numeric       => '' ],
    [ mirrored      => 'N' ],
    [ unicode10     => '' ],
    [ comment       => '' ],
    [ upper         => '' ],
    [ lower         => '' ],
    [ title         => '' ],
    [ block         => 'Hebrew' ],
    [ script        => 'Hebrew' ],
) {
    my ($field, $want) = @$check;
    is($charinfo->{$field}, $want, "05D0 - used Hebrew: $field");
}

# An open syllable in Hangul.

$charinfo = charinfo(0xAC00);

# Expected value of each charinfo() field, in the order they are checked.
# Note that the decomposition is expected to be undef, not ''.
# Every assertion is named after its field so a failure identifies it.
for my $check (
    [ code          => 'AC00' ],
    [ name          => 'HANGUL SYLLABLE-AC00' ],
    [ category      => 'Lo' ],
    [ combining     => '0' ],
    [ bidi          => 'L' ],
    [ decomposition => undef ],
    [ decimal       => '' ],
    [ digit         => '' ],
    [ numeric       => '' ],
    [ mirrored      => 'N' ],
    [ unicode10     => '' ],
    [ comment       => '' ],
    [ upper         => '' ],
    [ lower         => '' ],
    [ title         => '' ],
    [ block         => 'Hangul Syllables' ],
    [ script        => 'Hangul' ],
) {
    my ($field, $want) = @$check;
    is($charinfo->{$field}, $want, "HANGUL SYLLABLE-AC00: $field");
}

# A closed syllable in Hangul.

$charinfo = charinfo(0xAE00);

# Expected value of each charinfo() field, in the order they are checked.
# Note that the decomposition is expected to be undef, not ''.
# Every assertion is named after its field so a failure identifies it.
for my $check (
    [ code          => 'AE00' ],
    [ name          => 'HANGUL SYLLABLE-AE00' ],
    [ category      => 'Lo' ],
    [ combining     => '0' ],
    [ bidi          => 'L' ],
    [ decomposition => undef ],
    [ decimal       => '' ],
    [ digit         => '' ],
    [ numeric       => '' ],
    [ mirrored      => 'N' ],
    [ unicode10     => '' ],
    [ comment       => '' ],
    [ upper         => '' ],
    [ lower         => '' ],
    [ title         => '' ],
    [ block         => 'Hangul Syllables' ],
    [ script        => 'Hangul' ],
) {
    my ($field, $want) = @$check;
    is($charinfo->{$field}, $want, "HANGUL SYLLABLE-AE00: $field");
}

$charinfo = charinfo(0x1D400);

# Expected value of each charinfo() field, in the order they are checked.
# Every assertion is named after its field so a failure identifies it.
for my $check (
    [ code          => '1D400' ],
    [ name          => 'MATHEMATICAL BOLD CAPITAL A' ],
    [ category      => 'Lu' ],
    [ combining     => '0' ],
    [ bidi          => 'L' ],
    [ decomposition => '<font> 0041' ],
    [ decimal       => '' ],
    [ digit         => '' ],
    [ numeric       => '' ],
    [ mirrored      => 'N' ],
    [ unicode10     => '' ],
    [ comment       => '' ],
    [ upper         => '' ],
    [ lower         => '' ],
    [ title         => '' ],
    [ block         => 'Mathematical Alphanumeric Symbols' ],
    [ script        => 'Common' ],
) {
    my ($field, $want) = @$check;
    is($charinfo->{$field}, $want, "MATHEMATICAL BOLD CAPITAL A: $field");
}

$charinfo = charinfo(0x9FBA);    #Bug 58428

# Expected value of each charinfo() field, in the order they are checked.
# Every assertion is named after its field so a failure identifies it.
for my $check (
    [ code          => '9FBA' ],
    [ name          => 'CJK UNIFIED IDEOGRAPH-9FBA' ],
    [ category      => 'Lo' ],
    [ combining     => '0' ],
    [ bidi          => 'L' ],
    [ decomposition => '' ],
    [ decimal       => '' ],
    [ digit         => '' ],
    [ numeric       => '' ],
    [ mirrored      => 'N' ],
    [ unicode10     => '' ],
    [ comment       => '' ],
    [ upper         => '' ],
    [ lower         => '' ],
    [ title         => '' ],
    [ block         => 'CJK Unified Ideographs' ],
    [ script        => 'Han' ],
) {
    my ($field, $want) = @$check;
    is($charinfo->{$field}, $want, "U+9FBA: $field");
}

use Unicode::UCD qw(charblock charscript);

# 0x0590 is in the Hebrew block but unused.
# charblock() is still expected to name the block; charscript() is expected
# to give undef.
{
    my $block_name = charblock(0x590);
    is($block_name, 'Hebrew', '0x0590 - Hebrew unused charblock');

    my $script_name = charscript(0x590);
    is($script_name, undef, '0x0590 - Hebrew unused charscript');
}

$charinfo = charinfo(0xbe);

# Expected value of each charinfo() field, in the order they are checked.
# Every assertion is named after its field so a failure identifies it.
for my $check (
    [ code          => '00BE' ],
    [ name          => 'VULGAR FRACTION THREE QUARTERS' ],
    [ category      => 'No' ],
    [ combining     => '0' ],
    [ bidi          => 'ON' ],
    [ decomposition => '<fraction> 0033 2044 0034' ],
    [ decimal       => '' ],
    [ digit         => '' ],
    [ numeric       => '3/4' ],
    [ mirrored      => 'N' ],
    [ unicode10     => 'FRACTION THREE QUARTERS' ],
    [ comment       => '' ],
    [ upper         => '' ],
    [ lower         => '' ],
    [ title         => '' ],
    [ block         => 'Latin-1 Supplement' ],
    [ script        => 'Common' ],
) {
    my ($field, $want) = @$check;
    is($charinfo->{$field}, $want, "VULGAR FRACTION THREE QUARTERS: $field");
}

use Unicode::UCD qw(charblocks charscripts);

# charblocks() and charscripts() each return a hash reference keyed by name.
# For a known name the test checks element [0][0] of its entry; a made-up
# name must be absent.

my $charblocks = charblocks();

ok(exists $charblocks->{Thai}, 'Thai charblock exists');
is($charblocks->{Thai}->[0]->[0], hex('0e00'),
   'Thai charblock: first range starts at 0x0e00');
ok(!exists $charblocks->{PigLatin}, 'PigLatin charblock does not exist');

my $charscripts = charscripts();

ok(exists $charscripts->{Armenian}, 'Armenian charscript exists');
is($charscripts->{Armenian}->[0]->[0], hex('0531'),
   'Armenian charscript: first range starts at 0x0531');
ok(!exists $charscripts->{PigLatin}, 'PigLatin charscript does not exist');

my $charscript;

# The same code point written three ways must give the same script.
for my $code ("12ab", "0x12ab", "U+12ab") {
    $charscript = charscript($code);
    is($charscript, 'Ethiopic', "Ethiopic charscript for $code");
}

my $ranges;

# Called with a script name, charscript() returns a reference to a list of
# ranges; the second entry of the Ogham list is checked here.
$ranges = charscript('Ogham');
is($ranges->[1]->[0], hex('1681'), 'Ogham charscript: second range start');
is($ranges->[1]->[1], hex('169a'), 'Ogham charscript: second range end');

use Unicode::UCD qw(charinrange);

$ranges = charscript('Cherokee');

# Probe one code point on each side of both ends of the expected span.
# The second element says whether charinrange() should report a hit.
for my $probe (
    [ "139f", 0 ],
    [ "13a0", 1 ],
    [ "13f4", 1 ],
    [ "13f5", 0 ],
) {
    my ($code, $inside) = @$probe;
    my $found = charinrange($ranges, $code);
    ok($inside ? $found : !$found,
       "Cherokee charscript: $code is " . ($inside ? "in range" : "out of range"));
}

use Unicode::UCD qw(general_categories);

my $gc = general_categories();

# Short general category names map to their long names.
ok(exists $gc->{L}, 'has L');
for my $expect (
    [ L  => 'Letter',          'L is Letter' ],
    [ Lu => 'UppercaseLetter', 'Lu is UppercaseLetter' ],
) {
    my ($short, $long, $label) = @$expect;
    is($gc->{$short}, $long, $label);
}

use Unicode::UCD qw(bidi_types);

my $bt = bidi_types();

# Short bidi type names map to their long names.
ok(exists $bt->{L}, 'has L');
for my $expect (
    [ L  => 'Left-to-Right',        'L is Left-to-Right' ],
    [ AL => 'Right-to-Left Arabic', 'AL is Right-to-Left Arabic' ],
) {
    my ($short, $long, $label) = @$expect;
    is($bt->{$short}, $long, $label);
}

# If this fails, then maybe one should look at the Unicode changes to see
# what else might need to be updated.
{
    my $reported = Unicode::UCD::UnicodeVersion();
    is($reported, '5.2.0', 'UnicodeVersion');
}

use Unicode::UCD qw(compexcl);

# One code point for which compexcl() must be false and one for which it
# must be true.  Both assertions are named so a failure identifies the
# code point.
ok(!compexcl(0x0100), 'compexcl: false for 0x0100');
ok( compexcl(0x0958), 'compexcl: true for 0x0958');

use Unicode::UCD qw(casefold);

my $casefold;

# Each case pairs a code point with the casefold() fields expected for it:
# field name, expected value, assertion description.
for my $case (
    [ 0x41, [
        [ code    => '0041', 'casefold 0x41 code' ],
        [ status  => 'C',    'casefold 0x41 status' ],
        [ mapping => '0061', 'casefold 0x41 mapping' ],
        [ full    => '0061', 'casefold 0x41 full' ],
        [ simple  => '0061', 'casefold 0x41 simple' ],
        [ turkic  => "",     'casefold 0x41 turkic' ],
    ] ],
    [ 0xdf, [
        [ code    => '00DF',      'casefold 0xDF code' ],
        [ status  => 'F',         'casefold 0xDF status' ],
        [ mapping => '0073 0073', 'casefold 0xDF mapping' ],
        [ full    => '0073 0073', 'casefold 0xDF full' ],
        [ simple  => "",          'casefold 0xDF simple' ],
        [ turkic  => "",          'casefold 0xDF turkic' ],
    ] ],
) {
    my ($code_point, $checks) = @$case;
    $casefold = casefold($code_point);
    for my $check (@$checks) {
        my ($field, $want, $label) = @$check;
        is($casefold->{$field}, $want, $label);
    }
}

# Do different tests depending on if version <= 3.1, or not.
my $version = Unicode::UCD::UnicodeVersion();
{
    # Leading "major.minor" of the version string; both stay undef when the
    # string does not start that way, which selects the newer expectations.
    my ($major, $minor) = $version =~ /^(\d+)\.(\d+)/;
    my $is_3_1_or_older = defined $major
        && ($major <= 2 || ($major == 3 && defined $minor && $minor <= 1));

    # Each case: code point, then [field, expected value, description] rows.
    my @cases = $is_3_1_or_older
        ? (
            [ 0x130, [
                [ code    => '0130', 'casefold 0x130 code' ],
                [ status  => 'I',    'casefold 0x130 status' ],
                [ mapping => '0069', 'casefold 0x130 mapping' ],
                [ full    => '0069', 'casefold 0x130 full' ],
                [ simple  => "0069", 'casefold 0x130 simple' ],
                [ turkic  => "0069", 'casefold 0x130 turkic' ],
            ] ],
            [ 0x131, [
                [ code    => '0131', 'casefold 0x131 code' ],
                [ status  => 'I',    'casefold 0x131 status' ],
                [ mapping => '0069', 'casefold 0x131 mapping' ],
                [ full    => '0069', 'casefold 0x131 full' ],
                [ simple  => "0069", 'casefold 0x131 simple' ],
                [ turkic  => "0069", 'casefold 0x131 turkic' ],
            ] ],
        )
        : (
            [ 0x49, [
                [ code    => '0049', 'casefold 0x49 code' ],
                [ status  => 'C',    'casefold 0x49 status' ],
                [ mapping => '0069', 'casefold 0x49 mapping' ],
                [ full    => '0069', 'casefold 0x49 full' ],
                [ simple  => "0069", 'casefold 0x49 simple' ],
                [ turkic  => "0131", 'casefold 0x49 turkic' ],
            ] ],
            [ 0x130, [
                [ code    => '0130',      'casefold 0x130 code' ],
                [ status  => 'F',         'casefold 0x130 status' ],
                [ mapping => '0069 0307', 'casefold 0x130 mapping' ],
                [ full    => '0069 0307', 'casefold 0x130 full' ],
                [ simple  => "",          'casefold 0x130 simple' ],
                [ turkic  => "0069",      'casefold 0x130 turkic' ],
            ] ],
        );

    for my $case (@cases) {
        my ($code_point, $checks) = @$case;
        $casefold = casefold($code_point);
        for my $check (@$checks) {
            my ($field, $want, $label) = @$check;
            is($casefold->{$field}, $want, $label);
        }
    }
}

$casefold = casefold(0x1F88);

# Expected casefold() fields for 0x1F88, in the order they are checked.
for my $check (
    [ code    => '1F88' ],
    [ status  => 'S' ],
    [ mapping => '1F80' ],
    [ full    => '1F00 03B9' ],
    [ simple  => '1F80' ],
    [ turkic  => "" ],
) {
    my ($field, $want) = @$check;
    is($casefold->{$field}, $want, "casefold 0x1F88 $field");
}

# casefold() must give a false value for 0x20; the assertion is named so a
# failure is identifiable in the output.
ok(!casefold(0x20), 'casefold 0x20 returns false');

use Unicode::UCD qw(casespec);

my $casespec;

ok(!casespec(0x41), 'casespec 0x41 returns false');

# The remaining checks compare five fields at once.  is_deeply() keeps this
# at one assertion per code point but, unlike a chain of "eq" tests inside
# ok(), reports which field differed when it fails.
{
    my @fields = qw(code lower title upper condition);

    $casespec = casespec(0xdf);

    my %got_df = map { ($_ => $casespec->{$_}) } @fields;
    is_deeply(
        \%got_df,
        {
            code      => '00DF',
            lower     => '00DF',
            title     => '0053 0073',
            upper     => '0053 0053',
            condition => undef,
        },
        'casespec 0xDF');

    $casespec = casespec(0x307);

    # For 0x307 the fields are read from the entry under the 'az' key.
    my %got_307 = map { ($_ => $casespec->{az}->{$_}) } @fields;
    is_deeply(
        \%got_307,
        {
            code      => '0307',
            lower     => undef,
            title     => '0307',
            upper     => '0307',
            condition => 'az After_I',
        },
        'casespec 0x307');
}

# perl #7305 UnicodeCD::compexcl is weird
#
# Each statement only has to run to completion: the ok(1, ...) that follows
# it is reached only if compexcl() did not die.  The loop and map forms are
# kept exactly as written (including map in void context).
# NOTE(review): presumably these are the constructs from the bug report -
# confirm against perl #7305 before restructuring.

for (1) { my $a = compexcl $_ }
ok(1, 'compexcl read-only $_: perl #7305');

map { compexcl $_ } %{{1=>2}};
ok(1, 'compexcl read-only hash: perl #7305');

# Unicode::UCD::_getcode() turns a textual code point into a number and is
# expected to give undef for text it does not accept.  The description is
# built from the input itself, so it can never name a different input than
# the one being tested (the last two cases used to be labelled "x123").
for my $case (
    [ '123',     123     ],
    [ '0123',    0x123   ],
    [ '0x123',   0x123   ],
    [ '0X123',   0x123   ],
    [ 'U+123',   0x123   ],
    [ 'u+123',   0x123   ],
    [ 'U+1234',  0x1234  ],
    [ 'U+12345', 0x12345 ],
    [ '123x',    undef   ],
    [ 'x123',    undef   ],
    [ '0x123x',  undef   ],
    [ 'U+123x',  undef   ],
) {
    my ($input, $want) = @$case;
    is(Unicode::UCD::_getcode($input), $want, "_getcode($input)");
}

# Emptying the list returned by charscript() must not affect what a later
# call returns.
{
    my $r1 = charscript('Latin');
    my $n1 = @$r1;
    # The description names the Unicode version this file asserts elsewhere
    # (5.2.0); it previously still said 5.1.0.
    is($n1, 42, "number of ranges in Latin script (Unicode 5.2.0)");
    shift @$r1 while @$r1;
    my $r2 = charscript('Latin');
    is(scalar @$r2, $n1, "modifying results should not mess up internal caches");
}

# charinfo() on a value far above any code point tested elsewhere in this
# file is expected to give undef.
{
    my $info = charinfo(0xdeadbeef);
    is($info, undef, "[perl #23273] warnings in Unicode::UCD");
}

use Unicode::UCD qw(namedseq);

# namedseq() is exercised in both scalar and list context; the call forms
# are kept exactly as they were because the result depends on the context.
# Every assertion is named so a failure identifies the case.

# Scalar context (imposed by is()).
is(namedseq("KATAKANA LETTER AINU P"), "\x{31F7}\x{309A}",
   "namedseq: known name gives its sequence as a string");
is(namedseq("KATAKANA LETTER AINU Q"), undef,
   "namedseq: unknown name gives undef");
is(namedseq(), undef,
   "namedseq: no argument gives undef in scalar context");
is(namedseq(qw(foo bar)), undef,
   "namedseq: two arguments give undef");

# List context with a known name.
my @ns = namedseq("KATAKANA LETTER AINU P");
is(scalar @ns, 2, "namedseq: known name gives two elements in list context");
is($ns[0], 0x31F7, "namedseq: first element is 0x31F7");
is($ns[1], 0x309A, "namedseq: second element is 0x309A");

# List context without arguments, assigned to a hash.
my %ns = namedseq();
is($ns{"KATAKANA LETTER AINU P"}, "\x{31F7}\x{309A}",
   "namedseq: no argument in list context maps names to sequences");

@ns = namedseq(42);
is(scalar @ns, 0, "namedseq: unknown name gives an empty list");
