Commit | Line | Data |
45394607 |
1 | |
4a2e806c |
2 | BEGIN { |
3 | if (ord("A") == 193) { |
4 | print "1..0 # Unicode::Collate not ported to EBCDIC\n"; |
5 | exit 0; |
6 | } |
7 | } |
8 | |
0116f5dc |
9 | BEGIN { |
10 | if ($ENV{PERL_CORE}) { |
11 | chdir('t') if -d 't'; |
63c6dcc1 |
12 | @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); |
0116f5dc |
13 | } |
14 | } |
15 | |
45394607 |
16 | use Test; |
4d36a948 |
17 | BEGIN { plan tests => 194 }; |
45394607 |
18 | use Unicode::Collate; |
45394607 |
19 | |
4d36a948 |
20 | our $IsEBCDIC = ord("A") != 0x41; |
21 | |
45394607 |
22 | ######################### |
23 | |
0116f5dc |
24 | ok(1); # If we made it this far, we're ok. |
25 | |
26 | my $UCA_Version = "9"; |
3164dd77 |
27 | |
28 | ok(Unicode::Collate::UCA_Version, $UCA_Version); |
29 | ok(Unicode::Collate->UCA_Version, $UCA_Version); |
30 | |
5398038e |
31 | my $Collator = Unicode::Collate->new( |
45394607 |
32 | table => 'keys.txt', |
33 | normalization => undef, |
34 | ); |
35 | |
5398038e |
36 | ok(ref $Collator, "Unicode::Collate"); |
45394607 |
37 | |
3164dd77 |
38 | ok($Collator->UCA_Version, $UCA_Version); |
39 | ok($Collator->UCA_Version(), $UCA_Version); |
40 | |
45394607 |
41 | ok( |
5398038e |
42 | join(':', $Collator->sort( |
45394607 |
43 | qw/ lib strict Carp ExtUtils CGI Time warnings Math overload Pod CPAN / |
44 | ) ), |
45 | join(':', |
46 | qw/ Carp CGI CPAN ExtUtils lib Math overload Pod strict Time warnings / |
47 | ), |
48 | ); |
49 | |
0116f5dc |
50 | ok($Collator->cmp("", ""), 0); |
51 | ok($Collator->eq("", "")); |
52 | ok($Collator->cmp("", "perl"), -1); |
53 | |
54 | ############## |
55 | |
4d36a948 |
56 | # Use pack('U'), not chr(), for Perl 5.6.1. |
57 | my $A_acute = pack('U', $IsEBCDIC ? 0x65 : 0xC1); |
58 | my $a_acute = pack('U', $IsEBCDIC ? 0x45 : 0xE1); |
45394607 |
59 | my $acute = pack('U', 0x0301); |
60 | |
caffd4cf |
61 | ok($Collator->cmp("A$acute", $A_acute), 0); # @version 3.1.1 (prev: -1) |
0116f5dc |
62 | ok($Collator->cmp($a_acute, $A_acute), -1); |
4d36a948 |
63 | ok($Collator->eq("A\cA$acute", $A_acute)); # UCA v9. \cA is invariant. |
0116f5dc |
64 | |
65 | my %old_level = $Collator->change(level => 1); |
66 | ok($Collator->eq("A$acute", $A_acute)); |
67 | ok($Collator->eq("A", $A_acute)); |
68 | |
69 | ok($Collator->change(level => 2)->eq($a_acute, $A_acute)); |
70 | ok($Collator->lt("A", $A_acute)); |
71 | |
72 | ok($Collator->change(%old_level)->lt("A", $A_acute)); |
73 | ok($Collator->lt("A", $A_acute)); |
74 | ok($Collator->lt("A", $a_acute)); |
75 | ok($Collator->lt($a_acute, $A_acute)); |
45394607 |
76 | |
809c7673 |
77 | ############## |
78 | |
79 | eval { require Unicode::Normalize }; |
45394607 |
80 | |
4d36a948 |
81 | if (!$@ && !$IsEBCDIC) { |
45394607 |
82 | my $NFD = Unicode::Collate->new( |
caffd4cf |
83 | table => undef, |
905aa9f0 |
84 | entry => <<'ENTRIES', |
caffd4cf |
85 | 0430 ; [.0CB5.0020.0002.0430] # CYRILLIC SMALL LETTER A |
86 | 0410 ; [.0CB5.0020.0008.0410] # CYRILLIC CAPITAL LETTER A |
87 | 04D3 ; [.0CBD.0020.0002.04D3] # CYRILLIC SMALL LETTER A WITH DIAERESIS |
88 | 0430 0308 ; [.0CBD.0020.0002.04D3] # CYRILLIC SMALL LETTER A WITH DIAERESIS |
89 | 04D2 ; [.0CBD.0020.0008.04D2] # CYRILLIC CAPITAL LETTER A WITH DIAERESIS |
90 | 0410 0308 ; [.0CBD.0020.0008.04D2] # CYRILLIC CAPITAL LETTER A WITH DIAERESIS |
91 | 0430 3099 ; [.0CBE.0020.0002.04D3] # A WITH KATAKANA VOICED |
92 | 0430 3099 0308 ; [.0CBF.0020.0002.04D3] # A WITH KATAKANA VOICED, DIAERESIS |
905aa9f0 |
93 | ENTRIES |
45394607 |
94 | ); |
905aa9f0 |
95 | ok($NFD->eq("\x{4D3}\x{325}", "\x{430}\x{308}\x{325}")); |
96 | ok($NFD->lt("\x{430}\x{308}A", "\x{430}\x{308}B")); |
97 | ok($NFD->lt("\x{430}\x{3099}B", "\x{430}\x{308}\x{3099}A")); |
98 | ok($NFD->eq("\x{0430}\x{3099}\x{309A}\x{0308}", |
99 | "\x{0430}\x{309A}\x{3099}\x{0308}") ); |
45394607 |
100 | } |
809c7673 |
101 | else { |
d16e9e3d |
102 | ok(1); |
905aa9f0 |
103 | ok(1); |
104 | ok(1); |
105 | ok(1); |
45394607 |
106 | } |
107 | |
809c7673 |
108 | ############## |
109 | |
110 | my $trad = Unicode::Collate->new( |
45394607 |
111 | table => 'keys.txt', |
112 | normalization => undef, |
809c7673 |
113 | ignoreName => qr/HANGUL|HIRAGANA|KATAKANA|BOPOMOFO/, |
114 | level => 4, |
115 | entry => << 'ENTRIES', |
caffd4cf |
116 | 0063 0068 ; [.0A3F.0020.0002.0063] % "ch" in traditional Spanish |
117 | 0043 0068 ; [.0A3F.0020.0008.0043] # "Ch" in traditional Spanish |
45394607 |
118 | ENTRIES |
119 | ); |
caffd4cf |
120 | # 0063 ; [.0A3D.0020.0002.0063] # LATIN SMALL LETTER C |
121 | # 0064 ; [.0A49.0020.0002.0064] # LATIN SMALL LETTER D |
4d36a948 |
122 | # Deutsch sz is included in 'keys.txt'; |
45394607 |
123 | |
124 | ok( |
809c7673 |
125 | join(':', $trad->sort( qw/ acha aca ada acia acka / ) ), |
126 | join(':', qw/ aca acia acka acha ada / ), |
45394607 |
127 | ); |
128 | |
129 | ok( |
809c7673 |
130 | join(':', $Collator->sort( qw/ acha aca ada acia acka / ) ), |
131 | join(':', qw/ aca acha acia acka ada / ), |
45394607 |
132 | ); |
caffd4cf |
133 | ok($trad->eq("ocho", "oc\cAho")); # UCA v9 |
4d36a948 |
134 | ok($trad->eq("ocho", "oc\0\cA\0\cBho")); # UCA v9 |
45394607 |
135 | |
45394607 |
136 | my $hiragana = "\x{3042}\x{3044}"; |
137 | my $katakana = "\x{30A2}\x{30A4}"; |
138 | |
809c7673 |
139 | # HIRAGANA and KATAKANA are ignorable via ignoreName |
140 | ok($trad->eq($hiragana, "")); |
141 | ok($trad->eq("", $katakana)); |
142 | ok($trad->eq($hiragana, $katakana)); |
143 | ok($trad->eq($katakana, $hiragana)); |
144 | |
145 | ############## |
146 | |
0116f5dc |
147 | $Collator->change(level => 2); |
809c7673 |
148 | |
0116f5dc |
149 | ok($Collator->{level}, 2); |
5398038e |
150 | |
151 | ok( $Collator->cmp("ABC","abc"), 0); |
152 | ok( $Collator->eq("ABC","abc") ); |
153 | ok( $Collator->le("ABC","abc") ); |
154 | ok( $Collator->cmp($hiragana, $katakana), 0); |
155 | ok( $Collator->eq($hiragana, $katakana) ); |
156 | ok( $Collator->ge($hiragana, $katakana) ); |
45394607 |
157 | |
5398038e |
158 | # hangul |
159 | ok( $Collator->eq("a\x{AC00}b", "a\x{1100}\x{1161}b") ); |
160 | ok( $Collator->eq("a\x{AE00}b", "a\x{1100}\x{1173}\x{11AF}b") ); |
161 | ok( $Collator->gt("a\x{AE00}b", "a\x{1100}\x{1173}b\x{11AF}") ); |
162 | ok( $Collator->lt("a\x{AC00}b", "a\x{AE00}b") ); |
163 | ok( $Collator->gt("a\x{D7A3}b", "a\x{C544}b") ); |
164 | ok( $Collator->lt("a\x{C544}b", "a\x{30A2}b") ); # hangul < hiragana |
45394607 |
165 | |
0116f5dc |
166 | $Collator->change(%old_level, katakana_before_hiragana => 1); |
45394607 |
167 | |
0116f5dc |
168 | ok($Collator->{level}, 4); |
45394607 |
169 | |
5398038e |
170 | ok( $Collator->cmp("abc", "ABC"), -1); |
171 | ok( $Collator->ne("abc", "ABC") ); |
172 | ok( $Collator->lt("abc", "ABC") ); |
173 | ok( $Collator->le("abc", "ABC") ); |
174 | ok( $Collator->cmp($hiragana, $katakana), 1); |
175 | ok( $Collator->ne($hiragana, $katakana) ); |
176 | ok( $Collator->gt($hiragana, $katakana) ); |
177 | ok( $Collator->ge($hiragana, $katakana) ); |
45394607 |
178 | |
0116f5dc |
179 | $Collator->change(upper_before_lower => 1); |
45394607 |
180 | |
5398038e |
181 | ok( $Collator->cmp("abc", "ABC"), 1); |
182 | ok( $Collator->ge("abc", "ABC"), 1); |
183 | ok( $Collator->gt("abc", "ABC"), 1); |
184 | ok( $Collator->cmp($hiragana, $katakana), 1); |
185 | ok( $Collator->ge($hiragana, $katakana), 1); |
186 | ok( $Collator->gt($hiragana, $katakana), 1); |
45394607 |
187 | |
0116f5dc |
188 | $Collator->change(katakana_before_hiragana => 0); |
45394607 |
189 | |
5398038e |
190 | ok( $Collator->cmp("abc", "ABC"), 1); |
191 | ok( $Collator->cmp($hiragana, $katakana), -1); |
45394607 |
192 | |
0116f5dc |
193 | $Collator->change(upper_before_lower => 0); |
45394607 |
194 | |
5398038e |
195 | ok( $Collator->cmp("abc", "ABC"), -1); |
196 | ok( $Collator->le("abc", "ABC") ); |
197 | ok( $Collator->cmp($hiragana, $katakana), -1); |
198 | ok( $Collator->lt($hiragana, $katakana) ); |
45394607 |
199 | |
809c7673 |
200 | ############## |
201 | |
202 | my $ignoreAE = Unicode::Collate->new( |
203 | table => 'keys.txt', |
204 | normalization => undef, |
205 | ignoreChar => qr/^[aAeE]$/, |
206 | ); |
207 | |
208 | ok($ignoreAE->eq("element","lament")); |
209 | ok($ignoreAE->eq("Perl","ePrl")); |
210 | |
211 | ############## |
212 | |
213 | my $onlyABC = Unicode::Collate->new( |
214 | table => undef, |
327745dc |
215 | normalization => undef, |
809c7673 |
216 | entry => << 'ENTRIES', |
217 | 0061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A |
218 | 0041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A |
219 | 0062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B |
220 | 0042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B |
221 | 0063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C |
222 | 0043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C |
223 | ENTRIES |
224 | ); |
225 | |
226 | ok( |
227 | join(':', $onlyABC->sort( qw/ ABA BAC cc A Ab cAc aB / ) ), |
228 | join(':', qw/ A aB Ab ABA BAC cAc cc / ), |
229 | ); |
230 | |
231 | ############## |
232 | |
233 | my $undefAE = Unicode::Collate->new( |
45394607 |
234 | table => 'keys.txt', |
235 | normalization => undef, |
809c7673 |
236 | undefChar => qr/^[aAeE]$/, |
45394607 |
237 | ); |
238 | |
809c7673 |
239 | ok($undefAE ->gt("edge","fog")); |
240 | ok($Collator->lt("edge","fog")); |
241 | ok($undefAE ->gt("lake","like")); |
242 | ok($Collator->lt("lake","like")); |
243 | |
244 | ############## |
45394607 |
245 | |
809c7673 |
246 | # Table is undefined, then no entry is defined. |
247 | |
248 | my $undef_table = Unicode::Collate->new( |
249 | table => undef, |
250 | normalization => undef, |
251 | level => 1, |
252 | ); |
253 | |
254 | # in the Unicode code point order |
255 | ok($undef_table->lt('', 'A')); |
256 | ok($undef_table->lt('ABC', 'B')); |
257 | |
258 | # Hangul should be decomposed (even w/o Unicode::Normalize). |
259 | |
260 | ok($undef_table->lt("Perl", "\x{AC00}")); |
261 | ok($undef_table->eq("\x{AC00}", "\x{1100}\x{1161}")); |
262 | ok($undef_table->eq("\x{AE00}", "\x{1100}\x{1173}\x{11AF}")); |
263 | ok($undef_table->lt("\x{AE00}", "\x{3042}")); |
264 | # U+AC00: Hangul GA |
265 | # U+AE00: Hangul GEUL |
266 | # U+3042: Hiragana A |
267 | |
268 | # Weight for CJK Ideographs is defined, though. |
269 | |
270 | ok($undef_table->lt("", "\x{4E00}")); |
271 | ok($undef_table->lt("\x{4E8C}","ABC")); |
272 | ok($undef_table->lt("\x{4E00}","\x{3042}")); |
273 | ok($undef_table->lt("\x{4E00}","\x{4E8C}")); |
274 | # U+4E00: Ideograph "ONE" |
275 | # U+4E8C: Ideograph "TWO" |
276 | |
277 | |
278 | ############## |
279 | |
280 | my $few_entries = Unicode::Collate->new( |
281 | entry => <<'ENTRIES', |
282 | 0050 ; [.0101.0020.0002.0050] # P |
283 | 0045 ; [.0102.0020.0002.0045] # E |
284 | 0052 ; [.0103.0020.0002.0052] # R |
285 | 004C ; [.0104.0020.0002.004C] # L |
286 | 1100 ; [.0105.0020.0002.1100] # Hangul Jamo initial G |
287 | 1175 ; [.0106.0020.0002.1175] # Hangul Jamo middle I |
288 | 5B57 ; [.0107.0020.0002.5B57] # CJK Ideograph "Letter" |
289 | ENTRIES |
290 | table => undef, |
291 | normalization => undef, |
292 | ); |
293 | |
294 | # defined before undefined |
295 | |
296 | my $sortABC = join '', |
297 | $few_entries->sort(split //, "ABCDEFGHIJKLMNOPQRSTUVWXYZ "); |
298 | |
299 | ok($sortABC eq "PERL ABCDFGHIJKMNOQSTUVWXYZ"); |
300 | |
301 | ok($few_entries->lt('E', 'D')); |
302 | ok($few_entries->lt("\x{5B57}", "\x{4E00}")); |
303 | ok($few_entries->lt("\x{AE30}", "\x{AC00}")); |
304 | |
305 | # Hangul must be decomposed. |
306 | |
307 | ok($few_entries->eq("\x{AC00}", "\x{1100}\x{1161}")); |
308 | |
309 | ############## |
310 | |
0116f5dc |
311 | my $all_undef_8 = Unicode::Collate->new( |
809c7673 |
312 | table => undef, |
313 | normalization => undef, |
314 | overrideCJK => undef, |
315 | overrideHangul => undef, |
0116f5dc |
316 | UCA_Version => 8, |
809c7673 |
317 | ); |
318 | |
319 | # All in the Unicode code point order. |
320 | # No hangul decomposition. |
321 | |
0116f5dc |
322 | ok($all_undef_8->lt("\x{3402}", "\x{4E00}")); |
323 | ok($all_undef_8->lt("\x{4DFF}", "\x{4E00}")); |
324 | ok($all_undef_8->lt("\x{4E00}", "\x{AC00}")); |
325 | ok($all_undef_8->gt("\x{AC00}", "\x{1100}\x{1161}")); |
326 | ok($all_undef_8->gt("\x{AC00}", "\x{ABFF}")); |
327 | |
328 | ############## |
329 | |
330 | my $all_undef_9 = Unicode::Collate->new( |
331 | table => undef, |
332 | normalization => undef, |
333 | overrideCJK => undef, |
334 | overrideHangul => undef, |
335 | UCA_Version => 9, |
336 | ); |
337 | |
338 | # CJK Ideo. < CJK ext A/B < Others. |
339 | # No hangul decomposition. |
340 | |
341 | ok($all_undef_9->lt("\x{4E00}", "\x{3402}")); |
342 | ok($all_undef_9->lt("\x{3402}", "\x{20000}")); |
343 | ok($all_undef_9->lt("\x{20000}", "\x{AC00}")); |
344 | ok($all_undef_9->gt("\x{AC00}", "\x{1100}\x{1161}")); |
caffd4cf |
345 | ok($all_undef_9->gt("\x{AC00}", "\x{ABFF}")); # U+ABFF: not assigned |
809c7673 |
346 | |
347 | ############## |
348 | |
349 | my $ignoreCJK = Unicode::Collate->new( |
350 | table => undef, |
351 | normalization => undef, |
352 | overrideCJK => sub {()}, |
353 | entry => <<'ENTRIES', |
354 | 5B57 ; [.0107.0020.0002.5B57] # CJK Ideograph "Letter" |
355 | ENTRIES |
356 | ); |
357 | |
358 | # All CJK Unified Ideographs except U+5B57 are ignored. |
359 | |
360 | ok($ignoreCJK->eq("\x{4E00}", "")); |
361 | ok($ignoreCJK->lt("\x{4E00}", "\0")); |
362 | ok($ignoreCJK->eq("Pe\x{4E00}rl", "Perl")); # U+4E00 is a CJK. |
363 | ok($ignoreCJK->gt("\x{4DFF}", "\x{4E00}")); # U+4DFF is not CJK. |
364 | ok($ignoreCJK->lt("Pe\x{5B57}rl", "Perl")); # 'r' is unassigned. |
365 | |
366 | ############## |
367 | |
368 | my $ignoreHangul = Unicode::Collate->new( |
369 | table => undef, |
370 | normalization => undef, |
371 | overrideHangul => sub {()}, |
372 | entry => <<'ENTRIES', |
373 | AE00 ; [.0100.0020.0002.AE00] # Hangul GEUL |
374 | ENTRIES |
375 | ); |
376 | |
377 | # All Hangul Syllables except U+AE00 are ignored. |
378 | |
379 | ok($ignoreHangul->eq("\x{AC00}", "")); |
380 | ok($ignoreHangul->lt("\x{AC00}", "\0")); |
381 | ok($ignoreHangul->lt("\x{AC00}", "\x{AE00}")); |
382 | ok($ignoreHangul->lt("\x{AC00}", "\x{1100}\x{1161}")); # Jamo are not ignored. |
383 | ok($ignoreHangul->lt("Pe\x{AE00}rl", "Perl")); # 'r' is unassigned. |
384 | |
385 | ############## |
386 | |
0116f5dc |
387 | my %origAlter = $Collator->change(alternate => 'Blanked'); |
809c7673 |
388 | |
0116f5dc |
389 | ok($Collator->lt("death", "de luge")); |
390 | ok($Collator->lt("de luge", "de-luge")); |
391 | ok($Collator->lt("de-luge", "deluge")); |
392 | ok($Collator->lt("deluge", "de\x{2010}luge")); |
393 | ok($Collator->lt("deluge", "de Luge")); |
809c7673 |
394 | |
0116f5dc |
395 | $Collator->change(alternate => 'Non-ignorable'); |
809c7673 |
396 | |
0116f5dc |
397 | ok($Collator->lt("de luge", "de Luge")); |
398 | ok($Collator->lt("de Luge", "de-luge")); |
399 | ok($Collator->lt("de-Luge", "de\x{2010}luge")); |
400 | ok($Collator->lt("de-luge", "death")); |
401 | ok($Collator->lt("death", "deluge")); |
809c7673 |
402 | |
0116f5dc |
403 | $Collator->change(alternate => 'Shifted'); |
809c7673 |
404 | |
0116f5dc |
405 | ok($Collator->lt("death", "de luge")); |
406 | ok($Collator->lt("de luge", "de-luge")); |
407 | ok($Collator->lt("de-luge", "deluge")); |
408 | ok($Collator->lt("deluge", "de Luge")); |
409 | ok($Collator->lt("de Luge", "deLuge")); |
809c7673 |
410 | |
0116f5dc |
411 | $Collator->change(alternate => 'Shift-Trimmed'); |
809c7673 |
412 | |
0116f5dc |
413 | ok($Collator->lt("death", "deluge")); |
414 | ok($Collator->lt("deluge", "de luge")); |
415 | ok($Collator->lt("de luge", "de-luge")); |
416 | ok($Collator->lt("de-luge", "deLuge")); |
417 | ok($Collator->lt("deLuge", "de Luge")); |
809c7673 |
418 | |
0116f5dc |
419 | $Collator->change(%origAlter); |
809c7673 |
420 | |
0116f5dc |
421 | ok($Collator->{alternate}, 'shifted'); |
809c7673 |
422 | |
423 | ############## |
424 | |
425 | my $overCJK = Unicode::Collate->new( |
426 | table => undef, |
427 | normalization => undef, |
428 | entry => <<'ENTRIES', |
429 | 0061 ; [.0101.0020.0002.0061] # latin a |
430 | 0041 ; [.0101.0020.0008.0041] # LATIN A |
431 | 4E00 ; [.B1FC.0030.0004.4E00] # Ideograph; B1FC = FFFF - 4E03. |
432 | ENTRIES |
433 | overrideCJK => sub { |
434 | my $u = 0xFFFF - $_[0]; # reversed |
435 | [$u, 0x20, 0x2, $u]; |
436 | }, |
437 | ); |
438 | |
439 | ok($overCJK->lt("a", "A")); # diff. at level 3. |
440 | ok($overCJK->lt( "\x{4E03}", "\x{4E00}")); # diff. at level 2. |
441 | ok($overCJK->lt("A\x{4E03}", "A\x{4E00}")); |
442 | ok($overCJK->lt("A\x{4E03}", "a\x{4E00}")); |
443 | ok($overCJK->lt("a\x{4E03}", "A\x{4E00}")); |
444 | |
445 | ############## |
446 | |
0116f5dc |
447 | # rearrange : 0x0E40..0x0E44, 0x0EC0..0x0EC4 (default) |
448 | |
449 | my %old_rearrange = $Collator->change(rearrange => undef); |
450 | |
451 | ok($Collator->gt("\x{0E41}A", "\x{0E40}B")); |
452 | ok($Collator->gt("A\x{0E41}A", "A\x{0E40}B")); |
453 | |
4d36a948 |
454 | $Collator->change(rearrange => [ 0x61 ]); |
455 | # U+0061, 'a': This is a Unicode value, never a native value. |
809c7673 |
456 | |
0116f5dc |
457 | ok($Collator->gt("ab", "AB")); # as 'ba' > 'AB' |
458 | |
459 | $Collator->change(%old_rearrange); |
460 | |
461 | ok($Collator->lt("ab", "AB")); |
809c7673 |
462 | ok($Collator->lt("\x{0E40}", "\x{0E41}")); |
463 | ok($Collator->lt("\x{0E40}A", "\x{0E41}B")); |
464 | ok($Collator->lt("\x{0E41}A", "\x{0E40}B")); |
465 | ok($Collator->lt("A\x{0E41}A", "A\x{0E40}B")); |
466 | |
0116f5dc |
467 | ok($all_undef_8->lt("\x{0E40}", "\x{0E41}")); |
468 | ok($all_undef_8->lt("\x{0E40}A", "\x{0E41}B")); |
469 | ok($all_undef_8->lt("\x{0E41}A", "\x{0E40}B")); |
470 | ok($all_undef_8->lt("A\x{0E41}A", "A\x{0E40}B")); |
809c7673 |
471 | |
472 | ############## |
473 | |
474 | my $no_rearrange = Unicode::Collate->new( |
475 | table => undef, |
476 | normalization => undef, |
477 | rearrange => [], |
478 | ); |
479 | |
480 | ok($no_rearrange->lt("A", "B")); |
481 | ok($no_rearrange->lt("\x{0E40}", "\x{0E41}")); |
482 | ok($no_rearrange->lt("\x{0E40}A", "\x{0E41}B")); |
483 | ok($no_rearrange->gt("\x{0E41}A", "\x{0E40}B")); |
484 | ok($no_rearrange->gt("A\x{0E41}A", "A\x{0E40}B")); |
485 | |
486 | ############## |
487 | |
809c7673 |
488 | my $undef_rearrange = Unicode::Collate->new( |
489 | table => undef, |
490 | normalization => undef, |
491 | rearrange => undef, |
492 | ); |
493 | |
494 | ok($undef_rearrange->lt("A", "B")); |
495 | ok($undef_rearrange->lt("\x{0E40}", "\x{0E41}")); |
496 | ok($undef_rearrange->lt("\x{0E40}A", "\x{0E41}B")); |
497 | ok($undef_rearrange->gt("\x{0E41}A", "\x{0E40}B")); |
498 | ok($undef_rearrange->gt("A\x{0E41}A", "A\x{0E40}B")); |
499 | |
500 | ############## |
501 | |
502 | my $dropArticles = Unicode::Collate->new( |
503 | table => "keys.txt", |
504 | normalization => undef, |
505 | preprocess => sub { |
506 | my $string = shift; |
507 | $string =~ s/\b(?:an?|the)\s+//ig; |
508 | $string; |
509 | }, |
510 | ); |
511 | |
512 | ok($dropArticles->eq("camel", "a camel")); |
513 | ok($dropArticles->eq("Perl", "The Perl")); |
514 | ok($dropArticles->lt("the pen", "a pencil")); |
515 | ok($Collator->lt("Perl", "The Perl")); |
516 | ok($Collator->gt("the pen", "a pencil")); |
517 | |
518 | ############## |
519 | |
520 | my $backLevel1 = Unicode::Collate->new( |
521 | table => undef, |
522 | normalization => undef, |
523 | backwards => [ 1 ], |
524 | ); |
525 | |
526 | # all strings are reversed at level 1. |
527 | |
528 | ok($backLevel1->gt("AB", "BA")); |
529 | ok($backLevel1->gt("\x{3042}\x{3044}", "\x{3044}\x{3042}")); |
530 | |
531 | ############## |
532 | |
533 | my $backLevel2 = Unicode::Collate->new( |
534 | table => "keys.txt", |
535 | normalization => undef, |
536 | undefName => qr/HANGUL|HIRAGANA|KATAKANA|BOPOMOFO/, |
537 | backwards => 2, |
538 | ); |
539 | |
540 | ok($backLevel2->gt("Ca\x{300}ca\x{302}", "ca\x{302}ca\x{300}")); |
541 | ok($backLevel2->gt("ca\x{300}ca\x{302}", "Ca\x{302}ca\x{300}")); |
542 | ok($Collator ->lt("Ca\x{300}ca\x{302}", "ca\x{302}ca\x{300}")); |
543 | ok($Collator ->lt("ca\x{300}ca\x{302}", "Ca\x{302}ca\x{300}")); |
544 | |
3164dd77 |
545 | # HIRAGANA and KATAKANA are made undefined via undefName. |
546 | # So they are after CJK Unified Ideographs. |
809c7673 |
547 | |
548 | ok($backLevel2->lt("\x{4E00}", $hiragana)); |
549 | ok($backLevel2->lt("\x{4E03}", $katakana)); |
550 | ok($Collator ->gt("\x{4E00}", $hiragana)); |
551 | ok($Collator ->gt("\x{4E03}", $katakana)); |
552 | |
553 | ############## |
caffd4cf |
554 | |
4d36a948 |
555 | # ignorable after variable |
caffd4cf |
556 | |
4d36a948 |
557 | # Shifted; |
caffd4cf |
558 | ok($Collator->eq("?\x{300}!\x{301}\x{315}", "?!")); |
4d36a948 |
559 | ok($Collator->eq("?\x{300}A\x{301}", "?$A_acute")); |
caffd4cf |
560 | ok($Collator->eq("?\x{300}", "?")); |
4d36a948 |
561 | ok($Collator->eq("?\x{344}", "?")); # U+0344 has two CEs. |
caffd4cf |
562 | |
4d36a948 |
563 | $Collator->change(level => 3); |
564 | ok($Collator->eq("\cA", "?")); |
565 | |
566 | $Collator->change(alternate => 'blanked', level => 4); |
567 | ok($Collator->eq("?\x{300}!\x{301}\x{315}", "?!")); |
568 | ok($Collator->eq("?\x{300}A\x{301}", "?$A_acute")); |
569 | ok($Collator->eq("?\x{300}", "?")); |
570 | ok($Collator->eq("?\x{344}", "?")); # U+0344 has two CEs. |
571 | |
572 | $Collator->change(level => 3); |
573 | ok($Collator->eq("\cA", "?")); |
caffd4cf |
574 | |
4d36a948 |
575 | $Collator->change(alternate => 'Non-ignorable', level => 4); |
576 | |
577 | ok($Collator->lt("?\x{300}", "?!")); |
578 | ok($Collator->gt("?\x{300}A$acute", "?$A_acute")); |
caffd4cf |
579 | ok($Collator->gt("?\x{300}", "?")); |
4d36a948 |
580 | ok($Collator->gt("?\x{344}", "?")); |
caffd4cf |
581 | |
4d36a948 |
582 | $Collator->change(level => 3); |
583 | ok($Collator->lt("\cA", "?")); |
584 | |
585 | $Collator->change(alternate => 'Shifted', level => 4); |
586 | |
587 | ############## |
588 | |
589 | # According to Conformance Test, |
590 | # a L3-ignorable is treated as a completely ignorable. |
591 | |
592 | my $L3ignorable = Unicode::Collate->new( |
593 | alternate => 'Non-ignorable', |
594 | table => undef, |
595 | normalization => undef, |
596 | entry => <<'ENTRIES', |
597 | 0000 ; [.0000.0000.0000.0000] # [0000] NULL (in 6429) |
598 | 0001 ; [.0000.0000.0000.0000] # [0001] START OF HEADING (in 6429) |
599 | 0591 ; [.0000.0000.0000.0591] # HEBREW ACCENT ETNAHTA |
600 | 1D165 ; [.0000.0000.0000.1D165] # MUSICAL SYMBOL COMBINING STEM |
601 | 0021 ; [*024B.0020.0002.0021] # EXCLAMATION MARK |
602 | 09BE ; [.114E.0020.0002.09BE] # BENGALI VOWEL SIGN AA |
603 | 09C7 ; [.1157.0020.0002.09C7] # BENGALI VOWEL SIGN E |
604 | 09CB ; [.1159.0020.0002.09CB] # BENGALI VOWEL SIGN O |
605 | 09C7 09BE ; [.1159.0020.0002.09CB] # BENGALI VOWEL SIGN O |
606 | ENTRIES |
607 | ); |
608 | |
609 | ok($L3ignorable->lt("\cA", "!")); |
610 | ok($L3ignorable->lt("\x{591}", "!")); |
611 | ok($L3ignorable->eq("\cA", "\x{591}")); |
612 | ok($L3ignorable->eq("\x{09C7}\x{09BE}A", "\x{09C7}\cA\x{09BE}A")); |
613 | ok($L3ignorable->eq("\x{09C7}\x{09BE}A", "\x{09C7}\x{0591}\x{09BE}A")); |
614 | ok($L3ignorable->eq("\x{09C7}\x{09BE}A", "\x{09C7}\x{1D165}\x{09BE}A")); |
615 | ok($L3ignorable->eq("\x{09C7}\x{09BE}A", "\x{09CB}A")); |