Commit | Line | Data |
45394607 |
1 | |
4a2e806c |
2 | BEGIN { |
ae6aa562 |
3 | unless ("A" eq pack('U', 0x41)) { |
9f1f04a1 |
4 | print "1..0 # Unicode::Collate " . |
5 | "cannot stringify a Unicode code point\n"; |
4a2e806c |
6 | exit 0; |
7 | } |
0116f5dc |
8 | if ($ENV{PERL_CORE}) { |
3756e7ca |
9 | chdir('t') if -d 't'; |
10 | @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); |
0116f5dc |
11 | } |
12 | } |
13 | |
45394607 |
14 | use Test; |
6d24ed10 |
15 | BEGIN { plan tests => 113 }; |
4c843366 |
16 | |
17 | use strict; |
18 | use warnings; |
45394607 |
19 | use Unicode::Collate; |
45394607 |
20 | |
91ae00cb |
21 | ok(1); |
45394607 |
22 | |
3756e7ca |
23 | sub _pack_U { Unicode::Collate::pack_U(@_) } |
24 | sub _unpack_U { Unicode::Collate::unpack_U(@_) } |
25 | |
26 | my $A_acute = _pack_U(0xC1); |
27 | my $a_acute = _pack_U(0xE1); |
28 | my $acute = _pack_U(0x0301); |
29 | |
30 | my $hiragana = "\x{3042}\x{3044}"; |
31 | my $katakana = "\x{30A2}\x{30A4}"; |
32 | |
33 | ##### 2..7 |
3164dd77 |
34 | |
5398038e |
35 | my $Collator = Unicode::Collate->new( |
45394607 |
36 | table => 'keys.txt', |
37 | normalization => undef, |
38 | ); |
39 | |
5398038e |
40 | ok(ref $Collator, "Unicode::Collate"); |
45394607 |
41 | |
0116f5dc |
42 | ok($Collator->cmp("", ""), 0); |
43 | ok($Collator->eq("", "")); |
44 | ok($Collator->cmp("", "perl"), -1); |
45 | |
3756e7ca |
46 | ok( |
47 | join(':', $Collator->sort( qw/ acha aca ada acia acka / ) ), |
48 | join(':', qw/ aca acha acia acka ada / ), |
49 | ); |
0116f5dc |
50 | |
3756e7ca |
51 | ok( |
52 | join(':', $Collator->sort( qw/ ACHA ACA ADA ACIA ACKA / ) ), |
53 | join(':', qw/ ACA ACHA ACIA ACKA ADA / ), |
54 | ); |
9f1f04a1 |
55 | |
3756e7ca |
56 | ##### 8..18 |
45394607 |
57 | |
caffd4cf |
58 | ok($Collator->cmp("A$acute", $A_acute), 0); # @version 3.1.1 (prev: -1) |
0116f5dc |
59 | ok($Collator->cmp($a_acute, $A_acute), -1); |
4d36a948 |
60 | ok($Collator->eq("A\cA$acute", $A_acute)); # UCA v9. \cA is invariant. |
0116f5dc |
61 | |
62 | my %old_level = $Collator->change(level => 1); |
63 | ok($Collator->eq("A$acute", $A_acute)); |
64 | ok($Collator->eq("A", $A_acute)); |
65 | |
66 | ok($Collator->change(level => 2)->eq($a_acute, $A_acute)); |
67 | ok($Collator->lt("A", $A_acute)); |
68 | |
69 | ok($Collator->change(%old_level)->lt("A", $A_acute)); |
70 | ok($Collator->lt("A", $A_acute)); |
71 | ok($Collator->lt("A", $a_acute)); |
72 | ok($Collator->lt($a_acute, $A_acute)); |
45394607 |
73 | |
3756e7ca |
74 | ##### 19..25 |
809c7673 |
75 | |
0116f5dc |
76 | $Collator->change(level => 2); |
809c7673 |
77 | |
0116f5dc |
78 | ok($Collator->{level}, 2); |
5398038e |
79 | |
80 | ok( $Collator->cmp("ABC","abc"), 0); |
81 | ok( $Collator->eq("ABC","abc") ); |
82 | ok( $Collator->le("ABC","abc") ); |
83 | ok( $Collator->cmp($hiragana, $katakana), 0); |
84 | ok( $Collator->eq($hiragana, $katakana) ); |
85 | ok( $Collator->ge($hiragana, $katakana) ); |
45394607 |
86 | |
3756e7ca |
87 | ##### 26..31 |
91ae00cb |
88 | |
5398038e |
89 | # hangul |
90 | ok( $Collator->eq("a\x{AC00}b", "a\x{1100}\x{1161}b") ); |
91 | ok( $Collator->eq("a\x{AE00}b", "a\x{1100}\x{1173}\x{11AF}b") ); |
92 | ok( $Collator->gt("a\x{AE00}b", "a\x{1100}\x{1173}b\x{11AF}") ); |
93 | ok( $Collator->lt("a\x{AC00}b", "a\x{AE00}b") ); |
94 | ok( $Collator->gt("a\x{D7A3}b", "a\x{C544}b") ); |
95 | ok( $Collator->lt("a\x{C544}b", "a\x{30A2}b") ); # hangul < hiragana |
45394607 |
96 | |
3756e7ca |
97 | ##### 32..40 |
91ae00cb |
98 | |
0116f5dc |
99 | $Collator->change(%old_level, katakana_before_hiragana => 1); |
45394607 |
100 | |
0116f5dc |
101 | ok($Collator->{level}, 4); |
45394607 |
102 | |
5398038e |
103 | ok( $Collator->cmp("abc", "ABC"), -1); |
104 | ok( $Collator->ne("abc", "ABC") ); |
105 | ok( $Collator->lt("abc", "ABC") ); |
106 | ok( $Collator->le("abc", "ABC") ); |
107 | ok( $Collator->cmp($hiragana, $katakana), 1); |
108 | ok( $Collator->ne($hiragana, $katakana) ); |
109 | ok( $Collator->gt($hiragana, $katakana) ); |
110 | ok( $Collator->ge($hiragana, $katakana) ); |
45394607 |
111 | |
3756e7ca |
112 | ##### 41..46 |
91ae00cb |
113 | |
0116f5dc |
114 | $Collator->change(upper_before_lower => 1); |
45394607 |
115 | |
5398038e |
116 | ok( $Collator->cmp("abc", "ABC"), 1); |
117 | ok( $Collator->ge("abc", "ABC"), 1); |
118 | ok( $Collator->gt("abc", "ABC"), 1); |
119 | ok( $Collator->cmp($hiragana, $katakana), 1); |
120 | ok( $Collator->ge($hiragana, $katakana), 1); |
121 | ok( $Collator->gt($hiragana, $katakana), 1); |
45394607 |
122 | |
3756e7ca |
123 | ##### 47..48 |
91ae00cb |
124 | |
0116f5dc |
125 | $Collator->change(katakana_before_hiragana => 0); |
45394607 |
126 | |
5398038e |
127 | ok( $Collator->cmp("abc", "ABC"), 1); |
128 | ok( $Collator->cmp($hiragana, $katakana), -1); |
45394607 |
129 | |
3756e7ca |
130 | ##### 49..52 |
131 | |
0116f5dc |
132 | $Collator->change(upper_before_lower => 0); |
45394607 |
133 | |
5398038e |
134 | ok( $Collator->cmp("abc", "ABC"), -1); |
135 | ok( $Collator->le("abc", "ABC") ); |
136 | ok( $Collator->cmp($hiragana, $katakana), -1); |
137 | ok( $Collator->lt($hiragana, $katakana) ); |
45394607 |
138 | |
3756e7ca |
139 | ##### 53..54 |
809c7673 |
140 | |
141 | my $ignoreAE = Unicode::Collate->new( |
142 | table => 'keys.txt', |
143 | normalization => undef, |
144 | ignoreChar => qr/^[aAeE]$/, |
145 | ); |
146 | |
147 | ok($ignoreAE->eq("element","lament")); |
148 | ok($ignoreAE->eq("Perl","ePrl")); |
149 | |
3756e7ca |
150 | ##### 55 |
809c7673 |
151 | |
152 | my $onlyABC = Unicode::Collate->new( |
153 | table => undef, |
327745dc |
154 | normalization => undef, |
809c7673 |
155 | entry => << 'ENTRIES', |
156 | 0061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A |
157 | 0041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A |
158 | 0062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B |
159 | 0042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B |
160 | 0063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C |
161 | 0043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C |
162 | ENTRIES |
163 | ); |
164 | |
165 | ok( |
166 | join(':', $onlyABC->sort( qw/ ABA BAC cc A Ab cAc aB / ) ), |
167 | join(':', qw/ A aB Ab ABA BAC cAc cc / ), |
168 | ); |
169 | |
3756e7ca |
170 | ##### 56..59 |
809c7673 |
171 | |
172 | my $undefAE = Unicode::Collate->new( |
45394607 |
173 | table => 'keys.txt', |
174 | normalization => undef, |
809c7673 |
175 | undefChar => qr/^[aAeE]$/, |
45394607 |
176 | ); |
177 | |
809c7673 |
178 | ok($undefAE ->gt("edge","fog")); |
179 | ok($Collator->lt("edge","fog")); |
180 | ok($undefAE ->gt("lake","like")); |
181 | ok($Collator->lt("lake","like")); |
182 | |
3756e7ca |
183 | ##### 60..69 |
45394607 |
184 | |
809c7673 |
185 | # Table is undefined, then no entry is defined. |
186 | |
187 | my $undef_table = Unicode::Collate->new( |
188 | table => undef, |
189 | normalization => undef, |
190 | level => 1, |
191 | ); |
192 | |
193 | # in the Unicode code point order |
194 | ok($undef_table->lt('', 'A')); |
195 | ok($undef_table->lt('ABC', 'B')); |
196 | |
197 | # Hangul should be decomposed (even w/o Unicode::Normalize). |
809c7673 |
198 | ok($undef_table->lt("Perl", "\x{AC00}")); |
199 | ok($undef_table->eq("\x{AC00}", "\x{1100}\x{1161}")); |
200 | ok($undef_table->eq("\x{AE00}", "\x{1100}\x{1173}\x{11AF}")); |
201 | ok($undef_table->lt("\x{AE00}", "\x{3042}")); |
202 | # U+AC00: Hangul GA |
203 | # U+AE00: Hangul GEUL |
204 | # U+3042: Hiragana A |
205 | |
206 | # Weight for CJK Ideographs is defined, though. |
809c7673 |
207 | ok($undef_table->lt("", "\x{4E00}")); |
208 | ok($undef_table->lt("\x{4E8C}","ABC")); |
209 | ok($undef_table->lt("\x{4E00}","\x{3042}")); |
210 | ok($undef_table->lt("\x{4E00}","\x{4E8C}")); |
211 | # U+4E00: Ideograph "ONE" |
212 | # U+4E8C: Ideograph "TWO" |
213 | |
214 | |
3756e7ca |
215 | ##### 70..74 |
809c7673 |
216 | |
217 | my $few_entries = Unicode::Collate->new( |
218 | entry => <<'ENTRIES', |
219 | 0050 ; [.0101.0020.0002.0050] # P |
220 | 0045 ; [.0102.0020.0002.0045] # E |
221 | 0052 ; [.0103.0020.0002.0052] # R |
222 | 004C ; [.0104.0020.0002.004C] # L |
223 | 1100 ; [.0105.0020.0002.1100] # Hangul Jamo initial G |
224 | 1175 ; [.0106.0020.0002.1175] # Hangul Jamo middle I |
225 | 5B57 ; [.0107.0020.0002.5B57] # CJK Ideograph "Letter" |
226 | ENTRIES |
227 | table => undef, |
228 | normalization => undef, |
229 | ); |
230 | |
231 | # defined before undefined |
232 | |
233 | my $sortABC = join '', |
234 | $few_entries->sort(split //, "ABCDEFGHIJKLMNOPQRSTUVWXYZ "); |
235 | |
236 | ok($sortABC eq "PERL ABCDFGHIJKMNOQSTUVWXYZ"); |
237 | |
238 | ok($few_entries->lt('E', 'D')); |
239 | ok($few_entries->lt("\x{5B57}", "\x{4E00}")); |
240 | ok($few_entries->lt("\x{AE30}", "\x{AC00}")); |
241 | |
242 | # Hangul must be decomposed. |
243 | |
244 | ok($few_entries->eq("\x{AC00}", "\x{1100}\x{1161}")); |
245 | |
3756e7ca |
246 | ##### 75..79 |
809c7673 |
247 | |
248 | my $dropArticles = Unicode::Collate->new( |
249 | table => "keys.txt", |
250 | normalization => undef, |
251 | preprocess => sub { |
252 | my $string = shift; |
253 | $string =~ s/\b(?:an?|the)\s+//ig; |
254 | $string; |
255 | }, |
256 | ); |
257 | |
258 | ok($dropArticles->eq("camel", "a camel")); |
259 | ok($dropArticles->eq("Perl", "The Perl")); |
260 | ok($dropArticles->lt("the pen", "a pencil")); |
261 | ok($Collator->lt("Perl", "The Perl")); |
262 | ok($Collator->gt("the pen", "a pencil")); |
263 | |
3756e7ca |
264 | ##### 80..81 |
809c7673 |
265 | |
266 | my $backLevel1 = Unicode::Collate->new( |
267 | table => undef, |
268 | normalization => undef, |
269 | backwards => [ 1 ], |
270 | ); |
271 | |
272 | # all strings are reversed at level 1. |
273 | |
274 | ok($backLevel1->gt("AB", "BA")); |
275 | ok($backLevel1->gt("\x{3042}\x{3044}", "\x{3044}\x{3042}")); |
276 | |
3756e7ca |
277 | ##### 82..89 |
809c7673 |
278 | |
279 | my $backLevel2 = Unicode::Collate->new( |
280 | table => "keys.txt", |
281 | normalization => undef, |
282 | undefName => qr/HANGUL|HIRAGANA|KATAKANA|BOPOMOFO/, |
283 | backwards => 2, |
284 | ); |
285 | |
286 | ok($backLevel2->gt("Ca\x{300}ca\x{302}", "ca\x{302}ca\x{300}")); |
287 | ok($backLevel2->gt("ca\x{300}ca\x{302}", "Ca\x{302}ca\x{300}")); |
288 | ok($Collator ->lt("Ca\x{300}ca\x{302}", "ca\x{302}ca\x{300}")); |
289 | ok($Collator ->lt("ca\x{300}ca\x{302}", "Ca\x{302}ca\x{300}")); |
290 | |
3164dd77 |
291 | # HIRAGANA and KATAKANA are made undefined via undefName. |
292 | # So they are after CJK Unified Ideographs. |
809c7673 |
293 | |
294 | ok($backLevel2->lt("\x{4E00}", $hiragana)); |
295 | ok($backLevel2->lt("\x{4E03}", $katakana)); |
296 | ok($Collator ->gt("\x{4E00}", $hiragana)); |
297 | ok($Collator ->gt("\x{4E03}", $katakana)); |
298 | |
4d36a948 |
299 | |
3756e7ca |
300 | ##### 90..96 |
91ae00cb |
301 | |
302 | my $O_str = Unicode::Collate->new( |
303 | table => "keys.txt", |
304 | normalization => undef, |
305 | entry => <<'ENTRIES', |
306 | 0008 ; [*0008.0000.0000.0000] # BACKSPACE (need to be non-ignorable) |
307 | 004F 0337 ; [.0B53.0020.0008.004F] # capital O WITH SHORT SOLIDUS OVERLAY |
308 | 006F 0008 002F ; [.0B53.0020.0002.006F] # LATIN SMALL LETTER O WITH STROKE |
309 | 004F 0008 002F ; [.0B53.0020.0008.004F] # LATIN CAPITAL LETTER O WITH STROKE |
310 | 006F 0337 ; [.0B53.0020.0002.004F] # small O WITH SHORT SOLIDUS OVERLAY |
311 | 200B ; [.2000.0000.0000.0000] # ZERO WIDTH SPACE (may be non-sense but ...) |
312 | #00F8 ; [.0B53.0020.0002.00F8] # LATIN SMALL LETTER O WITH STROKE |
313 | #00D8 ; [.0B53.0020.0008.00D8] # LATIN CAPITAL LETTER O WITH STROKE |
314 | ENTRIES |
315 | ); |
316 | |
317 | my $o_BS_slash = _pack_U(0x006F, 0x0008, 0x002F); |
318 | my $O_BS_slash = _pack_U(0x004F, 0x0008, 0x002F); |
319 | my $o_sol = _pack_U(0x006F, 0x0337); |
320 | my $O_sol = _pack_U(0x004F, 0x0337); |
321 | my $o_stroke = _pack_U(0x00F8); |
322 | my $O_stroke = _pack_U(0x00D8); |
323 | |
324 | ok($O_str->eq($o_stroke, $o_BS_slash)); |
325 | ok($O_str->eq($O_stroke, $O_BS_slash)); |
326 | |
327 | ok($O_str->eq($o_stroke, $o_sol)); |
328 | ok($O_str->eq($O_stroke, $O_sol)); |
329 | |
330 | ok($Collator->eq("\x{200B}", "\0")); |
331 | ok($O_str ->gt("\x{200B}", "\0")); |
332 | ok($O_str ->gt("\x{200B}", "A")); |
333 | |
3756e7ca |
334 | ##### 97..107 |
abd1ec54 |
335 | |
336 | my %origVer = $Collator->change(UCA_Version => 8); |
337 | |
338 | $Collator->change(level => 3); |
339 | |
340 | ok($Collator->gt("!\x{300}", "")); |
341 | ok($Collator->gt("!\x{300}", "!")); |
342 | ok($Collator->eq("!\x{300}", "\x{300}")); |
343 | |
344 | $Collator->change(level => 2); |
345 | |
346 | ok($Collator->eq("!\x{300}", "\x{300}")); |
347 | |
348 | $Collator->change(level => 4); |
349 | |
350 | ok($Collator->gt("!\x{300}", "!")); |
351 | ok($Collator->lt("!\x{300}", "\x{300}")); |
352 | |
353 | $Collator->change(%origVer, level => 3); |
354 | |
355 | ok($Collator->eq("!\x{300}", "")); |
356 | ok($Collator->eq("!\x{300}", "!")); |
357 | ok($Collator->lt("!\x{300}", "\x{300}")); |
358 | |
359 | $Collator->change(level => 4); |
360 | |
361 | ok($Collator->gt("!\x{300}", "")); |
362 | ok($Collator->eq("!\x{300}", "!")); |
363 | |
6d24ed10 |
364 | ##### 108..113 |
365 | |
366 | $_ = 'Foo'; |
367 | |
368 | my $c = Unicode::Collate->new( |
369 | table => 'keys.txt', |
370 | normalization => undef, |
371 | upper_before_lower => 1, |
372 | ); |
373 | |
374 | ok($_, 'Foo'); # fixed at v. 0.52; no longer clobber $_ |
375 | |
376 | my($temp, @temp); # Not the result but the side effect matters. |
377 | |
378 | $_ = 'Foo'; |
379 | $temp = $c->getSortKey("abc"); |
380 | ok($_, 'Foo'); |
381 | |
382 | $_ = 'Foo'; |
383 | $temp = $c->viewSortKey("abc"); |
384 | ok($_, 'Foo'); |
385 | |
386 | $_ = 'Foo'; |
387 | @temp = $c->sort("abc", "xyz", "def"); |
388 | ok($_, 'Foo'); |
389 | |
390 | $_ = 'Foo'; |
391 | @temp = $c->index("perl5", "RL"); |
392 | ok($_, 'Foo'); |
393 | |
394 | $_ = 'Foo'; |
395 | @temp = $c->index("perl5", "LR"); |
396 | ok($_, 'Foo'); |
397 | |
91ae00cb |
398 | ##### |
abd1ec54 |
399 | |