pass all tests when compiling with -DNO_PERL_PRESERVE_IVUV
[p5sagit/p5-mst-13.2.git] / lib / Unicode / Collate.pm
CommitLineData
45394607 1package Unicode::Collate;
2
3use 5.006;
4use strict;
5use warnings;
6use Carp;
7use Lingua::KO::Hangul::Util;
8require Exporter;
9
10our $VERSION = '0.07';
11our $PACKAGE = __PACKAGE__;
12
13our @ISA = qw(Exporter);
14
15our %EXPORT_TAGS = ();
16our @EXPORT_OK = ();
17our @EXPORT = ();
18
19(our $Path = $INC{'Unicode/Collate.pm'}) =~ s/\.pm$//;
20our $KeyFile = "allkeys.txt";
21
22our %Combin; # combining class from Unicode::Normalize
23
24use constant Min2 => 0x20; # minimum weight at level 2
25use constant Min3 => 0x02; # minimum weight at level 3
26use constant UNDEFINED => 0xFF80; # special value for undefined CE
27
28##
29## constructor
30##
31sub new
32{
33 my $class = shift;
34 my $self = bless { @_ }, $class;
35
36 # alternate
37 $self->{alternate} =
38 ! exists $self->{alternate} ? 'shifted' :
39 ! defined $self->{alternate} ? '' : $self->{alternate};
40
41 # collation level
42 $self->{level} ||= $self->{alternate} =~ /shift/ ? 4 : 3;
43
44 # normalization form
45 $self->{normalization} = 'D' if ! exists $self->{normalization};
46
47 eval "use Unicode::Normalize;" if defined $self->{normalization};
48
49 $self->{normalize} =
50 ! defined $self->{normalization} ? undef :
51 $self->{normalization} =~ /^(?:NF)?C$/ ? \&NFC :
52 $self->{normalization} =~ /^(?:NF)?D$/ ? \&NFD :
53 $self->{normalization} =~ /^(?:NF)?KC$/ ? \&NFKC :
54 $self->{normalization} =~ /^(?:NF)?KD$/ ? \&NFKD :
55 croak "$PACKAGE unknown normalization form name: $self->{normalization}";
56
57 *Combin = \%Unicode::Normalize::Combin if $self->{normalize} && ! %Combin;
58
59 # backwards
60 $self->{backwards} ||= [];
61 $self->{backwards} = [ $self->{backwards} ] if ! ref $self->{backwards};
62
63 # rearrange
64 $self->{rearrange} ||= []; # maybe not U+0000 (an ASCII)
65 $self->{rearrange} = [ $self->{rearrange} ] if ! ref $self->{rearrange};
66
67 # open the table file
68 my $file = defined $self->{table} ? $self->{table} : $KeyFile;
69 open my $fk, "<$Path/$file" or croak "File does not exist at $Path/$file";
70
71 while(<$fk>){
72 next if /^\s*#/;
73 if(/^\s*\@/){
74 if(/^\@version\s*(\S*)/){
75 $self->{version} ||= $1;
76 }
77 elsif(/^\@alternate\s+(.*)/){
78 $self->{alternate} ||= $1;
79 }
80 elsif(/^\@backwards\s+(.*)/){
81 push @{ $self->{backwards} }, $1;
82 }
83 elsif(/^\@rearrange\s+(.*)/){
84 push @{ $self->{rearrange} }, _getHexArray($1);
85 }
86 next;
87 }
88 $self->parseEntry($_);
89 }
90 close $fk;
91 if($self->{entry}){
92 $self->parseEntry($_) foreach split /\n/, $self->{entry};
93 }
94
95 # keys of $self->{rearrangeHash} are $self->{rearrange}.
96 $self->{rearrangeHash} = {};
97 @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
98
99 return $self;
100}
101
102##
103## get $line, parse it, and write an entry in $self
104##
105sub parseEntry
106{
107 my $self = shift;
108 my $line = shift;
109 my($name, $ele, @key);
110
111 return if $line !~ /^\s*[0-9A-Fa-f]/;
112
113 # get name
114 $name = $1 if $line =~ s/#\s*(.*)//;
115 return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
116
117 # get element
118 my($e, $k) = split /;/, $line;
119 my @e = _getHexArray($e);
120 $ele = pack('U*', @e);
121 return if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
122
123 # get sort key
124 if(
125 defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/ ||
126 defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/
127 )
128 {
129 $self->{ignored}{$ele} = 1;
130 $self->{entries}{$ele} = 1; # true
131 }
132 else
133 {
134 foreach my $arr ($k =~ /\[(\S+)\]/g) {
135 my $var = $arr =~ /\*/;
136 push @key, $self->getCE( $var, _getHexArray($arr) );
137 }
138 $self->{entries}{$ele} = \@key;
139 }
140 $self->{maxlength}{ord $ele} = scalar @e if @e > 1;
141}
142
143
144##
145## list to collation element
146##
147sub getCE
148{
149 my $self = shift;
150 my $var = shift;
151 my @c = @_;
152
153 $self->{alternate} eq 'blanked' ?
154 $var ? [0,0,0] : [ @c[0..2] ] :
155 $self->{alternate} eq 'non-ignorable' ? [ @c[0..2] ] :
156 $self->{alternate} eq 'shifted' ?
157 $var ? [0,0,0,$c[0] ] : [ @c[0..2], $c[0]+$c[1]+$c[2] ? 0xFFFF : 0 ] :
158 $self->{alternate} eq 'shift-trimmed' ?
159 $var ? [0,0,0,$c[0] ] : [ @c[0..2], 0 ] :
160 \@c;
161}
162
163##
164## to debug
165##
166sub viewSortKey
167{
168 my $self = shift;
169 my $key = $self->getSortKey(@_);
170 my $view = join " ", map sprintf("%04X", $_), unpack 'n*', $key;
171 $view =~ s/ ?0000 ?/|/g;
172 "[$view]";
173}
174
175##
176## sort key
177##
178sub getSortKey
179{
180 my $self = shift;
181 my $code = $self->{preprocess};
182 my $norm = $self->{normalize};
183 my $ent = $self->{entries};
184 my $ign = $self->{ignored};
185 my $max = $self->{maxlength};
186 my $lev = $self->{level};
187 my $cjk = $self->{overrideCJK};
188 my $hang = $self->{overrideHangul};
189 my $rear = $self->{rearrangeHash};
190
191 my $str = ref $code ? &$code(shift) : shift;
192 $str = &$norm($str) if ref $norm;
193
194 my(@src, @buf);
195 @src = unpack('U*', $str);
196
197 # rearrangement
198 for(my $i = 0; $i < @src; $i++)
199 {
200 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i])
201 if $rear->{ $src[$i] };
202 $i++;
203 }
204
205 for(my $i = 0; $i < @src; $i++)
206 {
207 my $ch;
208 my $u = $src[$i];
209
210 # non-characters
211 next if $u < 0 || 0x10FFFF < $u # out of range
212 || 0xD800 < $u && $u < 0xDFFF; # unpaired surrogates
213 my $four = $u & 0xFFFF;
214 next if $four == 0xFFFE || $four == 0xFFFF;
215
216 if($max->{$u}) # contract
217 {
218 for(my $j = $max->{$u}; $j >= 1; $j--)
219 {
220 next unless $i+$j-1 < @src;
221 $ch = pack 'U*', @src[$i .. $i+$j-1];
222 $i += $j-1, last if $ent->{$ch};
223 }
224 }
225 else { $ch = pack('U', $u) }
226
227 if(%Combin && defined $ch) # with Combining Char
228 {
229 for(my $j = $i+1; $j < @src && $Combin{ $src[$j] }; $j++)
230 {
231 my $comb = pack 'U', $src[$j];
232 next if ! $ent->{ $ch.$comb };
233 $ch .= $comb;
234 splice(@src, $j, 1);
235 last;
236 }
237 }
238
239 next if !defined $ch || $ign->{$ch}; # ignored
240
241 push @buf,
242 $ent->{$ch}
243 ? @{ $ent->{$ch} }
244 : _isHangul($u)
245 ? $hang
246 ? &$hang($u)
247 : map(@{ $ent->{pack('U', $_)} }, decomposeHangul($u))
248 : _isCJK($u)
249 ? $cjk ? &$cjk($u) : map($self->getCE(0,@$_), _CJK($u))
250 : map($self->getCE(0,@$_), _derivCE($u));
251 }
252
253 # make sort key
254 my @ret = ([],[],[],[]);
255 foreach my $v (0..$lev-1){
256 foreach my $b (@buf){
257 push @{ $ret[$v] }, $b->[$v] if $b->[$v];
258 }
259 }
260 foreach (@{ $self->{backwards} }){
261 my $v = $_ - 1;
262 @{ $ret[$v] } = reverse @{ $ret[$v] };
263 }
264
265 # modification of tertiary weights
266 if($self->{upper_before_lower}){
267 foreach (@{ $ret[2] }){
268 if (0x8 <= $_ && $_ <= 0xC){ $_ -= 6 } # lower
269 elsif(0x2 <= $_ && $_ <= 0x6){ $_ += 6 } # upper
270 elsif($_ == 0x1C) { $_ += 1 } # square upper
271 elsif($_ == 0x1D) { $_ -= 1 } # square lower
272 }
273 }
274 if($self->{katakana_before_hiragana}){
275 foreach (@{ $ret[2] }){
276 if (0x0F <= $_ && $_ <= 0x13){ $_ -= 2 } # katakana
277 elsif(0x0D <= $_ && $_ <= 0x0E){ $_ += 5 } # hiragana
278 }
279 }
280 join "\0\0", map pack('n*', @$_), @ret;
281}
282
283
284##
285## cmp
286##
287sub cmp
288{
289 my $obj = shift;
290 my $a = shift;
291 my $b = shift;
292 $obj->getSortKey($a) cmp $obj->getSortKey($b);
293}
294
295##
296## sort
297##
298sub sort
299{
300 my $obj = shift;
301
302 map { $_->[1] }
303 sort{ $a->[0] cmp $b->[0] }
304 map [ $obj->getSortKey($_), $_ ], @_;
305}
306
307##
308## Derived CE
309##
310sub _derivCE
311{
312 my $code = shift;
313 my $a = UNDEFINED + ($code >> 15); # ok
314 my $b = ($code & 0x7FFF) | 0x8000; # ok
315# my $a = 0xFFC2 + ($code >> 15); # ng
316# my $b = $code & 0x7FFF | 0x1000; # ng
317 $b ? ([$a,2,1,$code],[$b,0,0,$code]) : [$a,2,1,$code];
318}
319
320##
321## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
322##
323sub _getHexArray
324{
325 my $str = shift;
326 map hex(), $str =~ /([0-9a-fA-F]+)/g;
327}
328
329##
330## CJK Unified Ideographs
331##
332sub _isCJK
333{
334 my $u = shift;
335 return 0x3400 <= $u && $u <= 0x4DB5
336 || 0x4E00 <= $u && $u <= 0x9FA5
337# || 0x20000 <= $u && $u <= 0x2A6D6;
338}
339
340##
341## CJK Unified Ideographs
342##
343sub _CJK
344{
345 my $u = shift;
346 $u > 0xFFFF ? _derivCE($u) : [$u,0x20,0x02,$u];
347}
348
349##
350## Hangul Syllables
351##
352sub _isHangul
353{
354 my $code = shift;
355 return 0xAC00 <= $code && $code <= 0xD7A3;
356}
357
3581;
359__END__
360
361=head1 NAME
362
363Unicode::Collate - use UCA (Unicode Collation Algorithm)
364
365=head1 SYNOPSIS
366
367 use Unicode::Collate;
368
369 #construct
370 $UCA = Unicode::Collate->new(%tailoring);
371
372 #sort
373 @sorted = $UCA->sort(@not_sorted);
374
375 #compare
376 $result = $UCA->cmp($a, $b); # returns 1, 0, or -1.
377
378=head1 DESCRIPTION
379
380=head2 Constructor and Tailoring
381
382 $UCA = Unicode::Collate->new(
383 alternate => $alternate,
384 backwards => $levelNumber, # or \@levelNumbers
385 entry => $element,
386 normalization => $normalization_form,
387 ignoreName => qr/$ignoreName/,
388 ignoreChar => qr/$ignoreChar/,
389 katakana_before_hiragana => $bool,
390 level => $collationLevel,
391 overrideCJK => \&overrideCJK,
392 overrideHangul => \&overrideHangul,
393 preprocess => \&preprocess,
394 rearrange => \@charList,
395 table => $filename,
396 undefName => qr/$undefName/,
397 undefChar => qr/$undefChar/,
398 upper_before_lower => $bool,
399 );
400 # if %tailoring is false (empty),
401 # $UCA should do the default collation.
402
403=over 4
404
405=item alternate
406
407-- see 3.2.2 Alternate Weighting, UTR #10.
408
409 alternate => 'shifted', 'blanked', 'non-ignorable', or 'shift-trimmed'.
410
411By default (if specification is omitted), 'shifted' is adopted.
412
413=item backwards
414
415-- see 3.1.2 French Accents, UTR #10.
416
417 backwards => $levelNumber or \@levelNumbers
418
419Weights in reverse order; ex. level 2 (diacritic ordering) in French.
420If omitted, forwards at all the levels.
421
422=item entry
423
424-- see 3.1 Linguistic Features; 3.2.1 File Format, UTR #10.
425
426Overrides a default order or adds a new element
427
428 entry => <<'ENTRIES', # use the UCA file format
42900E6 ; [.0861.0020.0002.00E6] [.08B1.0020.0002.00E6] # ligature <ae> as <a e>
4300063 0068 ; [.0893.0020.0002.0063] # "ch" in traditional Spanish
4310043 0068 ; [.0893.0020.0008.0043] # "Ch" in traditional Spanish
432ENTRIES
433
434=item ignoreName
435
436=item ignoreChar
437
438-- see Completely Ignorable, 3.2.2 Alternate Weighting, UTR #10.
439
440Ignores the entry in the table.
441If an ignored collation element appears in the string to be collated,
442it is ignored as if the element had been deleted from there.
443
444E.g. when 'a' and 'e' are ignored,
445'element' is equal to 'lament' (or 'lmnt').
446
447=item level
448
449-- see 4.3 Form a sort key for each string, UTR #10.
450
451Set the maximum level.
452Any higher levels than the specified one are ignored.
453
454 Level 1: alphabetic ordering
455 Level 2: diacritic ordering
456 Level 3: case ordering
457 Level 4: tie-breaking (e.g. in the case when alternate is 'shifted')
458
459 ex.level => 2,
460
461=item normalization
462
463-- see 4.1 Normalize each input string, UTR #10.
464
465If specified, strings are normalized before preparation sort keys
466(the normalization is executed after preprocess).
467
468As a form name, one of the following names must be used.
469
470 'C' or 'NFC' for Normalization Form C
471 'D' or 'NFD' for Normalization Form D
472 'KC' or 'NFKC' for Normalization Form KC
473 'KD' or 'NFKD' for Normalization Form KD
474
475If omitted, the string is put into Normalization Form D.
476
477If undefined explicitly (as C<normalization =E<gt> undef>),
478any normalization is not carried out (this may make tailoring easier
479if any normalization is not desired).
480
481see B<CAVEAT>.
482
483=item overrideCJK
484
485=item overrideHangul
486
487-- see 7.1 Derived Collation Elements, UTR #10.
488
489By default, mapping of CJK Unified Ideographs
490uses the Unicode codepoint order
491and Hangul Syllables are decomposed into Hangul Jamo.
492
493The mapping of CJK Unified Ideographs
494or Hangul Syllables may be overrided.
495
496ex. CJK Unified Ideographs in the JIS codepoint order.
497
498 overrideCJK => sub {
499 my $u = shift; # get unicode codepoint
500 my $b = pack('n', $u); # to UTF-16BE
501 my $s = your_unicode_to_sjis_converter($b); # convert
502 my $n = unpack('n', $s); # convert sjis to short
503 [ $n, 1, 1 ]; # return collation element
504 },
505
506If you want to override the mapping of Hangul Syllables,
507the Normalization Forms D and KD are not appropriate
508(they will be decomposed before overriding).
509
510=item preprocess
511
512-- see 5.1 Preprocessing, UTR #10.
513
514If specified, the coderef is used to preprocess
515before the formation of sort keys.
516
517ex. dropping English articles, such as "a" or "the".
518Then, "the pen" is before "a pencil".
519
520 preprocess => sub {
521 my $str = shift;
522 $str =~ s/\b(?:an?|the)\s+//g;
523 $str;
524 },
525
526=item rearrange
527
528-- see 3.1.3 Rearrangement, UTR #10.
529
530Characters that are not coded in logical order and to be rearranged.
531By default,
532
533 rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
534
535=item table
536
537-- see 3.2 Default Unicode Collation Element Table, UTR #10.
538
539You can use another element table if desired.
540The table file must be in your C<lib/Unicode/Collate> directory.
541
542By default, the file C<lib/Unicode/Collate/allkeys.txt> is used.
543
544=item undefName
545
546=item undefChar
547
548-- see 6.3.4 Reducing the Repertoire, UTR #10.
549
550Undefines the collation element as if it were unassigned in the table.
551This reduces the size of the table.
552If an unassigned character appears in the string to be collated,
553the sort key is made from its codepoint
554as a single-character collation element,
555as it is greater than any other assigned collation elements
556(in the codepoint order among the unassigned characters).
557But, it'd be better to ignore characters
558unfamiliar to you and maybe never used.
559
560=item katakana_before_hiragana
561
562=item upper_before_lower
563
564-- see 6.6 Case Comparisons; 7.3.1 Tertiary Weight Table, UTR #10.
565
566By default, lowercase is before uppercase
567and hiragana is before katakana.
568
569If the parameter is true, this is reversed.
570
571=back
572
573=head2 Other methods
574
575=over 4
576
577=item C<@sorted = $UCA-E<gt>sort(@not_sorted)>
578
579Sorts a list of strings.
580
581=item C<$result = $UCA-E<gt>cmp($a, $b)>
582
583Returns 1 (when C<$a> is greater than C<$b>)
584or 0 (when C<$a> is equal to C<$b>)
585or -1 (when C<$a> is lesser than C<$b>).
586
587=item C<$sortKey = $UCA-E<gt>getSortKey($string)>
588
589-- see 4.3 Form a sort key for each string, UTR #10.
590
591Returns a sort key.
592
593You compare the sort keys using a binary comparison
594and get the result of the comparison of the strings using UCA.
595
596 $UCA->getSortKey($a) cmp $UCA->getSortKey($b)
597
598 is equivalent to
599
600 $UCA->cmp($a, $b)
601
602=back
603
604=head2 EXPORT
605
606None by default.
607
608=head2 CAVEAT
609
610Use of the C<normalization> parameter requires
611the B<Unicode::Normalize> module.
612
613If you need not it (e.g. in the case when you need not
614handle any combining characters),
615assign C<normalization =E<gt> undef> explicitly.
616
617=head1 AUTHOR
618
619SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
620
621 http://homepage1.nifty.com/nomenclator/perl/
622
623 Copyright(C) 2001, SADAHIRO Tomoyuki. Japan. All rights reserved.
624
625 This program is free software; you can redistribute it and/or
626 modify it under the same terms as Perl itself.
627
628=head1 SEE ALSO
629
630=over 4
631
632=item L<Lingua::KO::Hangul::Util>
633
634utility functions for Hangul Syllables
635
636=item L<Unicode::Normalize>
637
638normalized forms of Unicode text
639
640=item Unicode Collation Algorithm - Unicode TR #10
641
642http://www.unicode.org/unicode/reports/tr10/
643
644=back
645
646=cut