1 package Unicode::Collate;
10 our $VERSION = '0.09';
11 our $PACKAGE = __PACKAGE__;
13 our @ISA = qw(Exporter);
15 our %EXPORT_TAGS = ();
19 (our $Path = $INC{'Unicode/Collate.pm'}) =~ s/\.pm$//;
20 our $KeyFile = "allkeys.txt";
22 our $getCombinClass; # coderef for combining class from Unicode::Normalize
24 use constant Min2 => 0x20; # minimum weight at level 2
25 use constant Min3 => 0x02; # minimum weight at level 3
26 use constant UNDEFINED => 0xFF80; # special value for undefined CE
34 my $self = bless { @_ }, $class;
38 ! exists $self->{alternate} ? 'shifted' :
39 ! defined $self->{alternate} ? '' : $self->{alternate};
42 $self->{level} ||= ($self->{alternate} =~ /^shift/ ? 4 : 3);
45 $self->{normalization} = 'D' if ! exists $self->{normalization};
47 if(defined $self->{normalization}){
48 eval "use Unicode::Normalize;";
49 croak "you'd install Unicode::Normalize for normalization forms: $@"
51 $getCombinClass = \&Unicode::Normalize::getCombinClass
56 ! defined $self->{normalization} ? undef :
57 $self->{normalization} =~ /^(?:NF)?C$/ ? \&NFC :
58 $self->{normalization} =~ /^(?:NF)?D$/ ? \&NFD :
59 $self->{normalization} =~ /^(?:NF)?KC$/ ? \&NFKC :
60 $self->{normalization} =~ /^(?:NF)?KD$/ ? \&NFKD :
61 croak "$PACKAGE unknown normalization form name: $self->{normalization}";
64 $self->{backwards} ||= [];
65 $self->{backwards} = [ $self->{backwards} ] if ! ref $self->{backwards};
68 $self->{rearrange} ||= []; # maybe not U+0000 (an ASCII)
69 $self->{rearrange} = [ $self->{rearrange} ] if ! ref $self->{rearrange};
72 my $file = defined $self->{table} ? $self->{table} : $KeyFile;
73 open my $fk, "<$Path/$file" or croak "File does not exist at $Path/$file";
78 if(/^\@version\s*(\S*)/){
79 $self->{version} ||= $1;
81 elsif(/^\@alternate\s+(.*)/){
82 $self->{alternate} ||= $1;
84 elsif(/^\@backwards\s+(.*)/){
85 push @{ $self->{backwards} }, $1;
87 elsif(/^\@rearrange\s+(.*)/){
88 push @{ $self->{rearrange} }, _getHexArray($1);
92 $self->parseEntry($_);
96 $self->parseEntry($_) foreach split /\n/, $self->{entry};
99 # keys of $self->{rearrangeHash} are $self->{rearrange}.
100 $self->{rearrangeHash} = {};
101 @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
107 ## get $line, parse it, and write an entry in $self
113 my($name, $ele, @key);
115 return if $line !~ /^\s*[0-9A-Fa-f]/;
118 $name = $1 if $line =~ s/#\s*(.*)//;
119 return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
122 my($e, $k) = split /;/, $line;
123 my @e = _getHexArray($e);
124 $ele = pack('U*', @e);
125 return if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
129 defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/ ||
130 defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/
133 $self->{entries}{$ele} = $self->{ignored}{$ele} = 1;
137 foreach my $arr ($k =~ /\[(\S+)\]/g) {
138 my $var = $arr =~ /\*/;
139 push @key, $self->altCE( $var, _getHexArray($arr) );
141 $self->{entries}{$ele} = \@key;
143 $self->{maxlength}{ord $ele} = scalar @e if @e > 1;
148 ## arrayref CE = altCE(bool variable?, list[num] weights)
156 $self->{alternate} eq 'blanked' ?
157 $var ? [0,0,0] : [ @c[0..2] ] :
158 $self->{alternate} eq 'non-ignorable' ?
160 $self->{alternate} eq 'shifted' ?
161 $var ? [0,0,0,$c[0] ] : [ @c[0..2], $c[0]+$c[1]+$c[2] ? 0xFFFF : 0 ] :
162 $self->{alternate} eq 'shift-trimmed' ?
163 $var ? [0,0,0,$c[0] ] : [ @c[0..2], 0 ] :
168 ## string hex_sortkey = splitCE(string arg)
173 my $key = $self->getSortKey(@_);
174 my $view = join " ", map sprintf("%04X", $_), unpack 'n*', $key;
175 $view =~ s/ ?0000 ?/|/g;
181 ## list[strings] elements = splitCE(string arg)
186 my $code = $self->{preprocess};
187 my $norm = $self->{UNF};
188 my $ent = $self->{entries};
189 my $max = $self->{maxlength};
190 my $rear = $self->{rearrangeHash};
192 my $str = ref $code ? &$code(shift) : shift;
193 $str = &$norm($str) if ref $norm;
196 @src = unpack('U*', $str);
199 for(my $i = 0; $i < @src; $i++)
201 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i])
202 if $rear->{ $src[$i] };
206 for(my $i = 0; $i < @src; $i++)
212 next if $u < 0 || 0x10FFFF < $u # out of range
213 || 0xD800 < $u && $u < 0xDFFF; # unpaired surrogates
214 my $four = $u & 0xFFFF;
215 next if $four == 0xFFFE || $four == 0xFFFF;
217 if($max->{$u}) # contract
219 for(my $j = $max->{$u}; $j >= 1; $j--)
221 next unless $i+$j-1 < @src;
222 $ch = pack 'U*', @src[$i .. $i+$j-1];
223 $i += $j-1, last if $ent->{$ch};
226 else { $ch = pack('U', $u) }
228 # with Combining Char (UTS#10, 4.2.1), here requires Unicode::Normalize.
229 if($getCombinClass && defined $ch)
231 for(my $j = $i+1; $j < @src && $getCombinClass->( $src[$j] ); $j++)
233 my $comb = pack 'U', $src[$j];
234 next if ! $ent->{ $ch.$comb };
241 wantarray ? @buf : \@buf;
246 ## list[arrayrefs] weight = getWt(string element)
252 my $ent = $self->{entries};
253 my $ign = $self->{ignored};
254 my $cjk = $self->{overrideCJK};
255 my $hang = $self->{overrideHangul};
256 return if !defined $ch || $ign->{$ch}; # ignored
257 return @{ $ent->{$ch} } if $ent->{$ch};
258 my $u = unpack('U', $ch);
263 : map(@{ $ent->{pack('U', $_)} }, _decompHangul($u))
265 ? $cjk ? &$cjk($u) : map($self->altCE(0,@$_), _CJK($u))
266 : map($self->altCE(0,@$_), _derivCE($u));
270 ## int = index(string, substring)
275 my $lev = $self->{level};
276 my $str = $self->splitCE(shift);
277 my $sub = $self->splitCE(shift);
279 return wantarray ? (0,0) : 0 if ! @$sub;
280 return wantarray ? () : -1 if ! @$str;
282 my @subWt = grep _ignorableAtLevel($_,$lev),
283 map $self->getWt($_), @$sub;
288 my @tmp = grep _ignorableAtLevel($_,$lev), $self->getWt($e);
290 push @strPt, ($count) x @tmp;
292 while(@strWt >= @subWt){
293 if(_eqArray(\@strWt, \@subWt, $lev)){
295 return wantarray ? ($pos, $count-$pos) : $pos;
301 return wantarray ? () : -1;
305 ## bool _eqArray(arrayref, arrayref, level)
309 my $a = shift; # length $a >= length $b;
312 for my $v (0..$lev-1){
313 for my $c (0..@$b-1){
314 return if $a->[$c][$v] != $b->[$c][$v];
322 ## bool _ignorableAtLevel(CE, level)
324 sub _ignorableAtLevel($$)
327 return if ! defined $ce;
329 ! grep { ! $ce->[$_] } 0..$lv-1;
334 ## string sortkey = getSortKey(string arg)
339 my $lev = $self->{level};
340 my $rCE = $self->splitCE(shift); # get an arrayref
343 my @buf = grep defined(), map $self->getWt($_), @$rCE;
346 my @ret = ([],[],[],[]);
347 foreach my $v (0..$lev-1){
348 foreach my $b (@buf){
349 push @{ $ret[$v] }, $b->[$v] if $b->[$v];
352 foreach (@{ $self->{backwards} }){
354 @{ $ret[$v] } = reverse @{ $ret[$v] };
357 # modification of tertiary weights
358 if($self->{upper_before_lower}){
359 foreach (@{ $ret[2] }){
360 if (0x8 <= $_ && $_ <= 0xC){ $_ -= 6 } # lower
361 elsif(0x2 <= $_ && $_ <= 0x6){ $_ += 6 } # upper
362 elsif($_ == 0x1C) { $_ += 1 } # square upper
363 elsif($_ == 0x1D) { $_ -= 1 } # square lower
366 if($self->{katakana_before_hiragana}){
367 foreach (@{ $ret[2] }){
368 if (0x0F <= $_ && $_ <= 0x13){ $_ -= 2 } # katakana
369 elsif(0x0D <= $_ && $_ <= 0x0E){ $_ += 5 } # hiragana
372 join "\0\0", map pack('n*', @$_), @ret;
377 ## int compare = cmp(string a, string b)
379 sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
380 sub eq { $_[0]->getSortKey($_[1]) eq $_[0]->getSortKey($_[2]) }
381 sub ne { $_[0]->getSortKey($_[1]) ne $_[0]->getSortKey($_[2]) }
382 sub lt { $_[0]->getSortKey($_[1]) lt $_[0]->getSortKey($_[2]) }
383 sub le { $_[0]->getSortKey($_[1]) le $_[0]->getSortKey($_[2]) }
384 sub gt { $_[0]->getSortKey($_[1]) gt $_[0]->getSortKey($_[2]) }
385 sub ge { $_[0]->getSortKey($_[1]) ge $_[0]->getSortKey($_[2]) }
388 ## list[strings] sorted = sort(list[strings] arg)
395 sort{ $a->[0] cmp $b->[0] }
396 map [ $obj->getSortKey($_), $_ ], @_;
400 ## list[arrayrefs] CE = _derivCE(int codepoint)
405 my $a = UNDEFINED + ($code >> 15); # ok
406 my $b = ($code & 0x7FFF) | 0x8000; # ok
407 # my $a = 0xFFC2 + ($code >> 15); # ng
408 # my $b = $code & 0x7FFF | 0x1000; # ng
409 $b ? ([$a,2,1,$code],[$b,0,0,$code]) : [$a,2,1,$code];
413 ## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
415 sub _getHexArray { map hex(), $_[0] =~ /([0-9a-fA-F]+)/g }
418 ## bool is_a_CJK_Unified_Ideograph = _isCJK(int codepoint)
423 return 0x3400 <= $u && $u <= 0x4DB5
424 || 0x4E00 <= $u && $u <= 0x9FA5
425 # || 0x20000 <= $u && $u <= 0x2A6D6;
429 ## list[arrayref] CE = _CJK(int codepoint_of_CJK)
431 sub _CJK { $_[0] > 0xFFFF ? _derivCE($_[0]) : [$_[0], 0x20, 0x02, $_[0]] }
434 ## bool is_a_Hangul_Syllable = _isHangul(int codepoint)
436 sub _isHangul { return 0xAC00 <= $_[0] && $_[0] <= 0xD7A3 }
440 # $code must be in Hangul syllable. check it before you enter here.
441 my $SIndex = $code - 0xAC00;
442 my $LIndex = int( $SIndex / 588);
443 my $VIndex = int(($SIndex % 588) / 28);
444 my $TIndex = $SIndex % 28;
448 $TIndex ? (0x11A7 + $TIndex) : (),
457 Unicode::Collate - use UCA (Unicode Collation Algorithm)
461 use Unicode::Collate;
464 $Collator = Unicode::Collate->new(%tailoring);
467 @sorted = $Collator->sort(@not_sorted);
470 $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
474 =head2 Constructor and Tailoring
476 The C<new> method returns a collator object.
478 $Collator = Unicode::Collate->new(
479 alternate => $alternate,
480 backwards => $levelNumber, # or \@levelNumbers
482 normalization => $normalization_form,
483 ignoreName => qr/$ignoreName/,
484 ignoreChar => qr/$ignoreChar/,
485 katakana_before_hiragana => $bool,
486 level => $collationLevel,
487 overrideCJK => \&overrideCJK,
488 overrideHangul => \&overrideHangul,
489 preprocess => \&preprocess,
490 rearrange => \@charList,
492 undefName => qr/$undefName/,
493 undefChar => qr/$undefChar/,
494 upper_before_lower => $bool,
496 # if %tailoring is false (empty),
497 # $Collator should do the default collation.
503 -- see 3.2.2 Alternate Weighting, UTR #10.
505 alternate => 'shifted', 'blanked', 'non-ignorable', or 'shift-trimmed'.
507 By default (if specification is omitted), 'shifted' is adopted.
511 -- see 3.1.2 French Accents, UTR #10.
513 backwards => $levelNumber or \@levelNumbers
515 Weights in reverse order; ex. level 2 (diacritic ordering) in French.
516 If omitted, forwards at all the levels.
520 -- see 3.1 Linguistic Features; 3.2.1 File Format, UTR #10.
522 Overrides a default order or adds a new element
524 entry => <<'ENTRIES', # use the UCA file format
525 00E6 ; [.0861.0020.0002.00E6] [.08B1.0020.0002.00E6] # ligature <ae> as <a e>
526 0063 0068 ; [.0893.0020.0002.0063] # "ch" in traditional Spanish
527 0043 0068 ; [.0893.0020.0008.0043] # "Ch" in traditional Spanish
534 -- see Completely Ignorable, 3.2.2 Alternate Weighting, UTR #10.
536 Ignores the entry in the table.
537 If an ignored collation element appears in the string to be collated,
538 it is ignored as if the element had been deleted from there.
540 E.g. when 'a' and 'e' are ignored,
541 'element' is equal to 'lament' (or 'lmnt').
545 -- see 4.3 Form a sort key for each string, UTR #10.
547 Set the maximum level.
548 Any higher levels than the specified one are ignored.
550 Level 1: alphabetic ordering
551 Level 2: diacritic ordering
552 Level 3: case ordering
553 Level 4: tie-breaking (e.g. in the case when alternate is 'shifted')
559 -- see 4.1 Normalize each input string, UTR #10.
561 If specified, strings are normalized before preparation sort keys
562 (the normalization is executed after preprocess).
564 As a form name, one of the following names must be used.
566 'C' or 'NFC' for Normalization Form C
567 'D' or 'NFD' for Normalization Form D
568 'KC' or 'NFKC' for Normalization Form KC
569 'KD' or 'NFKD' for Normalization Form KD
571 If omitted, the string is put into Normalization Form D.
573 If undefined explicitly (as C<normalization =E<gt> undef>),
574 any normalization is not carried out (this may make tailoring easier
575 if any normalization is not desired).
583 -- see 7.1 Derived Collation Elements, UTR #10.
585 By default, mapping of CJK Unified Ideographs
586 uses the Unicode codepoint order
587 and Hangul Syllables are decomposed into Hangul Jamo.
589 The mapping of CJK Unified Ideographs
590 or Hangul Syllables may be overrided.
592 ex. CJK Unified Ideographs in the JIS codepoint order.
595 my $u = shift; # get unicode codepoint
596 my $b = pack('n', $u); # to UTF-16BE
597 my $s = your_unicode_to_sjis_converter($b); # convert
598 my $n = unpack('n', $s); # convert sjis to short
599 [ $n, 1, 1 ]; # return collation element
602 If you want to override the mapping of Hangul Syllables,
603 the Normalization Forms D and KD are not appropriate
604 (they will be decomposed before overriding).
608 -- see 5.1 Preprocessing, UTR #10.
610 If specified, the coderef is used to preprocess
611 before the formation of sort keys.
613 ex. dropping English articles, such as "a" or "the".
614 Then, "the pen" is before "a pencil".
618 $str =~ s/\b(?:an?|the)\s+//g;
624 -- see 3.1.3 Rearrangement, UTR #10.
626 Characters that are not coded in logical order and to be rearranged.
629 rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
633 -- see 3.2 Default Unicode Collation Element Table, UTR #10.
635 You can use another element table if desired.
636 The table file must be in your C<lib/Unicode/Collate> directory.
638 By default, the file C<lib/Unicode/Collate/allkeys.txt> is used.
644 -- see 6.3.4 Reducing the Repertoire, UTR #10.
646 Undefines the collation element as if it were unassigned in the table.
647 This reduces the size of the table.
648 If an unassigned character appears in the string to be collated,
649 the sort key is made from its codepoint
650 as a single-character collation element,
651 as it is greater than any other assigned collation elements
652 (in the codepoint order among the unassigned characters).
653 But, it'd be better to ignore characters
654 unfamiliar to you and maybe never used.
656 =item katakana_before_hiragana
658 =item upper_before_lower
660 -- see 6.6 Case Comparisons; 7.3.1 Tertiary Weight Table, UTR #10.
662 By default, lowercase is before uppercase
663 and hiragana is before katakana.
665 If the parameter is true, this is reversed.
673 =item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
675 Sorts a list of strings.
677 =item C<$result = $Collator-E<gt>cmp($a, $b)>
679 Returns 1 (when C<$a> is greater than C<$b>)
680 or 0 (when C<$a> is equal to C<$b>)
681 or -1 (when C<$a> is lesser than C<$b>).
683 =item C<$result = $Collator-E<gt>eq($a, $b)>
685 =item C<$result = $Collator-E<gt>ne($a, $b)>
687 =item C<$result = $Collator-E<gt>lt($a, $b)>
689 =item C<$result = $Collator-E<gt>le($a, $b)>
691 =item C<$result = $Collator-E<gt>gt($a, $b)>
693 =item C<$result = $Collator-E<gt>ge($a, $b)>
695 They works like the same name operators as theirs.
697 eq : whether $a is equal to $b.
698 ne : whether $a is not equal to $b.
699 lt : whether $a is lesser than $b.
700 le : whether $a is lesser than $b or equal to $b.
701 gt : whether $a is greater than $b.
702 ge : whether $a is greater than $b or equal to $b.
704 =item C<$sortKey = $Collator-E<gt>getSortKey($string)>
706 -- see 4.3 Form a sort key for each string, UTR #10.
710 You compare the sort keys using a binary comparison
711 and get the result of the comparison of the strings using UCA.
713 $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
717 $Collator->cmp($a, $b)
719 =item C<$position = $Collator-E<gt>index($string, $substring)>
721 =item C<($position, $length) = $Collator-E<gt>index($string, $substring)>
723 -- see 6.8 Searching, UTR #10.
725 If C<$substring> matches a part of C<$string>, returns
726 the position of the first occurrence of the matching part in scalar context;
727 in list context, returns a two-element list of
728 the position and the length of the matching part.
730 B<Notice> that the length of the matching part may differ from
731 the length of C<$substring>.
733 B<Note> that the position and the length are counted on the string
734 after the process of preprocess, normalization, and rearrangement.
735 Therefore, in case the specified string is not binary equal to
736 the preprocessed/normalized/rearranged string, the position and the length
737 may differ form those on the specified string. But it is guaranteed
738 that, if matched, it returns a non-negative value as C<$position>.
740 If C<$substring> does not match any part of C<$string>,
741 returns C<-1> in scalar context and
742 an empty list in list context.
746 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
747 my $str = "Ich mu\x{00DF} studieren.";
748 my $sub = "m\x{00FC}ss";
750 if(my($pos,$len) = $Collator->index($str, $sub)){
751 $match = substr($str, $pos, $len);
754 and get C<"mu\x{00DF}"> in C<$match> since C<"mu>E<223>C<">
755 is primary equal to C<"m>E<252>C<ss">.
765 Use of the C<normalization> parameter requires
766 the B<Unicode::Normalize> module.
768 If you need not it (say, in the case when you need not
769 handle any combining characters),
770 assign C<normalization =E<gt> undef> explicitly.
772 -- see 6.5 Avoiding Normalization, UTR #10.
776 SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
778 http://homepage1.nifty.com/nomenclator/perl/
780 Copyright(C) 2001, SADAHIRO Tomoyuki. Japan. All rights reserved.
782 This program is free software; you can redistribute it and/or
783 modify it under the same terms as Perl itself.
789 =item Unicode Collation Algorithm - Unicode TR #10
791 http://www.unicode.org/unicode/reports/tr10/
793 =item L<Unicode::Normalize>
795 normalized forms of Unicode text