1 package Unicode::Collate;
10 our $VERSION = '0.10';
11 our $PACKAGE = __PACKAGE__;
13 our @ISA = qw(Exporter);
15 our %EXPORT_TAGS = ();
19 (our $Path = $INC{'Unicode/Collate.pm'}) =~ s/\.pm$//;
20 our $KeyFile = "allkeys.txt";
22 our $getCombinClass; # coderef for combining class from Unicode::Normalize
24 use constant Min2 => 0x20; # minimum weight at level 2
25 use constant Min3 => 0x02; # minimum weight at level 3
26 use constant UNDEFINED => 0xFF80; # special value for undefined CE
34 my $self = bless { @_ }, $class;
38 ! exists $self->{alternate} ? 'shifted' :
39 ! defined $self->{alternate} ? '' : $self->{alternate};
42 $self->{level} ||= ($self->{alternate} =~ /^shift/ ? 4 : 3);
45 $self->{normalization} = 'D' if ! exists $self->{normalization};
47 if(defined $self->{normalization}){
48 eval "use Unicode::Normalize;";
49 croak "you'd install Unicode::Normalize for normalization forms: $@"
51 $getCombinClass = \&Unicode::Normalize::getCombinClass
56 ! defined $self->{normalization} ? undef :
57 $self->{normalization} =~ /^(?:NF)?C$/ ? \&NFC :
58 $self->{normalization} =~ /^(?:NF)?D$/ ? \&NFD :
59 $self->{normalization} =~ /^(?:NF)?KC$/ ? \&NFKC :
60 $self->{normalization} =~ /^(?:NF)?KD$/ ? \&NFKD :
61 croak "$PACKAGE unknown normalization form name: $self->{normalization}";
64 $self->{backwards} ||= [];
65 $self->{backwards} = [ $self->{backwards} ] if ! ref $self->{backwards};
68 $self->{rearrange} ||= []; # maybe not U+0000 (an ASCII)
69 $self->{rearrange} = [ $self->{rearrange} ] if ! ref $self->{rearrange};
72 # if undef is passed explicitly, no file is read.
73 $self->{table} = $KeyFile unless exists $self->{table};
74 $self->read_table if defined $self->{table};
77 $self->parseEntry($_) foreach split /\n/, $self->{entry};
80 # keys of $self->{rearrangeHash} are $self->{rearrange}.
81 $self->{rearrangeHash} = {};
82 @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
90 my $file = $self->{table} ne '' ? $self->{table} : $KeyFile;
91 open my $fk, "<$Path/$file" or croak "File does not exist at $Path/$file";
96 if(/^\@version\s*(\S*)/){
97 $self->{version} ||= $1;
99 elsif(/^\@alternate\s+(.*)/){
100 $self->{alternate} ||= $1;
102 elsif(/^\@backwards\s+(.*)/){
103 push @{ $self->{backwards} }, $1;
105 elsif(/^\@rearrange\s+(.*)/){
106 push @{ $self->{rearrange} }, _getHexArray($1);
110 $self->parseEntry($_);
117 ## get $line, parse it, and write an entry in $self
123 my($name, $ele, @key);
125 return if $line !~ /^\s*[0-9A-Fa-f]/;
128 $name = $1 if $line =~ s/#\s*(.*)//;
129 return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
132 my($e, $k) = split /;/, $line;
133 my @e = _getHexArray($e);
134 $ele = pack('U*', @e);
135 return if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
139 defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/ ||
140 defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/
143 $self->{entries}{$ele} = $self->{ignored}{$ele} = 1;
147 foreach my $arr ($k =~ /\[(\S+)\]/g) {
148 my $var = $arr =~ /\*/;
149 push @key, $self->altCE( $var, _getHexArray($arr) );
151 $self->{entries}{$ele} = \@key;
153 $self->{maxlength}{ord $ele} = scalar @e if @e > 1;
158 ## arrayref CE = altCE(bool variable?, list[num] weights)
166 $self->{alternate} eq 'blanked' ?
167 $var ? [0,0,0] : [ @c[0..2] ] :
168 $self->{alternate} eq 'non-ignorable' ?
170 $self->{alternate} eq 'shifted' ?
171 $var ? [0,0,0,$c[0] ] : [ @c[0..2], $c[0]+$c[1]+$c[2] ? 0xFFFF : 0 ] :
172 $self->{alternate} eq 'shift-trimmed' ?
173 $var ? [0,0,0,$c[0] ] : [ @c[0..2], 0 ] :
178 ## string hex_sortkey = splitCE(string arg)
183 my $key = $self->getSortKey(@_);
184 my $view = join " ", map sprintf("%04X", $_), unpack 'n*', $key;
185 $view =~ s/ ?0000 ?/|/g;
191 ## list[strings] elements = splitCE(string arg)
196 my $code = $self->{preprocess};
197 my $norm = $self->{UNF};
198 my $ent = $self->{entries};
199 my $max = $self->{maxlength};
200 my $rear = $self->{rearrangeHash};
202 my $str = ref $code ? &$code(shift) : shift;
203 $str = &$norm($str) if ref $norm;
206 @src = unpack('U*', $str);
209 for(my $i = 0; $i < @src; $i++)
211 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i])
212 if $rear->{ $src[$i] };
216 for(my $i = 0; $i < @src; $i++)
222 next unless defined $u;
223 next if $u < 0 || 0x10FFFF < $u # out of range
224 || 0xD800 < $u && $u < 0xDFFF; # unpaired surrogates
225 my $four = $u & 0xFFFF;
226 next if $four == 0xFFFE || $four == 0xFFFF;
228 if($max->{$u}) # contract
230 for(my $j = $max->{$u}; $j >= 1; $j--)
232 next unless $i+$j-1 < @src;
233 $ch = pack 'U*', @src[$i .. $i+$j-1];
234 $i += $j-1, last if $ent->{$ch};
237 else { $ch = pack('U', $u) }
239 # with Combining Char (UTS#10, 4.2.1), here requires Unicode::Normalize.
240 if($getCombinClass && defined $ch)
242 for(my $j = $i+1; $j < @src; $j++)
244 next unless defined $src[$j];
245 last unless $getCombinClass->( $src[$j] );
246 my $comb = pack 'U', $src[$j];
247 next if ! $ent->{ $ch.$comb };
254 wantarray ? @buf : \@buf;
259 ## list[arrayrefs] weight = getWt(string element)
265 my $ent = $self->{entries};
266 my $ign = $self->{ignored};
267 my $cjk = $self->{overrideCJK};
268 my $hang = $self->{overrideHangul};
269 return if !defined $ch || $ign->{$ch}; # ignored
270 return @{ $ent->{$ch} } if $ent->{$ch};
271 my $u = unpack('U', $ch);
276 : map(@{ $ent->{pack('U', $_)} }, _decompHangul($u))
278 ? $cjk ? &$cjk($u) : map($self->altCE(0,@$_), _CJK($u))
279 : map($self->altCE(0,@$_), _derivCE($u));
283 ## int = index(string, substring)
288 my $lev = $self->{level};
289 my $str = $self->splitCE(shift);
290 my $sub = $self->splitCE(shift);
292 return wantarray ? (0,0) : 0 if ! @$sub;
293 return wantarray ? () : -1 if ! @$str;
295 my @subWt = grep _ignorableAtLevel($_,$lev),
296 map $self->getWt($_), @$sub;
301 my @tmp = grep _ignorableAtLevel($_,$lev), $self->getWt($e);
303 push @strPt, ($count) x @tmp;
305 while(@strWt >= @subWt){
306 if(_eqArray(\@strWt, \@subWt, $lev)){
308 return wantarray ? ($pos, $count-$pos) : $pos;
314 return wantarray ? () : -1;
318 ## bool _eqArray(arrayref, arrayref, level)
322 my $a = shift; # length $a >= length $b;
325 for my $v (0..$lev-1){
326 for my $c (0..@$b-1){
327 return if $a->[$c][$v] != $b->[$c][$v];
335 ## bool _ignorableAtLevel(CE, level)
337 sub _ignorableAtLevel($$)
340 return if ! defined $ce;
342 ! grep { ! $ce->[$_] } 0..$lv-1;
347 ## string sortkey = getSortKey(string arg)
352 my $lev = $self->{level};
353 my $rCE = $self->splitCE(shift); # get an arrayref
356 my @buf = grep defined(), map $self->getWt($_), @$rCE;
359 my @ret = ([],[],[],[]);
360 foreach my $v (0..$lev-1){
361 foreach my $b (@buf){
362 push @{ $ret[$v] }, $b->[$v] if $b->[$v];
365 foreach (@{ $self->{backwards} }){
367 @{ $ret[$v] } = reverse @{ $ret[$v] };
370 # modification of tertiary weights
371 if($self->{upper_before_lower}){
372 foreach (@{ $ret[2] }){
373 if (0x8 <= $_ && $_ <= 0xC){ $_ -= 6 } # lower
374 elsif(0x2 <= $_ && $_ <= 0x6){ $_ += 6 } # upper
375 elsif($_ == 0x1C) { $_ += 1 } # square upper
376 elsif($_ == 0x1D) { $_ -= 1 } # square lower
379 if($self->{katakana_before_hiragana}){
380 foreach (@{ $ret[2] }){
381 if (0x0F <= $_ && $_ <= 0x13){ $_ -= 2 } # katakana
382 elsif(0x0D <= $_ && $_ <= 0x0E){ $_ += 5 } # hiragana
385 join "\0\0", map pack('n*', @$_), @ret;
390 ## int compare = cmp(string a, string b)
392 sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
393 sub eq { $_[0]->getSortKey($_[1]) eq $_[0]->getSortKey($_[2]) }
394 sub ne { $_[0]->getSortKey($_[1]) ne $_[0]->getSortKey($_[2]) }
395 sub lt { $_[0]->getSortKey($_[1]) lt $_[0]->getSortKey($_[2]) }
396 sub le { $_[0]->getSortKey($_[1]) le $_[0]->getSortKey($_[2]) }
397 sub gt { $_[0]->getSortKey($_[1]) gt $_[0]->getSortKey($_[2]) }
398 sub ge { $_[0]->getSortKey($_[1]) ge $_[0]->getSortKey($_[2]) }
401 ## list[strings] sorted = sort(list[strings] arg)
408 sort{ $a->[0] cmp $b->[0] }
409 map [ $obj->getSortKey($_), $_ ], @_;
413 ## list[arrayrefs] CE = _derivCE(int codepoint)
418 my $a = UNDEFINED + ($code >> 15); # ok
419 my $b = ($code & 0x7FFF) | 0x8000; # ok
420 # my $a = 0xFFC2 + ($code >> 15); # ng
421 # my $b = $code & 0x7FFF | 0x1000; # ng
422 $b ? ([$a,2,1,$code],[$b,0,0,$code]) : [$a,2,1,$code];
426 ## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
428 sub _getHexArray { map hex(), $_[0] =~ /([0-9a-fA-F]+)/g }
431 ## bool is_a_CJK_Unified_Ideograph = _isCJK(int codepoint)
436 return 0x3400 <= $u && $u <= 0x4DB5
437 || 0x4E00 <= $u && $u <= 0x9FA5
438 # || 0x20000 <= $u && $u <= 0x2A6D6;
442 ## list[arrayref] CE = _CJK(int codepoint_of_CJK)
444 sub _CJK { $_[0] > 0xFFFF ? _derivCE($_[0]) : [$_[0], 0x20, 0x02, $_[0]] }
447 ## bool is_a_Hangul_Syllable = _isHangul(int codepoint)
449 sub _isHangul { return 0xAC00 <= $_[0] && $_[0] <= 0xD7A3 }
453 # $code must be in Hangul syllable. check it before you enter here.
454 my $SIndex = $code - 0xAC00;
455 my $LIndex = int( $SIndex / 588);
456 my $VIndex = int(($SIndex % 588) / 28);
457 my $TIndex = $SIndex % 28;
461 $TIndex ? (0x11A7 + $TIndex) : (),
470 Unicode::Collate - use UCA (Unicode Collation Algorithm)
474 use Unicode::Collate;
477 $Collator = Unicode::Collate->new(%tailoring);
480 @sorted = $Collator->sort(@not_sorted);
483 $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
487 =head2 Constructor and Tailoring
489 The C<new> method returns a collator object.
491 $Collator = Unicode::Collate->new(
492 alternate => $alternate,
493 backwards => $levelNumber, # or \@levelNumbers
495 normalization => $normalization_form,
496 ignoreName => qr/$ignoreName/,
497 ignoreChar => qr/$ignoreChar/,
498 katakana_before_hiragana => $bool,
499 level => $collationLevel,
500 overrideCJK => \&overrideCJK,
501 overrideHangul => \&overrideHangul,
502 preprocess => \&preprocess,
503 rearrange => \@charList,
505 undefName => qr/$undefName/,
506 undefChar => qr/$undefChar/,
507 upper_before_lower => $bool,
509 # if %tailoring is false (empty),
510 # $Collator should do the default collation.
516 -- see 3.2.2 Alternate Weighting, UTR #10.
518 alternate => 'shifted', 'blanked', 'non-ignorable', or 'shift-trimmed'.
520 By default (if specification is omitted), 'shifted' is adopted.
524 -- see 3.1.2 French Accents, UTR #10.
526 backwards => $levelNumber or \@levelNumbers
528 Weights in reverse order; ex. level 2 (diacritic ordering) in French.
529 If omitted, forwards at all the levels.
533 -- see 3.1 Linguistic Features; 3.2.1 File Format, UTR #10.
535 Overrides a default order or adds a new collation element
537 entry => <<'ENTRIES', # use the UCA file format
538 00E6 ; [.0861.0020.0002.00E6] [.08B1.0020.0002.00E6] # ligature <ae> as <a e>
539 0063 0068 ; [.0893.0020.0002.0063] # "ch" in traditional Spanish
540 0043 0068 ; [.0893.0020.0008.0043] # "Ch" in traditional Spanish
547 -- see Completely Ignorable, 3.2.2 Alternate Weighting, UTR #10.
549 Ignores the entry in the table.
550 If an ignored collation element appears in the string to be collated,
551 it is ignored as if the element had been deleted from there.
553 E.g. when 'a' and 'e' are ignored,
554 'element' is equal to 'lament' (or 'lmnt').
558 -- see 4.3 Form a sort key for each string, UTR #10.
560 Set the maximum level.
561 Any higher levels than the specified one are ignored.
563 Level 1: alphabetic ordering
564 Level 2: diacritic ordering
565 Level 3: case ordering
566 Level 4: tie-breaking (e.g. in the case when alternate is 'shifted')
572 -- see 4.1 Normalize each input string, UTR #10.
574 If specified, strings are normalized before preparation of sort keys
575 (the normalization is executed after preprocess).
577 As a form name, one of the following names must be used.
579 'C' or 'NFC' for Normalization Form C
580 'D' or 'NFD' for Normalization Form D
581 'KC' or 'NFKC' for Normalization Form KC
582 'KD' or 'NFKD' for Normalization Form KD
584 If omitted, the string is put into Normalization Form D.
586 If undefined explicitly (as C<normalization =E<gt> undef>),
587 any normalization is not carried out (this may make tailoring easier
588 if any normalization is not desired).
596 -- see 7.1 Derived Collation Elements, UTR #10.
598 By default, mapping of CJK Unified Ideographs
599 uses the Unicode codepoint order
600 and Hangul Syllables are decomposed into Hangul Jamo.
602 The mapping of CJK Unified Ideographs
603 or Hangul Syllables may be overrided.
605 ex. CJK Unified Ideographs in the JIS codepoint order.
608 my $u = shift; # get unicode codepoint
609 my $b = pack('n', $u); # to UTF-16BE
610 my $s = your_unicode_to_sjis_converter($b); # convert
611 my $n = unpack('n', $s); # convert sjis to short
612 [ $n, 1, 1 ]; # return collation element
615 If you want to override the mapping of Hangul Syllables,
616 the Normalization Forms D and KD are not appropriate
617 (they will be decomposed before overriding).
621 -- see 5.1 Preprocessing, UTR #10.
623 If specified, the coderef is used to preprocess
624 before the formation of sort keys.
626 ex. dropping English articles, such as "a" or "the".
627 Then, "the pen" is before "a pencil".
631 $str =~ s/\b(?:an?|the)\s+//g;
637 -- see 3.1.3 Rearrangement, UTR #10.
639 Characters that are not coded in logical order and to be rearranged.
642 rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
646 -- see 3.2 Default Unicode Collation Element Table, UTR #10.
648 You can use another element table if desired.
649 The table file must be in your C<lib/Unicode/Collate> directory.
651 By default, the file C<lib/Unicode/Collate/allkeys.txt> is used.
653 If undefined explicitly (as C<table =E<gt> undef>),
654 no file is read (you'd define collation elements using L<entry>).
660 -- see 6.3.4 Reducing the Repertoire, UTR #10.
662 Undefines the collation element as if it were unassigned in the table.
663 This reduces the size of the table.
664 If an unassigned character appears in the string to be collated,
665 the sort key is made from its codepoint
666 as a single-character collation element,
667 as it is greater than any other assigned collation elements
668 (in the codepoint order among the unassigned characters).
669 But, it'd be better to ignore characters
670 unfamiliar to you and maybe never used.
672 =item katakana_before_hiragana
674 =item upper_before_lower
676 -- see 6.6 Case Comparisons; 7.3.1 Tertiary Weight Table, UTR #10.
678 By default, lowercase is before uppercase
679 and hiragana is before katakana.
681 If the parameter is true, this is reversed.
689 =item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
691 Sorts a list of strings.
693 =item C<$result = $Collator-E<gt>cmp($a, $b)>
695 Returns 1 (when C<$a> is greater than C<$b>)
696 or 0 (when C<$a> is equal to C<$b>)
697 or -1 (when C<$a> is lesser than C<$b>).
699 =item C<$result = $Collator-E<gt>eq($a, $b)>
701 =item C<$result = $Collator-E<gt>ne($a, $b)>
703 =item C<$result = $Collator-E<gt>lt($a, $b)>
705 =item C<$result = $Collator-E<gt>le($a, $b)>
707 =item C<$result = $Collator-E<gt>gt($a, $b)>
709 =item C<$result = $Collator-E<gt>ge($a, $b)>
711 They works like the same name operators as theirs.
713 eq : whether $a is equal to $b.
714 ne : whether $a is not equal to $b.
715 lt : whether $a is lesser than $b.
716 le : whether $a is lesser than $b or equal to $b.
717 gt : whether $a is greater than $b.
718 ge : whether $a is greater than $b or equal to $b.
720 =item C<$sortKey = $Collator-E<gt>getSortKey($string)>
722 -- see 4.3 Form a sort key for each string, UTR #10.
726 You compare the sort keys using a binary comparison
727 and get the result of the comparison of the strings using UCA.
729 $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
733 $Collator->cmp($a, $b)
735 =item C<$position = $Collator-E<gt>index($string, $substring)>
737 =item C<($position, $length) = $Collator-E<gt>index($string, $substring)>
739 -- see 6.8 Searching, UTR #10.
741 If C<$substring> matches a part of C<$string>, returns
742 the position of the first occurrence of the matching part in scalar context;
743 in list context, returns a two-element list of
744 the position and the length of the matching part.
746 B<Notice> that the length of the matching part may differ from
747 the length of C<$substring>.
749 B<Note> that the position and the length are counted on the string
750 after the process of preprocess, normalization, and rearrangement.
751 Therefore, in case the specified string is not binary equal to
752 the preprocessed/normalized/rearranged string, the position and the length
753 may differ form those on the specified string. But it is guaranteed
754 that, if matched, it returns a non-negative value as C<$position>.
756 If C<$substring> does not match any part of C<$string>,
757 returns C<-1> in scalar context and
758 an empty list in list context.
762 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
763 my $str = "Ich mu\x{00DF} studieren.";
764 my $sub = "m\x{00FC}ss";
766 if(my($pos,$len) = $Collator->index($str, $sub)){
767 $match = substr($str, $pos, $len);
770 and get C<"mu\x{00DF}"> in C<$match> since C<"mu>E<223>C<">
771 is primary equal to C<"m>E<252>C<ss">.
781 Use of the C<normalization> parameter requires
782 the B<Unicode::Normalize> module.
784 If you need not it (say, in the case when you need not
785 handle any combining characters),
786 assign C<normalization =E<gt> undef> explicitly.
788 -- see 6.5 Avoiding Normalization, UTR #10.
792 SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
794 http://homepage1.nifty.com/nomenclator/perl/
796 Copyright(C) 2001, SADAHIRO Tomoyuki. Japan. All rights reserved.
798 This program is free software; you can redistribute it and/or
799 modify it under the same terms as Perl itself.
805 =item Unicode Collation Algorithm - Unicode TR #10
807 http://www.unicode.org/unicode/reports/tr10/
809 =item L<Unicode::Normalize>
811 normalized forms of Unicode text