1 package Unicode::Collate;
10 our $PACKAGE = __PACKAGE__;
12 our @ISA = qw(Exporter);
14 our %EXPORT_TAGS = ();
18 (our $Path = $INC{'Unicode/Collate.pm'}) =~ s/\.pm$//;
19 our $KeyFile = "allkeys.txt";
21 # Lingua::KO::Hangul::Util not part of the standard distribution
22 # but it will be used if available.
24 eval { require Lingua::KO::Hangul::Util };
25 my $hasHangulUtil = ! $@;
27 Lingua::KO::Hangul::Util->import();
30 our %Combin; # combining class from Unicode::Normalize
32 use constant Min2 => 0x20; # minimum weight at level 2
33 use constant Min3 => 0x02; # minimum weight at level 3
34 use constant UNDEFINED => 0xFF80; # special value for undefined CE
42 my $self = bless { @_ }, $class;
46 ! exists $self->{alternate} ? 'shifted' :
47 ! defined $self->{alternate} ? '' : $self->{alternate};
50 $self->{level} ||= ($self->{alternate} =~ /shift/ ? 4 : 3);
53 $self->{normalization} = 'D' if ! exists $self->{normalization};
55 eval "use Unicode::Normalize;" if defined $self->{normalization};
58 ! defined $self->{normalization} ? undef :
59 $self->{normalization} =~ /^(?:NF)?C$/ ? \&NFC :
60 $self->{normalization} =~ /^(?:NF)?D$/ ? \&NFD :
61 $self->{normalization} =~ /^(?:NF)?KC$/ ? \&NFKC :
62 $self->{normalization} =~ /^(?:NF)?KD$/ ? \&NFKD :
63 croak "$PACKAGE unknown normalization form name: $self->{normalization}";
65 *Combin = \%Unicode::Normalize::Combin if $self->{normalize} && ! %Combin;
68 $self->{backwards} ||= [];
69 $self->{backwards} = [ $self->{backwards} ] if ! ref $self->{backwards};
72 $self->{rearrange} ||= []; # maybe not U+0000 (an ASCII)
73 $self->{rearrange} = [ $self->{rearrange} ] if ! ref $self->{rearrange};
76 my $file = defined $self->{table} ? $self->{table} : $KeyFile;
77 open my $fk, "<$Path/$file" or croak "File does not exist at $Path/$file";
82 if(/^\@version\s*(\S*)/){
83 $self->{version} ||= $1;
85 elsif(/^\@alternate\s+(.*)/){
86 $self->{alternate} ||= $1;
88 elsif(/^\@backwards\s+(.*)/){
89 push @{ $self->{backwards} }, $1;
91 elsif(/^\@rearrange\s+(.*)/){
92 push @{ $self->{rearrange} }, _getHexArray($1);
96 $self->parseEntry($_);
100 $self->parseEntry($_) foreach split /\n/, $self->{entry};
103 # keys of $self->{rearrangeHash} are $self->{rearrange}.
104 $self->{rearrangeHash} = {};
105 @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
111 ## get $line, parse it, and write an entry in $self
117 my($name, $ele, @key);
119 return if $line !~ /^\s*[0-9A-Fa-f]/;
122 $name = $1 if $line =~ s/#\s*(.*)//;
123 return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
126 my($e, $k) = split /;/, $line;
127 my @e = _getHexArray($e);
128 $ele = pack('U*', @e);
129 return if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
133 defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/ ||
134 defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/
137 $self->{entries}{$ele} = $self->{ignored}{$ele} = 1;
141 foreach my $arr ($k =~ /\[(\S+)\]/g) {
142 my $var = $arr =~ /\*/;
143 push @key, $self->altCE( $var, _getHexArray($arr) );
145 $self->{entries}{$ele} = \@key;
147 $self->{maxlength}{ord $ele} = scalar @e if @e > 1;
152 ## arrayref CE = altCE(bool variable?, list[num] weights)
160 $self->{alternate} eq 'blanked' ?
161 $var ? [0,0,0] : [ @c[0..2] ] :
162 $self->{alternate} eq 'non-ignorable' ?
164 $self->{alternate} eq 'shifted' ?
165 $var ? [0,0,0,$c[0] ] : [ @c[0..2], $c[0]+$c[1]+$c[2] ? 0xFFFF : 0 ] :
166 $self->{alternate} eq 'shift-trimmed' ?
167 $var ? [0,0,0,$c[0] ] : [ @c[0..2], 0 ] :
172 ## string hex_sortkey = splitCE(string arg)
177 my $key = $self->getSortKey(@_);
178 my $view = join " ", map sprintf("%04X", $_), unpack 'n*', $key;
179 $view =~ s/ ?0000 ?/|/g;
185 ## list[strings] elements = splitCE(string arg)
190 my $code = $self->{preprocess};
191 my $norm = $self->{normalize};
192 my $ent = $self->{entries};
193 my $max = $self->{maxlength};
194 my $rear = $self->{rearrangeHash};
196 my $str = ref $code ? &$code(shift) : shift;
197 $str = &$norm($str) if ref $norm;
200 @src = unpack('U*', $str);
203 for(my $i = 0; $i < @src; $i++)
205 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i])
206 if $rear->{ $src[$i] };
210 for(my $i = 0; $i < @src; $i++)
216 next if $u < 0 || 0x10FFFF < $u # out of range
217 || 0xD800 < $u && $u < 0xDFFF; # unpaired surrogates
218 my $four = $u & 0xFFFF;
219 next if $four == 0xFFFE || $four == 0xFFFF;
221 if($max->{$u}) # contract
223 for(my $j = $max->{$u}; $j >= 1; $j--)
225 next unless $i+$j-1 < @src;
226 $ch = pack 'U*', @src[$i .. $i+$j-1];
227 $i += $j-1, last if $ent->{$ch};
230 else { $ch = pack('U', $u) }
232 if(%Combin && defined $ch) # with Combining Char
234 for(my $j = $i+1; $j < @src && $Combin{ $src[$j] }; $j++)
236 my $comb = pack 'U', $src[$j];
237 next if ! $ent->{ $ch.$comb };
245 wantarray ? @buf : \@buf;
250 ## list[arrayrefs] weight = getWt(string element)
256 my $ent = $self->{entries};
257 my $ign = $self->{ignored};
258 my $cjk = $self->{overrideCJK};
259 my $hang = $self->{overrideHangul};
260 return if !defined $ch || $ign->{$ch}; # ignored
261 return @{ $ent->{$ch} } if $ent->{$ch};
262 my $u = unpack('U', $ch);
268 map(@{ $ent->{pack('U', $_)} }, decomposeHangul($u)) :
269 # runtime compile error...
270 (eval 'use Lingua::KO::Hangul::Util', print $@))
272 ? $cjk ? &$cjk($u) : map($self->altCE(0,@$_), _CJK($u))
273 : map($self->altCE(0,@$_), _derivCE($u));
277 ## int = index(string, substring)
282 my $lev = $self->{level};
283 my $str = $self->splitCE(shift);
284 my $sub = $self->splitCE(shift);
286 return wantarray ? (0,0) : 0 if ! @$sub;
287 return wantarray ? () : -1 if ! @$str;
289 my @subWt = grep _ignorableAtLevel($_,$lev),
290 map $self->getWt($_), @$sub;
295 my @tmp = grep _ignorableAtLevel($_,$lev), $self->getWt($e);
297 push @strPt, ($count) x @tmp;
299 while(@strWt >= @subWt){
300 if(_eqArray(\@strWt, \@subWt, $lev)){
302 return wantarray ? ($pos, $count-$pos) : $pos;
308 return wantarray ? () : -1;
312 ## bool _eqArray(arrayref, arrayref, level)
316 my $a = shift; # length $a >= length $b;
319 for my $v (0..$lev-1){
320 for my $c (0..@$b-1){
321 return if $a->[$c][$v] != $b->[$c][$v];
329 ## bool _ignorableAtLevel(CE, level)
331 sub _ignorableAtLevel($$)
334 return if ! defined $ce;
336 ! grep { ! $ce->[$_] } 0..$lv-1;
341 ## string sortkey = getSortKey(string arg)
346 my $lev = $self->{level};
347 my $rCE = $self->splitCE(shift); # get an arrayref
350 my @buf = grep defined(), map $self->getWt($_), @$rCE;
353 my @ret = ([],[],[],[]);
354 foreach my $v (0..$lev-1){
355 foreach my $b (@buf){
356 push @{ $ret[$v] }, $b->[$v] if $b->[$v];
359 foreach (@{ $self->{backwards} }){
361 @{ $ret[$v] } = reverse @{ $ret[$v] };
364 # modification of tertiary weights
365 if($self->{upper_before_lower}){
366 foreach (@{ $ret[2] }){
367 if (0x8 <= $_ && $_ <= 0xC){ $_ -= 6 } # lower
368 elsif(0x2 <= $_ && $_ <= 0x6){ $_ += 6 } # upper
369 elsif($_ == 0x1C) { $_ += 1 } # square upper
370 elsif($_ == 0x1D) { $_ -= 1 } # square lower
373 if($self->{katakana_before_hiragana}){
374 foreach (@{ $ret[2] }){
375 if (0x0F <= $_ && $_ <= 0x13){ $_ -= 2 } # katakana
376 elsif(0x0D <= $_ && $_ <= 0x0E){ $_ += 5 } # hiragana
379 join "\0\0", map pack('n*', @$_), @ret;
384 ## int compare = cmp(string a, string b)
391 $obj->getSortKey($a) cmp $obj->getSortKey($b);
395 ## list[strings] sorted = sort(list[strings] arg)
402 sort{ $a->[0] cmp $b->[0] }
403 map [ $obj->getSortKey($_), $_ ], @_;
407 ## list[arrayrefs] CE = _derivCE(int codepoint)
412 my $a = UNDEFINED + ($code >> 15); # ok
413 my $b = ($code & 0x7FFF) | 0x8000; # ok
414 # my $a = 0xFFC2 + ($code >> 15); # ng
415 # my $b = $code & 0x7FFF | 0x1000; # ng
416 $b ? ([$a,2,1,$code],[$b,0,0,$code]) : [$a,2,1,$code];
420 ## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
425 map hex(), $str =~ /([0-9a-fA-F]+)/g;
429 ## bool is_a_CJK_Unified_Ideograph = _isCJK(int codepoint)
434 return 0x3400 <= $u && $u <= 0x4DB5
435 || 0x4E00 <= $u && $u <= 0x9FA5
436 # || 0x20000 <= $u && $u <= 0x2A6D6;
440 ## list[arrayref] CE = _CJK(int codepoint_of_CJK)
445 $u > 0xFFFF ? _derivCE($u) : [$u,0x20,0x02,$u];
449 ## bool is_a_Hangul_Syllable = _isHangul(int codepoint)
454 return 0xAC00 <= $code && $code <= 0xD7A3;
462 Unicode::Collate - use UCA (Unicode Collation Algorithm)
466 use Unicode::Collate;
469 $UCA = Unicode::Collate->new(%tailoring);
472 @sorted = $UCA->sort(@not_sorted);
475 $result = $UCA->cmp($a, $b); # returns 1, 0, or -1.
479 =head2 Constructor and Tailoring
481 The C<new> method returns a collator object.
483 $UCA = Unicode::Collate->new(
484 alternate => $alternate,
485 backwards => $levelNumber, # or \@levelNumbers
487 normalization => $normalization_form,
488 ignoreName => qr/$ignoreName/,
489 ignoreChar => qr/$ignoreChar/,
490 katakana_before_hiragana => $bool,
491 level => $collationLevel,
492 overrideCJK => \&overrideCJK,
493 overrideHangul => \&overrideHangul,
494 preprocess => \&preprocess,
495 rearrange => \@charList,
497 undefName => qr/$undefName/,
498 undefChar => qr/$undefChar/,
499 upper_before_lower => $bool,
501 # if %tailoring is false (empty),
502 # $UCA should do the default collation.
508 -- see 3.2.2 Alternate Weighting, UTR #10.
510 alternate => 'shifted', 'blanked', 'non-ignorable', or 'shift-trimmed'.
512 By default (if specification is omitted), 'shifted' is adopted.
516 -- see 3.1.2 French Accents, UTR #10.
518 backwards => $levelNumber or \@levelNumbers
520 Weights in reverse order; ex. level 2 (diacritic ordering) in French.
521 If omitted, forwards at all the levels.
525 -- see 3.1 Linguistic Features; 3.2.1 File Format, UTR #10.
527 Overrides a default order or adds a new element
529 entry => <<'ENTRIES', # use the UCA file format
530 00E6 ; [.0861.0020.0002.00E6] [.08B1.0020.0002.00E6] # ligature <ae> as <a e>
531 0063 0068 ; [.0893.0020.0002.0063] # "ch" in traditional Spanish
532 0043 0068 ; [.0893.0020.0008.0043] # "Ch" in traditional Spanish
539 -- see Completely Ignorable, 3.2.2 Alternate Weighting, UTR #10.
541 Ignores the entry in the table.
542 If an ignored collation element appears in the string to be collated,
543 it is ignored as if the element had been deleted from there.
545 E.g. when 'a' and 'e' are ignored,
546 'element' is equal to 'lament' (or 'lmnt').
550 -- see 4.3 Form a sort key for each string, UTR #10.
552 Set the maximum level.
553 Any higher levels than the specified one are ignored.
555 Level 1: alphabetic ordering
556 Level 2: diacritic ordering
557 Level 3: case ordering
558 Level 4: tie-breaking (e.g. in the case when alternate is 'shifted')
564 -- see 4.1 Normalize each input string, UTR #10.
566 If specified, strings are normalized before preparation sort keys
567 (the normalization is executed after preprocess).
569 As a form name, one of the following names must be used.
571 'C' or 'NFC' for Normalization Form C
572 'D' or 'NFD' for Normalization Form D
573 'KC' or 'NFKC' for Normalization Form KC
574 'KD' or 'NFKD' for Normalization Form KD
576 If omitted, the string is put into Normalization Form D.
578 If undefined explicitly (as C<normalization =E<gt> undef>),
579 any normalization is not carried out (this may make tailoring easier
580 if any normalization is not desired).
588 -- see 7.1 Derived Collation Elements, UTR #10.
590 By default, mapping of CJK Unified Ideographs
591 uses the Unicode codepoint order
592 and Hangul Syllables are decomposed into Hangul Jamo.
594 The mapping of CJK Unified Ideographs
595 or Hangul Syllables may be overrided.
597 ex. CJK Unified Ideographs in the JIS codepoint order.
600 my $u = shift; # get unicode codepoint
601 my $b = pack('n', $u); # to UTF-16BE
602 my $s = your_unicode_to_sjis_converter($b); # convert
603 my $n = unpack('n', $s); # convert sjis to short
604 [ $n, 1, 1 ]; # return collation element
607 If you want to override the mapping of Hangul Syllables,
608 the Normalization Forms D and KD are not appropriate
609 (they will be decomposed before overriding).
613 -- see 5.1 Preprocessing, UTR #10.
615 If specified, the coderef is used to preprocess
616 before the formation of sort keys.
618 ex. dropping English articles, such as "a" or "the".
619 Then, "the pen" is before "a pencil".
623 $str =~ s/\b(?:an?|the)\s+//g;
629 -- see 3.1.3 Rearrangement, UTR #10.
631 Characters that are not coded in logical order and to be rearranged.
634 rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
638 -- see 3.2 Default Unicode Collation Element Table, UTR #10.
640 You can use another element table if desired.
641 The table file must be in your C<lib/Unicode/Collate> directory.
643 By default, the file C<lib/Unicode/Collate/allkeys.txt> is used.
649 -- see 6.3.4 Reducing the Repertoire, UTR #10.
651 Undefines the collation element as if it were unassigned in the table.
652 This reduces the size of the table.
653 If an unassigned character appears in the string to be collated,
654 the sort key is made from its codepoint
655 as a single-character collation element,
656 as it is greater than any other assigned collation elements
657 (in the codepoint order among the unassigned characters).
658 But, it'd be better to ignore characters
659 unfamiliar to you and maybe never used.
661 =item katakana_before_hiragana
663 =item upper_before_lower
665 -- see 6.6 Case Comparisons; 7.3.1 Tertiary Weight Table, UTR #10.
667 By default, lowercase is before uppercase
668 and hiragana is before katakana.
670 If the parameter is true, this is reversed.
678 =item C<@sorted = $UCA-E<gt>sort(@not_sorted)>
680 Sorts a list of strings.
682 =item C<$result = $UCA-E<gt>cmp($a, $b)>
684 Returns 1 (when C<$a> is greater than C<$b>)
685 or 0 (when C<$a> is equal to C<$b>)
686 or -1 (when C<$a> is lesser than C<$b>).
688 =item C<$sortKey = $UCA-E<gt>getSortKey($string)>
690 -- see 4.3 Form a sort key for each string, UTR #10.
694 You compare the sort keys using a binary comparison
695 and get the result of the comparison of the strings using UCA.
697 $UCA->getSortKey($a) cmp $UCA->getSortKey($b)
703 =item C<$position = $UCA-E<gt>index($string, $substring)>
705 =item C<($position, $length) = $UCA-E<gt>index($string, $substring)>
707 -- see 6.8 Searching, UTR #10.
709 If C<$substring> matches a part of C<$string>, returns
710 the position of the first occurrence of the matching part in scalar context;
711 in list context, returns a two-element list of
712 the position and the length of the matching part.
714 B<Notice> that the length of the matching part may differ from
715 the length of C<$substring>.
717 B<Note> that the position and the length are counted on the string
718 after the process of preprocess, normalization, and rearrangement.
719 Therefore, in case the specified string is not binary equal to
720 the preprocessed/normalized/rearranged string, the position and the length
721 may differ form those on the specified string. But it is guaranteed
722 that, if matched, it returns a non-negative value as C<$position>.
724 If C<$substring> does not match any part of C<$string>,
725 returns C<-1> in scalar context and
726 an empty list in list context.
730 my $UCA = Unicode::Collate->new( normalization => undef, level => 1 );
731 my $str = "Ich mu\x{00DF} studieren.";
732 my $sub = "m\x{00FC}ss";
734 if(my @tmp = $UCA->index($str, $sub)){
735 $match = substr($str, $tmp[0], $tmp[1]);
738 and get C<"mu\x{00DF}"> in C<$match> since C<"mu>E<223>C<">
739 is primary equal to C<"m>E<252>C<ss">.
749 Use of the C<normalization> parameter requires
750 the B<Unicode::Normalize> module.
752 If you need not it (e.g. in the case when you need not
753 handle any combining characters),
754 assign C<normalization =E<gt> undef> explicitly.
758 SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
760 http://homepage1.nifty.com/nomenclator/perl/
762 Copyright(C) 2001, SADAHIRO Tomoyuki. Japan. All rights reserved.
764 This program is free software; you can redistribute it and/or
765 modify it under the same terms as Perl itself.
771 =item L<Lingua::KO::Hangul::Util>
773 utility functions for Hangul Syllables
775 =item L<Unicode::Normalize>
777 normalized forms of Unicode text
779 =item Unicode Collation Algorithm - Unicode TR #10
781 http://www.unicode.org/unicode/reports/tr10/