1 package Unicode::Collate;
7 use Lingua::KO::Hangul::Util;
10 our $VERSION = '0.08';
11 our $PACKAGE = __PACKAGE__;
13 our @ISA = qw(Exporter);
15 our %EXPORT_TAGS = ();
19 (our $Path = $INC{'Unicode/Collate.pm'}) =~ s/\.pm$//;
20 our $KeyFile = "allkeys.txt";
22 our %Combin; # combining class from Unicode::Normalize
24 use constant Min2 => 0x20; # minimum weight at level 2
25 use constant Min3 => 0x02; # minimum weight at level 3
26 use constant UNDEFINED => 0xFF80; # special value for undefined CE
34 my $self = bless { @_ }, $class;
38 ! exists $self->{alternate} ? 'shifted' :
39 ! defined $self->{alternate} ? '' : $self->{alternate};
42 $self->{level} ||= ($self->{alternate} =~ /shift/ ? 4 : 3);
45 $self->{normalization} = 'D' if ! exists $self->{normalization};
47 eval "use Unicode::Normalize;" if defined $self->{normalization};
50 ! defined $self->{normalization} ? undef :
51 $self->{normalization} =~ /^(?:NF)?C$/ ? \&NFC :
52 $self->{normalization} =~ /^(?:NF)?D$/ ? \&NFD :
53 $self->{normalization} =~ /^(?:NF)?KC$/ ? \&NFKC :
54 $self->{normalization} =~ /^(?:NF)?KD$/ ? \&NFKD :
55 croak "$PACKAGE unknown normalization form name: $self->{normalization}";
57 *Combin = \%Unicode::Normalize::Combin if $self->{normalize} && ! %Combin;
60 $self->{backwards} ||= [];
61 $self->{backwards} = [ $self->{backwards} ] if ! ref $self->{backwards};
64 $self->{rearrange} ||= []; # maybe not U+0000 (an ASCII)
65 $self->{rearrange} = [ $self->{rearrange} ] if ! ref $self->{rearrange};
68 my $file = defined $self->{table} ? $self->{table} : $KeyFile;
69 open my $fk, "<$Path/$file" or croak "File does not exist at $Path/$file";
74 if(/^\@version\s*(\S*)/){
75 $self->{version} ||= $1;
77 elsif(/^\@alternate\s+(.*)/){
78 $self->{alternate} ||= $1;
80 elsif(/^\@backwards\s+(.*)/){
81 push @{ $self->{backwards} }, $1;
83 elsif(/^\@rearrange\s+(.*)/){
84 push @{ $self->{rearrange} }, _getHexArray($1);
88 $self->parseEntry($_);
92 $self->parseEntry($_) foreach split /\n/, $self->{entry};
95 # keys of $self->{rearrangeHash} are $self->{rearrange}.
96 $self->{rearrangeHash} = {};
97 @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
103 ## get $line, parse it, and write an entry in $self
109 my($name, $ele, @key);
111 return if $line !~ /^\s*[0-9A-Fa-f]/;
114 $name = $1 if $line =~ s/#\s*(.*)//;
115 return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
118 my($e, $k) = split /;/, $line;
119 my @e = _getHexArray($e);
120 $ele = pack('U*', @e);
121 return if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
125 defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/ ||
126 defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/
129 $self->{entries}{$ele} = $self->{ignored}{$ele} = 1;
133 foreach my $arr ($k =~ /\[(\S+)\]/g) {
134 my $var = $arr =~ /\*/;
135 push @key, $self->altCE( $var, _getHexArray($arr) );
137 $self->{entries}{$ele} = \@key;
139 $self->{maxlength}{ord $ele} = scalar @e if @e > 1;
144 ## arrayref CE = altCE(bool variable?, list[num] weights)
152 $self->{alternate} eq 'blanked' ?
153 $var ? [0,0,0] : [ @c[0..2] ] :
154 $self->{alternate} eq 'non-ignorable' ?
156 $self->{alternate} eq 'shifted' ?
157 $var ? [0,0,0,$c[0] ] : [ @c[0..2], $c[0]+$c[1]+$c[2] ? 0xFFFF : 0 ] :
158 $self->{alternate} eq 'shift-trimmed' ?
159 $var ? [0,0,0,$c[0] ] : [ @c[0..2], 0 ] :
164 ## string hex_sortkey = splitCE(string arg)
169 my $key = $self->getSortKey(@_);
170 my $view = join " ", map sprintf("%04X", $_), unpack 'n*', $key;
171 $view =~ s/ ?0000 ?/|/g;
177 ## list[strings] elements = splitCE(string arg)
182 my $code = $self->{preprocess};
183 my $norm = $self->{normalize};
184 my $ent = $self->{entries};
185 my $max = $self->{maxlength};
186 my $rear = $self->{rearrangeHash};
188 my $str = ref $code ? &$code(shift) : shift;
189 $str = &$norm($str) if ref $norm;
192 @src = unpack('U*', $str);
195 for(my $i = 0; $i < @src; $i++)
197 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i])
198 if $rear->{ $src[$i] };
202 for(my $i = 0; $i < @src; $i++)
208 next if $u < 0 || 0x10FFFF < $u # out of range
209 || 0xD800 < $u && $u < 0xDFFF; # unpaired surrogates
210 my $four = $u & 0xFFFF;
211 next if $four == 0xFFFE || $four == 0xFFFF;
213 if($max->{$u}) # contract
215 for(my $j = $max->{$u}; $j >= 1; $j--)
217 next unless $i+$j-1 < @src;
218 $ch = pack 'U*', @src[$i .. $i+$j-1];
219 $i += $j-1, last if $ent->{$ch};
222 else { $ch = pack('U', $u) }
224 if(%Combin && defined $ch) # with Combining Char
226 for(my $j = $i+1; $j < @src && $Combin{ $src[$j] }; $j++)
228 my $comb = pack 'U', $src[$j];
229 next if ! $ent->{ $ch.$comb };
237 wantarray ? @buf : \@buf;
242 ## list[arrayrefs] weight = getWt(string element)
248 my $ent = $self->{entries};
249 my $ign = $self->{ignored};
250 my $cjk = $self->{overrideCJK};
251 my $hang = $self->{overrideHangul};
252 return if !defined $ch || $ign->{$ch}; # ignored
253 return @{ $ent->{$ch} } if $ent->{$ch};
254 my $u = unpack('U', $ch);
259 : map(@{ $ent->{pack('U', $_)} }, decomposeHangul($u))
261 ? $cjk ? &$cjk($u) : map($self->altCE(0,@$_), _CJK($u))
262 : map($self->altCE(0,@$_), _derivCE($u));
266 ## int = index(string, substring)
271 my $lev = $self->{level};
272 my $str = $self->splitCE(shift);
273 my $sub = $self->splitCE(shift);
275 return wantarray ? (0,0) : 0 if ! @$sub;
276 return wantarray ? () : -1 if ! @$str;
278 my @subWt = grep _ignorableAtLevel($_,$lev),
279 map $self->getWt($_), @$sub;
284 my @tmp = grep _ignorableAtLevel($_,$lev), $self->getWt($e);
286 push @strPt, ($count) x @tmp;
288 while(@strWt >= @subWt){
289 if(_eqArray(\@strWt, \@subWt, $lev)){
291 return wantarray ? ($pos, $count-$pos) : $pos;
297 return wantarray ? () : -1;
301 ## bool _eqArray(arrayref, arrayref, level)
305 my $a = shift; # length $a >= length $b;
308 for my $v (0..$lev-1){
309 for my $c (0..@$b-1){
310 return if $a->[$c][$v] != $b->[$c][$v];
318 ## bool _ignorableAtLevel(CE, level)
320 sub _ignorableAtLevel($$)
323 return if ! defined $ce;
325 ! grep { ! $ce->[$_] } 0..$lv-1;
330 ## string sortkey = getSortKey(string arg)
335 my $lev = $self->{level};
336 my $rCE = $self->splitCE(shift); # get an arrayref
339 my @buf = grep defined(), map $self->getWt($_), @$rCE;
342 my @ret = ([],[],[],[]);
343 foreach my $v (0..$lev-1){
344 foreach my $b (@buf){
345 push @{ $ret[$v] }, $b->[$v] if $b->[$v];
348 foreach (@{ $self->{backwards} }){
350 @{ $ret[$v] } = reverse @{ $ret[$v] };
353 # modification of tertiary weights
354 if($self->{upper_before_lower}){
355 foreach (@{ $ret[2] }){
356 if (0x8 <= $_ && $_ <= 0xC){ $_ -= 6 } # lower
357 elsif(0x2 <= $_ && $_ <= 0x6){ $_ += 6 } # upper
358 elsif($_ == 0x1C) { $_ += 1 } # square upper
359 elsif($_ == 0x1D) { $_ -= 1 } # square lower
362 if($self->{katakana_before_hiragana}){
363 foreach (@{ $ret[2] }){
364 if (0x0F <= $_ && $_ <= 0x13){ $_ -= 2 } # katakana
365 elsif(0x0D <= $_ && $_ <= 0x0E){ $_ += 5 } # hiragana
368 join "\0\0", map pack('n*', @$_), @ret;
373 ## int compare = cmp(string a, string b)
380 $obj->getSortKey($a) cmp $obj->getSortKey($b);
384 ## list[strings] sorted = sort(list[strings] arg)
391 sort{ $a->[0] cmp $b->[0] }
392 map [ $obj->getSortKey($_), $_ ], @_;
396 ## list[arrayrefs] CE = _derivCE(int codepoint)
401 my $a = UNDEFINED + ($code >> 15); # ok
402 my $b = ($code & 0x7FFF) | 0x8000; # ok
403 # my $a = 0xFFC2 + ($code >> 15); # ng
404 # my $b = $code & 0x7FFF | 0x1000; # ng
405 $b ? ([$a,2,1,$code],[$b,0,0,$code]) : [$a,2,1,$code];
409 ## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
414 map hex(), $str =~ /([0-9a-fA-F]+)/g;
418 ## bool is_a_CJK_Unified_Ideograph = _isCJK(int codepoint)
423 return 0x3400 <= $u && $u <= 0x4DB5
424 || 0x4E00 <= $u && $u <= 0x9FA5
425 # || 0x20000 <= $u && $u <= 0x2A6D6;
429 ## list[arrayref] CE = _CJK(int codepoint_of_CJK)
434 $u > 0xFFFF ? _derivCE($u) : [$u,0x20,0x02,$u];
438 ## bool is_a_Hangul_Syllable = _isHangul(int codepoint)
443 return 0xAC00 <= $code && $code <= 0xD7A3;
451 Unicode::Collate - use UCA (Unicode Collation Algorithm)
455 use Unicode::Collate;
458 $UCA = Unicode::Collate->new(%tailoring);
461 @sorted = $UCA->sort(@not_sorted);
464 $result = $UCA->cmp($a, $b); # returns 1, 0, or -1.
468 =head2 Constructor and Tailoring
470 The C<new> method returns a collator object.
472 $UCA = Unicode::Collate->new(
473 alternate => $alternate,
474 backwards => $levelNumber, # or \@levelNumbers
476 normalization => $normalization_form,
477 ignoreName => qr/$ignoreName/,
478 ignoreChar => qr/$ignoreChar/,
479 katakana_before_hiragana => $bool,
480 level => $collationLevel,
481 overrideCJK => \&overrideCJK,
482 overrideHangul => \&overrideHangul,
483 preprocess => \&preprocess,
484 rearrange => \@charList,
486 undefName => qr/$undefName/,
487 undefChar => qr/$undefChar/,
488 upper_before_lower => $bool,
490 # if %tailoring is false (empty),
491 # $UCA should do the default collation.
497 -- see 3.2.2 Alternate Weighting, UTR #10.
499 alternate => 'shifted', 'blanked', 'non-ignorable', or 'shift-trimmed'.
501 By default (if specification is omitted), 'shifted' is adopted.
505 -- see 3.1.2 French Accents, UTR #10.
507 backwards => $levelNumber or \@levelNumbers
509 Weights in reverse order; ex. level 2 (diacritic ordering) in French.
510 If omitted, forwards at all the levels.
514 -- see 3.1 Linguistic Features; 3.2.1 File Format, UTR #10.
516 Overrides a default order or adds a new element
518 entry => <<'ENTRIES', # use the UCA file format
519 00E6 ; [.0861.0020.0002.00E6] [.08B1.0020.0002.00E6] # ligature <ae> as <a e>
520 0063 0068 ; [.0893.0020.0002.0063] # "ch" in traditional Spanish
521 0043 0068 ; [.0893.0020.0008.0043] # "Ch" in traditional Spanish
528 -- see Completely Ignorable, 3.2.2 Alternate Weighting, UTR #10.
530 Ignores the entry in the table.
531 If an ignored collation element appears in the string to be collated,
532 it is ignored as if the element had been deleted from there.
534 E.g. when 'a' and 'e' are ignored,
535 'element' is equal to 'lament' (or 'lmnt').
539 -- see 4.3 Form a sort key for each string, UTR #10.
541 Set the maximum level.
542 Any higher levels than the specified one are ignored.
544 Level 1: alphabetic ordering
545 Level 2: diacritic ordering
546 Level 3: case ordering
547 Level 4: tie-breaking (e.g. in the case when alternate is 'shifted')
553 -- see 4.1 Normalize each input string, UTR #10.
555 If specified, strings are normalized before preparation sort keys
556 (the normalization is executed after preprocess).
558 As a form name, one of the following names must be used.
560 'C' or 'NFC' for Normalization Form C
561 'D' or 'NFD' for Normalization Form D
562 'KC' or 'NFKC' for Normalization Form KC
563 'KD' or 'NFKD' for Normalization Form KD
565 If omitted, the string is put into Normalization Form D.
567 If undefined explicitly (as C<normalization =E<gt> undef>),
568 any normalization is not carried out (this may make tailoring easier
569 if any normalization is not desired).
577 -- see 7.1 Derived Collation Elements, UTR #10.
579 By default, mapping of CJK Unified Ideographs
580 uses the Unicode codepoint order
581 and Hangul Syllables are decomposed into Hangul Jamo.
583 The mapping of CJK Unified Ideographs
584 or Hangul Syllables may be overrided.
586 ex. CJK Unified Ideographs in the JIS codepoint order.
589 my $u = shift; # get unicode codepoint
590 my $b = pack('n', $u); # to UTF-16BE
591 my $s = your_unicode_to_sjis_converter($b); # convert
592 my $n = unpack('n', $s); # convert sjis to short
593 [ $n, 1, 1 ]; # return collation element
596 If you want to override the mapping of Hangul Syllables,
597 the Normalization Forms D and KD are not appropriate
598 (they will be decomposed before overriding).
602 -- see 5.1 Preprocessing, UTR #10.
604 If specified, the coderef is used to preprocess
605 before the formation of sort keys.
607 ex. dropping English articles, such as "a" or "the".
608 Then, "the pen" is before "a pencil".
612 $str =~ s/\b(?:an?|the)\s+//g;
618 -- see 3.1.3 Rearrangement, UTR #10.
620 Characters that are not coded in logical order and to be rearranged.
623 rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
627 -- see 3.2 Default Unicode Collation Element Table, UTR #10.
629 You can use another element table if desired.
630 The table file must be in your C<lib/Unicode/Collate> directory.
632 By default, the file C<lib/Unicode/Collate/allkeys.txt> is used.
638 -- see 6.3.4 Reducing the Repertoire, UTR #10.
640 Undefines the collation element as if it were unassigned in the table.
641 This reduces the size of the table.
642 If an unassigned character appears in the string to be collated,
643 the sort key is made from its codepoint
644 as a single-character collation element,
645 as it is greater than any other assigned collation elements
646 (in the codepoint order among the unassigned characters).
647 But, it'd be better to ignore characters
648 unfamiliar to you and maybe never used.
650 =item katakana_before_hiragana
652 =item upper_before_lower
654 -- see 6.6 Case Comparisons; 7.3.1 Tertiary Weight Table, UTR #10.
656 By default, lowercase is before uppercase
657 and hiragana is before katakana.
659 If the parameter is true, this is reversed.
667 =item C<@sorted = $UCA-E<gt>sort(@not_sorted)>
669 Sorts a list of strings.
671 =item C<$result = $UCA-E<gt>cmp($a, $b)>
673 Returns 1 (when C<$a> is greater than C<$b>)
674 or 0 (when C<$a> is equal to C<$b>)
675 or -1 (when C<$a> is lesser than C<$b>).
677 =item C<$sortKey = $UCA-E<gt>getSortKey($string)>
679 -- see 4.3 Form a sort key for each string, UTR #10.
683 You compare the sort keys using a binary comparison
684 and get the result of the comparison of the strings using UCA.
686 $UCA->getSortKey($a) cmp $UCA->getSortKey($b)
692 =item C<$position = $UCA-E<gt>index($string, $substring)>
694 =item C<($position, $length) = $UCA-E<gt>index($string, $substring)>
696 -- see 6.8 Searching, UTR #10.
698 If C<$substring> matches a part of C<$string>, returns
699 the position of the first occurrence of the matching part in scalar context;
700 in list context, returns a two-element list of
701 the position and the length of the matching part.
703 B<Notice> that the length of the matching part may differ from
704 the length of C<$substring>.
706 B<Note> that the position and the length are counted on the string
707 after the process of preprocess, normalization, and rearrangement.
708 Therefore, in case the specified string is not binary equal to
709 the preprocessed/normalized/rearranged string, the position and the length
710 may differ form those on the specified string. But it is guaranteed
711 that, if matched, it returns a non-negative value as C<$position>.
713 If C<$substring> does not match any part of C<$string>,
714 returns C<-1> in scalar context and
715 an empty list in list context.
719 my $UCA = Unicode::Collate->new( normalization => undef, level => 1 );
720 my $str = "Ich mu\x{00DF} studieren.";
721 my $sub = "m\x{00FC}ss";
723 if(my @tmp = $UCA->index($str, $sub)){
724 $match = substr($str, $tmp[0], $tmp[1]);
727 and get C<"mu\x{00DF}"> in C<$match> since C<"mu>E<223>C<">
728 is primary equal to C<"m>E<252>C<ss">.
738 Use of the C<normalization> parameter requires
739 the B<Unicode::Normalize> module.
741 If you need not it (e.g. in the case when you need not
742 handle any combining characters),
743 assign C<normalization =E<gt> undef> explicitly.
747 SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
749 http://homepage1.nifty.com/nomenclator/perl/
751 Copyright(C) 2001, SADAHIRO Tomoyuki. Japan. All rights reserved.
753 This program is free software; you can redistribute it and/or
754 modify it under the same terms as Perl itself.
760 =item L<Lingua::KO::Hangul::Util>
762 utility functions for Hangul Syllables
764 =item L<Unicode::Normalize>
766 normalized forms of Unicode text
768 =item Unicode Collation Algorithm - Unicode TR #10
770 http://www.unicode.org/unicode/reports/tr10/