1 package Unicode::Collate;
5 die "Unicode::Collate not ported to EBCDIC\n";
16 our $VERSION = '0.10';
17 our $PACKAGE = __PACKAGE__;
19 our @ISA = qw(Exporter);
21 our %EXPORT_TAGS = ();
25 (our $Path = $INC{'Unicode/Collate.pm'}) =~ s/\.pm$//;
26 our $KeyFile = "allkeys.txt";
28 our $getCombinClass; # coderef for combining class from Unicode::Normalize
30 use constant Min2 => 0x20; # minimum weight at level 2
31 use constant Min3 => 0x02; # minimum weight at level 3
32 use constant UNDEFINED => 0xFF80; # special value for undefined CE
40 my $self = bless { @_ }, $class;
44 ! exists $self->{alternate} ? 'shifted' :
45 ! defined $self->{alternate} ? '' : $self->{alternate};
48 $self->{level} ||= ($self->{alternate} =~ /^shift/ ? 4 : 3);
51 $self->{normalization} = 'D' if ! exists $self->{normalization};
53 if(defined $self->{normalization}){
54 eval "use Unicode::Normalize;";
55 croak "you'd install Unicode::Normalize for normalization forms: $@"
57 $getCombinClass = \&Unicode::Normalize::getCombinClass
62 ! defined $self->{normalization} ? undef :
63 $self->{normalization} =~ /^(?:NF)?C$/ ? \&NFC :
64 $self->{normalization} =~ /^(?:NF)?D$/ ? \&NFD :
65 $self->{normalization} =~ /^(?:NF)?KC$/ ? \&NFKC :
66 $self->{normalization} =~ /^(?:NF)?KD$/ ? \&NFKD :
67 croak "$PACKAGE unknown normalization form name: $self->{normalization}";
70 $self->{backwards} ||= [];
71 $self->{backwards} = [ $self->{backwards} ] if ! ref $self->{backwards};
74 $self->{rearrange} ||= []; # maybe not U+0000 (an ASCII)
75 $self->{rearrange} = [ $self->{rearrange} ] if ! ref $self->{rearrange};
78 # if undef is passed explicitly, no file is read.
79 $self->{table} = $KeyFile unless exists $self->{table};
80 $self->read_table if defined $self->{table};
83 $self->parseEntry($_) foreach split /\n/, $self->{entry};
86 # keys of $self->{rearrangeHash} are $self->{rearrange}.
87 $self->{rearrangeHash} = {};
88 @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
96 my $file = $self->{table} ne '' ? $self->{table} : $KeyFile;
97 open my $fk, "<$Path/$file" or croak "File does not exist at $Path/$file";
102 if(/^\@version\s*(\S*)/){
103 $self->{version} ||= $1;
105 elsif(/^\@alternate\s+(.*)/){
106 $self->{alternate} ||= $1;
108 elsif(/^\@backwards\s+(.*)/){
109 push @{ $self->{backwards} }, $1;
111 elsif(/^\@rearrange\s+(.*)/){
112 push @{ $self->{rearrange} }, _getHexArray($1);
116 $self->parseEntry($_);
123 ## get $line, parse it, and write an entry in $self
129 my($name, $ele, @key);
131 return if $line !~ /^\s*[0-9A-Fa-f]/;
134 $name = $1 if $line =~ s/#\s*(.*)//;
135 return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
138 my($e, $k) = split /;/, $line;
139 my @e = _getHexArray($e);
140 { no warnings 'utf8'; $ele = pack('U*', @e); }
141 return if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
144 { no warnings 'utf8';
146 defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/ ||
147 defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/
150 $self->{entries}{$ele} = $self->{ignored}{$ele} = 1;
154 foreach my $arr ($k =~ /\[(\S+)\]/g) {
155 my $var = $arr =~ /\*/;
156 push @key, $self->altCE( $var, _getHexArray($arr) );
158 $self->{entries}{$ele} = \@key;
161 $self->{maxlength}{ord $ele} = scalar @e if @e > 1;
166 ## arrayref CE = altCE(bool variable?, list[num] weights)
174 $self->{alternate} eq 'blanked' ?
175 $var ? [0,0,0] : [ @c[0..2] ] :
176 $self->{alternate} eq 'non-ignorable' ?
178 $self->{alternate} eq 'shifted' ?
179 $var ? [0,0,0,$c[0] ] : [ @c[0..2], $c[0]+$c[1]+$c[2] ? 0xFFFF : 0 ] :
180 $self->{alternate} eq 'shift-trimmed' ?
181 $var ? [0,0,0,$c[0] ] : [ @c[0..2], 0 ] :
186 ## string hex_sortkey = splitCE(string arg)
191 my $key = $self->getSortKey(@_);
192 my $view = join " ", map sprintf("%04X", $_), unpack 'n*', $key;
193 $view =~ s/ ?0000 ?/|/g;
199 ## list[strings] elements = splitCE(string arg)
204 my $code = $self->{preprocess};
205 my $norm = $self->{UNF};
206 my $ent = $self->{entries};
207 my $max = $self->{maxlength};
208 my $rear = $self->{rearrangeHash};
210 my $str = ref $code ? &$code(shift) : shift;
211 $str = &$norm($str) if ref $norm;
214 @src = unpack('U*', $str);
217 for(my $i = 0; $i < @src; $i++)
219 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i])
220 if $rear->{ $src[$i] };
224 for(my $i = 0; $i < @src; $i++)
230 next unless defined $u;
231 next if $u < 0 || 0x10FFFF < $u # out of range
232 || 0xD800 < $u && $u < 0xDFFF; # unpaired surrogates
233 my $four = $u & 0xFFFF;
234 next if $four == 0xFFFE || $four == 0xFFFF;
236 if($max->{$u}) # contract
238 for(my $j = $max->{$u}; $j >= 1; $j--)
240 next unless $i+$j-1 < @src;
241 $ch = pack 'U*', @src[$i .. $i+$j-1];
242 $i += $j-1, last if $ent->{$ch};
245 else { $ch = pack('U', $u) }
247 # with Combining Char (UTS#10, 4.2.1), here requires Unicode::Normalize.
248 if($getCombinClass && defined $ch)
250 for(my $j = $i+1; $j < @src; $j++)
252 next unless defined $src[$j];
253 last unless $getCombinClass->( $src[$j] );
254 my $comb = pack 'U', $src[$j];
255 next if ! $ent->{ $ch.$comb };
262 wantarray ? @buf : \@buf;
267 ## list[arrayrefs] weight = getWt(string element)
273 my $ent = $self->{entries};
274 my $ign = $self->{ignored};
275 my $cjk = $self->{overrideCJK};
276 my $hang = $self->{overrideHangul};
277 return if !defined $ch || $ign->{$ch}; # ignored
278 return @{ $ent->{$ch} } if $ent->{$ch};
279 my $u = unpack('U', $ch);
284 : map(@{ $ent->{pack('U', $_)} }, _decompHangul($u))
286 ? $cjk ? &$cjk($u) : map($self->altCE(0,@$_), _CJK($u))
287 : map($self->altCE(0,@$_), _derivCE($u));
291 ## int = index(string, substring)
296 my $lev = $self->{level};
297 my $str = $self->splitCE(shift);
298 my $sub = $self->splitCE(shift);
300 return wantarray ? (0,0) : 0 if ! @$sub;
301 return wantarray ? () : -1 if ! @$str;
303 my @subWt = grep _ignorableAtLevel($_,$lev),
304 map $self->getWt($_), @$sub;
309 my @tmp = grep _ignorableAtLevel($_,$lev), $self->getWt($e);
311 push @strPt, ($count) x @tmp;
313 while(@strWt >= @subWt){
314 if(_eqArray(\@strWt, \@subWt, $lev)){
316 return wantarray ? ($pos, $count-$pos) : $pos;
322 return wantarray ? () : -1;
326 ## bool _eqArray(arrayref, arrayref, level)
330 my $a = shift; # length $a >= length $b;
333 for my $v (0..$lev-1){
334 for my $c (0..@$b-1){
335 return if $a->[$c][$v] != $b->[$c][$v];
343 ## bool _ignorableAtLevel(CE, level)
345 sub _ignorableAtLevel($$)
348 return if ! defined $ce;
350 ! grep { ! $ce->[$_] } 0..$lv-1;
355 ## string sortkey = getSortKey(string arg)
360 my $lev = $self->{level};
361 my $rCE = $self->splitCE(shift); # get an arrayref
364 my @buf = grep defined(), map $self->getWt($_), @$rCE;
367 my @ret = ([],[],[],[]);
368 foreach my $v (0..$lev-1){
369 foreach my $b (@buf){
370 push @{ $ret[$v] }, $b->[$v] if $b->[$v];
373 foreach (@{ $self->{backwards} }){
375 @{ $ret[$v] } = reverse @{ $ret[$v] };
378 # modification of tertiary weights
379 if($self->{upper_before_lower}){
380 foreach (@{ $ret[2] }){
381 if (0x8 <= $_ && $_ <= 0xC){ $_ -= 6 } # lower
382 elsif(0x2 <= $_ && $_ <= 0x6){ $_ += 6 } # upper
383 elsif($_ == 0x1C) { $_ += 1 } # square upper
384 elsif($_ == 0x1D) { $_ -= 1 } # square lower
387 if($self->{katakana_before_hiragana}){
388 foreach (@{ $ret[2] }){
389 if (0x0F <= $_ && $_ <= 0x13){ $_ -= 2 } # katakana
390 elsif(0x0D <= $_ && $_ <= 0x0E){ $_ += 5 } # hiragana
393 join "\0\0", map pack('n*', @$_), @ret;
398 ## int compare = cmp(string a, string b)
400 sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
401 sub eq { $_[0]->getSortKey($_[1]) eq $_[0]->getSortKey($_[2]) }
402 sub ne { $_[0]->getSortKey($_[1]) ne $_[0]->getSortKey($_[2]) }
403 sub lt { $_[0]->getSortKey($_[1]) lt $_[0]->getSortKey($_[2]) }
404 sub le { $_[0]->getSortKey($_[1]) le $_[0]->getSortKey($_[2]) }
405 sub gt { $_[0]->getSortKey($_[1]) gt $_[0]->getSortKey($_[2]) }
406 sub ge { $_[0]->getSortKey($_[1]) ge $_[0]->getSortKey($_[2]) }
409 ## list[strings] sorted = sort(list[strings] arg)
416 sort{ $a->[0] cmp $b->[0] }
417 map [ $obj->getSortKey($_), $_ ], @_;
421 ## list[arrayrefs] CE = _derivCE(int codepoint)
426 my $a = UNDEFINED + ($code >> 15); # ok
427 my $b = ($code & 0x7FFF) | 0x8000; # ok
428 # my $a = 0xFFC2 + ($code >> 15); # ng
429 # my $b = $code & 0x7FFF | 0x1000; # ng
430 $b ? ([$a,2,1,$code],[$b,0,0,$code]) : [$a,2,1,$code];
434 ## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
436 sub _getHexArray { map hex(), $_[0] =~ /([0-9a-fA-F]+)/g }
439 ## bool is_a_CJK_Unified_Ideograph = _isCJK(int codepoint)
444 return 0x3400 <= $u && $u <= 0x4DB5
445 || 0x4E00 <= $u && $u <= 0x9FA5
446 # || 0x20000 <= $u && $u <= 0x2A6D6;
450 ## list[arrayref] CE = _CJK(int codepoint_of_CJK)
452 sub _CJK { $_[0] > 0xFFFF ? _derivCE($_[0]) : [$_[0], 0x20, 0x02, $_[0]] }
455 ## bool is_a_Hangul_Syllable = _isHangul(int codepoint)
457 sub _isHangul { return 0xAC00 <= $_[0] && $_[0] <= 0xD7A3 }
461 # $code must be in Hangul syllable. check it before you enter here.
462 my $SIndex = $code - 0xAC00;
463 my $LIndex = int( $SIndex / 588);
464 my $VIndex = int(($SIndex % 588) / 28);
465 my $TIndex = $SIndex % 28;
469 $TIndex ? (0x11A7 + $TIndex) : (),
478 Unicode::Collate - use UCA (Unicode Collation Algorithm)
482 use Unicode::Collate;
485 $Collator = Unicode::Collate->new(%tailoring);
488 @sorted = $Collator->sort(@not_sorted);
491 $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
495 =head2 Constructor and Tailoring
497 The C<new> method returns a collator object.
499 $Collator = Unicode::Collate->new(
500 alternate => $alternate,
501 backwards => $levelNumber, # or \@levelNumbers
503 normalization => $normalization_form,
504 ignoreName => qr/$ignoreName/,
505 ignoreChar => qr/$ignoreChar/,
506 katakana_before_hiragana => $bool,
507 level => $collationLevel,
508 overrideCJK => \&overrideCJK,
509 overrideHangul => \&overrideHangul,
510 preprocess => \&preprocess,
511 rearrange => \@charList,
513 undefName => qr/$undefName/,
514 undefChar => qr/$undefChar/,
515 upper_before_lower => $bool,
517 # if %tailoring is false (empty),
518 # $Collator should do the default collation.
524 -- see 3.2.2 Alternate Weighting, UTR #10.
526 alternate => 'shifted', 'blanked', 'non-ignorable', or 'shift-trimmed'.
528 By default (if specification is omitted), 'shifted' is adopted.
532 -- see 3.1.2 French Accents, UTR #10.
534 backwards => $levelNumber or \@levelNumbers
536 Weights in reverse order; ex. level 2 (diacritic ordering) in French.
537 If omitted, forwards at all the levels.
541 -- see 3.1 Linguistic Features; 3.2.1 File Format, UTR #10.
543 Overrides a default order or adds a new collation element
545 entry => <<'ENTRIES', # use the UCA file format
546 00E6 ; [.0861.0020.0002.00E6] [.08B1.0020.0002.00E6] # ligature <ae> as <a e>
547 0063 0068 ; [.0893.0020.0002.0063] # "ch" in traditional Spanish
548 0043 0068 ; [.0893.0020.0008.0043] # "Ch" in traditional Spanish
555 -- see Completely Ignorable, 3.2.2 Alternate Weighting, UTR #10.
557 Ignores the entry in the table.
558 If an ignored collation element appears in the string to be collated,
559 it is ignored as if the element had been deleted from there.
561 E.g. when 'a' and 'e' are ignored,
562 'element' is equal to 'lament' (or 'lmnt').
566 -- see 4.3 Form a sort key for each string, UTR #10.
568 Set the maximum level.
569 Any higher levels than the specified one are ignored.
571 Level 1: alphabetic ordering
572 Level 2: diacritic ordering
573 Level 3: case ordering
574 Level 4: tie-breaking (e.g. in the case when alternate is 'shifted')
580 -- see 4.1 Normalize each input string, UTR #10.
582 If specified, strings are normalized before preparation of sort keys
583 (the normalization is executed after preprocess).
585 As a form name, one of the following names must be used.
587 'C' or 'NFC' for Normalization Form C
588 'D' or 'NFD' for Normalization Form D
589 'KC' or 'NFKC' for Normalization Form KC
590 'KD' or 'NFKD' for Normalization Form KD
592 If omitted, the string is put into Normalization Form D.
594 If undefined explicitly (as C<normalization =E<gt> undef>),
595 any normalization is not carried out (this may make tailoring easier
596 if any normalization is not desired).
604 -- see 7.1 Derived Collation Elements, UTR #10.
606 By default, mapping of CJK Unified Ideographs
607 uses the Unicode codepoint order
608 and Hangul Syllables are decomposed into Hangul Jamo.
610 The mapping of CJK Unified Ideographs
611 or Hangul Syllables may be overrided.
613 ex. CJK Unified Ideographs in the JIS codepoint order.
616 my $u = shift; # get unicode codepoint
617 my $b = pack('n', $u); # to UTF-16BE
618 my $s = your_unicode_to_sjis_converter($b); # convert
619 my $n = unpack('n', $s); # convert sjis to short
620 [ $n, 1, 1 ]; # return collation element
623 If you want to override the mapping of Hangul Syllables,
624 the Normalization Forms D and KD are not appropriate
625 (they will be decomposed before overriding).
629 -- see 5.1 Preprocessing, UTR #10.
631 If specified, the coderef is used to preprocess
632 before the formation of sort keys.
634 ex. dropping English articles, such as "a" or "the".
635 Then, "the pen" is before "a pencil".
639 $str =~ s/\b(?:an?|the)\s+//g;
645 -- see 3.1.3 Rearrangement, UTR #10.
647 Characters that are not coded in logical order and to be rearranged.
650 rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
654 -- see 3.2 Default Unicode Collation Element Table, UTR #10.
656 You can use another element table if desired.
657 The table file must be in your C<lib/Unicode/Collate> directory.
659 By default, the file C<lib/Unicode/Collate/allkeys.txt> is used.
661 If undefined explicitly (as C<table =E<gt> undef>),
662 no file is read (you'd define collation elements using L<entry>).
668 -- see 6.3.4 Reducing the Repertoire, UTR #10.
670 Undefines the collation element as if it were unassigned in the table.
671 This reduces the size of the table.
672 If an unassigned character appears in the string to be collated,
673 the sort key is made from its codepoint
674 as a single-character collation element,
675 as it is greater than any other assigned collation elements
676 (in the codepoint order among the unassigned characters).
677 But, it'd be better to ignore characters
678 unfamiliar to you and maybe never used.
680 =item katakana_before_hiragana
682 =item upper_before_lower
684 -- see 6.6 Case Comparisons; 7.3.1 Tertiary Weight Table, UTR #10.
686 By default, lowercase is before uppercase
687 and hiragana is before katakana.
689 If the parameter is true, this is reversed.
697 =item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
699 Sorts a list of strings.
701 =item C<$result = $Collator-E<gt>cmp($a, $b)>
703 Returns 1 (when C<$a> is greater than C<$b>)
704 or 0 (when C<$a> is equal to C<$b>)
705 or -1 (when C<$a> is lesser than C<$b>).
707 =item C<$result = $Collator-E<gt>eq($a, $b)>
709 =item C<$result = $Collator-E<gt>ne($a, $b)>
711 =item C<$result = $Collator-E<gt>lt($a, $b)>
713 =item C<$result = $Collator-E<gt>le($a, $b)>
715 =item C<$result = $Collator-E<gt>gt($a, $b)>
717 =item C<$result = $Collator-E<gt>ge($a, $b)>
719 They works like the same name operators as theirs.
721 eq : whether $a is equal to $b.
722 ne : whether $a is not equal to $b.
723 lt : whether $a is lesser than $b.
724 le : whether $a is lesser than $b or equal to $b.
725 gt : whether $a is greater than $b.
726 ge : whether $a is greater than $b or equal to $b.
728 =item C<$sortKey = $Collator-E<gt>getSortKey($string)>
730 -- see 4.3 Form a sort key for each string, UTR #10.
734 You compare the sort keys using a binary comparison
735 and get the result of the comparison of the strings using UCA.
737 $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
741 $Collator->cmp($a, $b)
743 =item C<$position = $Collator-E<gt>index($string, $substring)>
745 =item C<($position, $length) = $Collator-E<gt>index($string, $substring)>
747 -- see 6.8 Searching, UTR #10.
749 If C<$substring> matches a part of C<$string>, returns
750 the position of the first occurrence of the matching part in scalar context;
751 in list context, returns a two-element list of
752 the position and the length of the matching part.
754 B<Notice> that the length of the matching part may differ from
755 the length of C<$substring>.
757 B<Note> that the position and the length are counted on the string
758 after the process of preprocess, normalization, and rearrangement.
759 Therefore, in case the specified string is not binary equal to
760 the preprocessed/normalized/rearranged string, the position and the length
761 may differ form those on the specified string. But it is guaranteed
762 that, if matched, it returns a non-negative value as C<$position>.
764 If C<$substring> does not match any part of C<$string>,
765 returns C<-1> in scalar context and
766 an empty list in list context.
770 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
771 my $str = "Ich mu\x{00DF} studieren.";
772 my $sub = "m\x{00FC}ss";
774 if(my($pos,$len) = $Collator->index($str, $sub)){
775 $match = substr($str, $pos, $len);
778 and get C<"mu\x{00DF}"> in C<$match> since C<"mu>E<223>C<">
779 is primary equal to C<"m>E<252>C<ss">.
789 Unicode::Collate has not been ported to EBCDIC. The code mostly would
790 work just fine but a decision needs to be made: how the module should
791 work in EBCDIC? Should the low 256 characters be understood as
792 Unicode or as EBCDIC code points? Should one be chosen or should
793 there be a way to do either? Or should such translation be left
794 outside the module for the user to do, for example by using
799 Use of the C<normalization> parameter requires
800 the B<Unicode::Normalize> module.
802 If you need not it (say, in the case when you need not
803 handle any combining characters),
804 assign C<normalization =E<gt> undef> explicitly.
806 -- see 6.5 Avoiding Normalization, UTR #10.
810 SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
812 http://homepage1.nifty.com/nomenclator/perl/
814 Copyright(C) 2001, SADAHIRO Tomoyuki. Japan. All rights reserved.
816 This program is free software; you can redistribute it and/or
817 modify it under the same terms as Perl itself.
823 =item Unicode Collation Algorithm - Unicode TR #10
825 http://www.unicode.org/unicode/reports/tr10/
827 =item L<Unicode::Normalize>
829 normalized forms of Unicode text