1 package Unicode::Collate;
5 die "Unicode::Collate not ported to EBCDIC\n";
16 our $VERSION = '0.10';
17 our $PACKAGE = __PACKAGE__;
19 our @ISA = qw(Exporter);
21 our %EXPORT_TAGS = ();
25 (our $Path = $INC{'Unicode/Collate.pm'}) =~ s/\.pm$//;
26 our $KeyFile = "allkeys.txt";
28 our $getCombinClass; # coderef for combining class from Unicode::Normalize
30 use constant Min2 => 0x20; # minimum weight at level 2
31 use constant Min3 => 0x02; # minimum weight at level 3
32 use constant UNDEFINED => 0xFF80; # special value for undefined CE
40 my $self = bless { @_ }, $class;
44 ! exists $self->{alternate} ? 'shifted' :
45 ! defined $self->{alternate} ? '' : $self->{alternate};
48 $self->{level} ||= ($self->{alternate} =~ /^shift/ ? 4 : 3);
51 $self->{normalization} = 'D' if ! exists $self->{normalization};
53 if(defined $self->{normalization}){
54 eval "use Unicode::Normalize;";
55 croak "you'd install Unicode::Normalize for normalization forms: $@"
57 $getCombinClass = \&Unicode::Normalize::getCombinClass
62 ! defined $self->{normalization} ? undef :
63 $self->{normalization} =~ /^(?:NF)?C$/ ? \&NFC :
64 $self->{normalization} =~ /^(?:NF)?D$/ ? \&NFD :
65 $self->{normalization} =~ /^(?:NF)?KC$/ ? \&NFKC :
66 $self->{normalization} =~ /^(?:NF)?KD$/ ? \&NFKD :
67 croak "$PACKAGE unknown normalization form name: $self->{normalization}";
70 $self->{backwards} ||= [];
71 $self->{backwards} = [ $self->{backwards} ] if ! ref $self->{backwards};
74 $self->{rearrange} ||= []; # maybe not U+0000 (an ASCII)
75 $self->{rearrange} = [ $self->{rearrange} ] if ! ref $self->{rearrange};
78 # if undef is passed explicitly, no file is read.
79 $self->{table} = $KeyFile unless exists $self->{table};
80 $self->read_table if defined $self->{table};
83 $self->parseEntry($_) foreach split /\n/, $self->{entry};
86 # keys of $self->{rearrangeHash} are $self->{rearrange}.
87 $self->{rearrangeHash} = {};
88 @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
96 my $file = $self->{table} ne '' ? $self->{table} : $KeyFile;
97 open my $fk, "<$Path/$file" or croak "File does not exist at $Path/$file";
102 if(/^\@version\s*(\S*)/){
103 $self->{version} ||= $1;
105 elsif(/^\@alternate\s+(.*)/){
106 $self->{alternate} ||= $1;
108 elsif(/^\@backwards\s+(.*)/){
109 push @{ $self->{backwards} }, $1;
111 elsif(/^\@rearrange\s+(.*)/){
112 push @{ $self->{rearrange} }, _getHexArray($1);
116 $self->parseEntry($_);
123 ## get $line, parse it, and write an entry in $self
129 my($name, $ele, @key);
131 return if $line !~ /^\s*[0-9A-Fa-f]/;
134 $name = $1 if $line =~ s/#\s*(.*)//;
135 return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
138 my($e, $k) = split /;/, $line;
139 my @e = _getHexArray($e);
140 $ele = pack('U*', @e);
141 return if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
145 defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/ ||
146 defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/
149 $self->{entries}{$ele} = $self->{ignored}{$ele} = 1;
153 foreach my $arr ($k =~ /\[(\S+)\]/g) {
154 my $var = $arr =~ /\*/;
155 push @key, $self->altCE( $var, _getHexArray($arr) );
157 $self->{entries}{$ele} = \@key;
159 $self->{maxlength}{ord $ele} = scalar @e if @e > 1;
164 ## arrayref CE = altCE(bool variable?, list[num] weights)
172 $self->{alternate} eq 'blanked' ?
173 $var ? [0,0,0] : [ @c[0..2] ] :
174 $self->{alternate} eq 'non-ignorable' ?
176 $self->{alternate} eq 'shifted' ?
177 $var ? [0,0,0,$c[0] ] : [ @c[0..2], $c[0]+$c[1]+$c[2] ? 0xFFFF : 0 ] :
178 $self->{alternate} eq 'shift-trimmed' ?
179 $var ? [0,0,0,$c[0] ] : [ @c[0..2], 0 ] :
184 ## string hex_sortkey = splitCE(string arg)
189 my $key = $self->getSortKey(@_);
190 my $view = join " ", map sprintf("%04X", $_), unpack 'n*', $key;
191 $view =~ s/ ?0000 ?/|/g;
197 ## list[strings] elements = splitCE(string arg)
202 my $code = $self->{preprocess};
203 my $norm = $self->{UNF};
204 my $ent = $self->{entries};
205 my $max = $self->{maxlength};
206 my $rear = $self->{rearrangeHash};
208 my $str = ref $code ? &$code(shift) : shift;
209 $str = &$norm($str) if ref $norm;
212 @src = unpack('U*', $str);
215 for(my $i = 0; $i < @src; $i++)
217 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i])
218 if $rear->{ $src[$i] };
222 for(my $i = 0; $i < @src; $i++)
228 next unless defined $u;
229 next if $u < 0 || 0x10FFFF < $u # out of range
230 || 0xD800 < $u && $u < 0xDFFF; # unpaired surrogates
231 my $four = $u & 0xFFFF;
232 next if $four == 0xFFFE || $four == 0xFFFF;
234 if($max->{$u}) # contract
236 for(my $j = $max->{$u}; $j >= 1; $j--)
238 next unless $i+$j-1 < @src;
239 $ch = pack 'U*', @src[$i .. $i+$j-1];
240 $i += $j-1, last if $ent->{$ch};
243 else { $ch = pack('U', $u) }
245 # with Combining Char (UTS#10, 4.2.1), here requires Unicode::Normalize.
246 if($getCombinClass && defined $ch)
248 for(my $j = $i+1; $j < @src; $j++)
250 next unless defined $src[$j];
251 last unless $getCombinClass->( $src[$j] );
252 my $comb = pack 'U', $src[$j];
253 next if ! $ent->{ $ch.$comb };
260 wantarray ? @buf : \@buf;
265 ## list[arrayrefs] weight = getWt(string element)
271 my $ent = $self->{entries};
272 my $ign = $self->{ignored};
273 my $cjk = $self->{overrideCJK};
274 my $hang = $self->{overrideHangul};
275 return if !defined $ch || $ign->{$ch}; # ignored
276 return @{ $ent->{$ch} } if $ent->{$ch};
277 my $u = unpack('U', $ch);
282 : map(@{ $ent->{pack('U', $_)} }, _decompHangul($u))
284 ? $cjk ? &$cjk($u) : map($self->altCE(0,@$_), _CJK($u))
285 : map($self->altCE(0,@$_), _derivCE($u));
289 ## int = index(string, substring)
294 my $lev = $self->{level};
295 my $str = $self->splitCE(shift);
296 my $sub = $self->splitCE(shift);
298 return wantarray ? (0,0) : 0 if ! @$sub;
299 return wantarray ? () : -1 if ! @$str;
301 my @subWt = grep _ignorableAtLevel($_,$lev),
302 map $self->getWt($_), @$sub;
307 my @tmp = grep _ignorableAtLevel($_,$lev), $self->getWt($e);
309 push @strPt, ($count) x @tmp;
311 while(@strWt >= @subWt){
312 if(_eqArray(\@strWt, \@subWt, $lev)){
314 return wantarray ? ($pos, $count-$pos) : $pos;
320 return wantarray ? () : -1;
324 ## bool _eqArray(arrayref, arrayref, level)
328 my $a = shift; # length $a >= length $b;
331 for my $v (0..$lev-1){
332 for my $c (0..@$b-1){
333 return if $a->[$c][$v] != $b->[$c][$v];
341 ## bool _ignorableAtLevel(CE, level)
343 sub _ignorableAtLevel($$)
346 return if ! defined $ce;
348 ! grep { ! $ce->[$_] } 0..$lv-1;
353 ## string sortkey = getSortKey(string arg)
358 my $lev = $self->{level};
359 my $rCE = $self->splitCE(shift); # get an arrayref
362 my @buf = grep defined(), map $self->getWt($_), @$rCE;
365 my @ret = ([],[],[],[]);
366 foreach my $v (0..$lev-1){
367 foreach my $b (@buf){
368 push @{ $ret[$v] }, $b->[$v] if $b->[$v];
371 foreach (@{ $self->{backwards} }){
373 @{ $ret[$v] } = reverse @{ $ret[$v] };
376 # modification of tertiary weights
377 if($self->{upper_before_lower}){
378 foreach (@{ $ret[2] }){
379 if (0x8 <= $_ && $_ <= 0xC){ $_ -= 6 } # lower
380 elsif(0x2 <= $_ && $_ <= 0x6){ $_ += 6 } # upper
381 elsif($_ == 0x1C) { $_ += 1 } # square upper
382 elsif($_ == 0x1D) { $_ -= 1 } # square lower
385 if($self->{katakana_before_hiragana}){
386 foreach (@{ $ret[2] }){
387 if (0x0F <= $_ && $_ <= 0x13){ $_ -= 2 } # katakana
388 elsif(0x0D <= $_ && $_ <= 0x0E){ $_ += 5 } # hiragana
391 join "\0\0", map pack('n*', @$_), @ret;
396 ## int compare = cmp(string a, string b)
398 sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
399 sub eq { $_[0]->getSortKey($_[1]) eq $_[0]->getSortKey($_[2]) }
400 sub ne { $_[0]->getSortKey($_[1]) ne $_[0]->getSortKey($_[2]) }
401 sub lt { $_[0]->getSortKey($_[1]) lt $_[0]->getSortKey($_[2]) }
402 sub le { $_[0]->getSortKey($_[1]) le $_[0]->getSortKey($_[2]) }
403 sub gt { $_[0]->getSortKey($_[1]) gt $_[0]->getSortKey($_[2]) }
404 sub ge { $_[0]->getSortKey($_[1]) ge $_[0]->getSortKey($_[2]) }
407 ## list[strings] sorted = sort(list[strings] arg)
414 sort{ $a->[0] cmp $b->[0] }
415 map [ $obj->getSortKey($_), $_ ], @_;
419 ## list[arrayrefs] CE = _derivCE(int codepoint)
424 my $a = UNDEFINED + ($code >> 15); # ok
425 my $b = ($code & 0x7FFF) | 0x8000; # ok
426 # my $a = 0xFFC2 + ($code >> 15); # ng
427 # my $b = $code & 0x7FFF | 0x1000; # ng
428 $b ? ([$a,2,1,$code],[$b,0,0,$code]) : [$a,2,1,$code];
432 ## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
434 sub _getHexArray { map hex(), $_[0] =~ /([0-9a-fA-F]+)/g }
437 ## bool is_a_CJK_Unified_Ideograph = _isCJK(int codepoint)
442 return 0x3400 <= $u && $u <= 0x4DB5
443 || 0x4E00 <= $u && $u <= 0x9FA5
444 # || 0x20000 <= $u && $u <= 0x2A6D6;
448 ## list[arrayref] CE = _CJK(int codepoint_of_CJK)
450 sub _CJK { $_[0] > 0xFFFF ? _derivCE($_[0]) : [$_[0], 0x20, 0x02, $_[0]] }
453 ## bool is_a_Hangul_Syllable = _isHangul(int codepoint)
455 sub _isHangul { return 0xAC00 <= $_[0] && $_[0] <= 0xD7A3 }
459 # $code must be in Hangul syllable. check it before you enter here.
460 my $SIndex = $code - 0xAC00;
461 my $LIndex = int( $SIndex / 588);
462 my $VIndex = int(($SIndex % 588) / 28);
463 my $TIndex = $SIndex % 28;
467 $TIndex ? (0x11A7 + $TIndex) : (),
476 Unicode::Collate - use UCA (Unicode Collation Algorithm)
480 use Unicode::Collate;
483 $Collator = Unicode::Collate->new(%tailoring);
486 @sorted = $Collator->sort(@not_sorted);
489 $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
493 =head2 Constructor and Tailoring
495 The C<new> method returns a collator object.
497 $Collator = Unicode::Collate->new(
498 alternate => $alternate,
499 backwards => $levelNumber, # or \@levelNumbers
501 normalization => $normalization_form,
502 ignoreName => qr/$ignoreName/,
503 ignoreChar => qr/$ignoreChar/,
504 katakana_before_hiragana => $bool,
505 level => $collationLevel,
506 overrideCJK => \&overrideCJK,
507 overrideHangul => \&overrideHangul,
508 preprocess => \&preprocess,
509 rearrange => \@charList,
511 undefName => qr/$undefName/,
512 undefChar => qr/$undefChar/,
513 upper_before_lower => $bool,
515 # if %tailoring is false (empty),
516 # $Collator should do the default collation.
522 -- see 3.2.2 Alternate Weighting, UTR #10.
524 alternate => 'shifted', 'blanked', 'non-ignorable', or 'shift-trimmed'.
526 By default (if specification is omitted), 'shifted' is adopted.
530 -- see 3.1.2 French Accents, UTR #10.
532 backwards => $levelNumber or \@levelNumbers
534 Weights in reverse order; ex. level 2 (diacritic ordering) in French.
535 If omitted, forwards at all the levels.
539 -- see 3.1 Linguistic Features; 3.2.1 File Format, UTR #10.
541 Overrides a default order or adds a new collation element
543 entry => <<'ENTRIES', # use the UCA file format
544 00E6 ; [.0861.0020.0002.00E6] [.08B1.0020.0002.00E6] # ligature <ae> as <a e>
545 0063 0068 ; [.0893.0020.0002.0063] # "ch" in traditional Spanish
546 0043 0068 ; [.0893.0020.0008.0043] # "Ch" in traditional Spanish
553 -- see Completely Ignorable, 3.2.2 Alternate Weighting, UTR #10.
555 Ignores the entry in the table.
556 If an ignored collation element appears in the string to be collated,
557 it is ignored as if the element had been deleted from there.
559 E.g. when 'a' and 'e' are ignored,
560 'element' is equal to 'lament' (or 'lmnt').
564 -- see 4.3 Form a sort key for each string, UTR #10.
566 Set the maximum level.
567 Any higher levels than the specified one are ignored.
569 Level 1: alphabetic ordering
570 Level 2: diacritic ordering
571 Level 3: case ordering
572 Level 4: tie-breaking (e.g. in the case when alternate is 'shifted')
578 -- see 4.1 Normalize each input string, UTR #10.
580 If specified, strings are normalized before preparation of sort keys
581 (the normalization is executed after preprocess).
583 As a form name, one of the following names must be used.
585 'C' or 'NFC' for Normalization Form C
586 'D' or 'NFD' for Normalization Form D
587 'KC' or 'NFKC' for Normalization Form KC
588 'KD' or 'NFKD' for Normalization Form KD
590 If omitted, the string is put into Normalization Form D.
592 If undefined explicitly (as C<normalization =E<gt> undef>),
593 any normalization is not carried out (this may make tailoring easier
594 if any normalization is not desired).
602 -- see 7.1 Derived Collation Elements, UTR #10.
604 By default, mapping of CJK Unified Ideographs
605 uses the Unicode codepoint order
606 and Hangul Syllables are decomposed into Hangul Jamo.
608 The mapping of CJK Unified Ideographs
609 or Hangul Syllables may be overrided.
611 ex. CJK Unified Ideographs in the JIS codepoint order.
614 my $u = shift; # get unicode codepoint
615 my $b = pack('n', $u); # to UTF-16BE
616 my $s = your_unicode_to_sjis_converter($b); # convert
617 my $n = unpack('n', $s); # convert sjis to short
618 [ $n, 1, 1 ]; # return collation element
621 If you want to override the mapping of Hangul Syllables,
622 the Normalization Forms D and KD are not appropriate
623 (they will be decomposed before overriding).
627 -- see 5.1 Preprocessing, UTR #10.
629 If specified, the coderef is used to preprocess
630 before the formation of sort keys.
632 ex. dropping English articles, such as "a" or "the".
633 Then, "the pen" is before "a pencil".
637 $str =~ s/\b(?:an?|the)\s+//g;
643 -- see 3.1.3 Rearrangement, UTR #10.
645 Characters that are not coded in logical order and to be rearranged.
648 rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
652 -- see 3.2 Default Unicode Collation Element Table, UTR #10.
654 You can use another element table if desired.
655 The table file must be in your C<lib/Unicode/Collate> directory.
657 By default, the file C<lib/Unicode/Collate/allkeys.txt> is used.
659 If undefined explicitly (as C<table =E<gt> undef>),
660 no file is read (you'd define collation elements using L<entry>).
666 -- see 6.3.4 Reducing the Repertoire, UTR #10.
668 Undefines the collation element as if it were unassigned in the table.
669 This reduces the size of the table.
670 If an unassigned character appears in the string to be collated,
671 the sort key is made from its codepoint
672 as a single-character collation element,
673 as it is greater than any other assigned collation elements
674 (in the codepoint order among the unassigned characters).
675 But, it'd be better to ignore characters
676 unfamiliar to you and maybe never used.
678 =item katakana_before_hiragana
680 =item upper_before_lower
682 -- see 6.6 Case Comparisons; 7.3.1 Tertiary Weight Table, UTR #10.
684 By default, lowercase is before uppercase
685 and hiragana is before katakana.
687 If the parameter is true, this is reversed.
695 =item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
697 Sorts a list of strings.
699 =item C<$result = $Collator-E<gt>cmp($a, $b)>
701 Returns 1 (when C<$a> is greater than C<$b>)
702 or 0 (when C<$a> is equal to C<$b>)
703 or -1 (when C<$a> is lesser than C<$b>).
705 =item C<$result = $Collator-E<gt>eq($a, $b)>
707 =item C<$result = $Collator-E<gt>ne($a, $b)>
709 =item C<$result = $Collator-E<gt>lt($a, $b)>
711 =item C<$result = $Collator-E<gt>le($a, $b)>
713 =item C<$result = $Collator-E<gt>gt($a, $b)>
715 =item C<$result = $Collator-E<gt>ge($a, $b)>
717 They works like the same name operators as theirs.
719 eq : whether $a is equal to $b.
720 ne : whether $a is not equal to $b.
721 lt : whether $a is lesser than $b.
722 le : whether $a is lesser than $b or equal to $b.
723 gt : whether $a is greater than $b.
724 ge : whether $a is greater than $b or equal to $b.
726 =item C<$sortKey = $Collator-E<gt>getSortKey($string)>
728 -- see 4.3 Form a sort key for each string, UTR #10.
732 You compare the sort keys using a binary comparison
733 and get the result of the comparison of the strings using UCA.
735 $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
739 $Collator->cmp($a, $b)
741 =item C<$position = $Collator-E<gt>index($string, $substring)>
743 =item C<($position, $length) = $Collator-E<gt>index($string, $substring)>
745 -- see 6.8 Searching, UTR #10.
747 If C<$substring> matches a part of C<$string>, returns
748 the position of the first occurrence of the matching part in scalar context;
749 in list context, returns a two-element list of
750 the position and the length of the matching part.
752 B<Notice> that the length of the matching part may differ from
753 the length of C<$substring>.
755 B<Note> that the position and the length are counted on the string
756 after the process of preprocess, normalization, and rearrangement.
757 Therefore, in case the specified string is not binary equal to
758 the preprocessed/normalized/rearranged string, the position and the length
759 may differ form those on the specified string. But it is guaranteed
760 that, if matched, it returns a non-negative value as C<$position>.
762 If C<$substring> does not match any part of C<$string>,
763 returns C<-1> in scalar context and
764 an empty list in list context.
768 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
769 my $str = "Ich mu\x{00DF} studieren.";
770 my $sub = "m\x{00FC}ss";
772 if(my($pos,$len) = $Collator->index($str, $sub)){
773 $match = substr($str, $pos, $len);
776 and get C<"mu\x{00DF}"> in C<$match> since C<"mu>E<223>C<">
777 is primary equal to C<"m>E<252>C<ss">.
787 Unicode::Collate has not been ported to EBCDIC. The code mostly would
788 work just fine but a decision needs to be made: how the module should
789 work in EBCDIC? Should the low 256 characters be understood as
790 Unicode or as EBCDIC code points? Should one be chosen or should
791 there be a way to do either? Or should such translation be left
792 outside the module for the user to do, for example by using
797 Use of the C<normalization> parameter requires
798 the B<Unicode::Normalize> module.
800 If you need not it (say, in the case when you need not
801 handle any combining characters),
802 assign C<normalization =E<gt> undef> explicitly.
804 -- see 6.5 Avoiding Normalization, UTR #10.
808 SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
810 http://homepage1.nifty.com/nomenclator/perl/
812 Copyright(C) 2001, SADAHIRO Tomoyuki. Japan. All rights reserved.
814 This program is free software; you can redistribute it and/or
815 modify it under the same terms as Perl itself.
821 =item Unicode Collation Algorithm - Unicode TR #10
823 http://www.unicode.org/unicode/reports/tr10/
825 =item L<Unicode::Normalize>
827 normalized forms of Unicode text