1 package Unicode::Collate;
5 die "Unicode::Collate not ported to EBCDIC\n";
16 our $VERSION = '0.12';
17 our $PACKAGE = __PACKAGE__;
19 our @ISA = qw(Exporter);
21 our %EXPORT_TAGS = ();
25 (our $Path = $INC{'Unicode/Collate.pm'}) =~ s/\.pm$//;
26 our $KeyFile = "allkeys.txt";
30 eval { require Unicode::UCD };
33 $UNICODE_VERSION = Unicode::UCD::UnicodeVersion();
35 else { # XXX, Perl 5.6.1
37 foreach my $d (@INC) {
39 $f = File::Spec->catfile($d, "unicode", "Unicode.301");
41 $UNICODE_VERSION = '3.0.1';
48 our $getCombinClass; # coderef for combining class from Unicode::Normalize
50 use constant Min2 => 0x20; # minimum weight at level 2
51 use constant Min3 => 0x02; # minimum weight at level 3
52 use constant UNDEFINED => 0xFF80; # special value for undefined CE's
54 our $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
## string version = UCA_Version()
## Version number of Unicode Technical Standard #10 that this module consults.
sub UCA_Version {
    return "8.0";
}
## string version = Base_Unicode_Version()
## Gives back the package-level $UNICODE_VERSION when it holds a true value,
## and the literal 'unknown' otherwise.
sub Base_Unicode_Version {
    return $UNICODE_VERSION ? $UNICODE_VERSION : 'unknown';
}
66 my $self = bless { @_ }, $class;
68 # alternate lowercased
70 ! exists $self->{alternate} ? 'shifted' : lc($self->{alternate});
72 croak "$PACKAGE unknown alternate tag name: $self->{alternate}"
73 unless $self->{alternate} eq 'blanked'
74 || $self->{alternate} eq 'non-ignorable'
75 || $self->{alternate} eq 'shifted'
76 || $self->{alternate} eq 'shift-trimmed';
81 croak "Illegal level lower than 1 (passed $self->{level})."
82 if $self->{level} < 1;
83 croak "A level higher than 4 (passed $self->{level}) is not supported."
84 if 4 < $self->{level};
86 # overrideHangul and -CJK
87 # If true: CODEREF used; '': default; undef: derived elements
88 $self->{overrideHangul} = ''
89 if ! exists $self->{overrideHangul};
90 $self->{overrideCJK} = ''
91 if ! exists $self->{overrideCJK};
94 $self->{normalization} = 'D'
95 if ! exists $self->{normalization};
98 if (defined $self->{normalization}) {
99 eval { require Unicode::Normalize };
100 croak "Unicode/Normalize.pm is required to normalize strings: $@"
103 Unicode::Normalize->import();
104 $getCombinClass = \&Unicode::Normalize::getCombinClass
105 if ! $getCombinClass;
108 $self->{normalization} =~ /^(?:NF)?C$/ ? \&NFC :
109 $self->{normalization} =~ /^(?:NF)?D$/ ? \&NFD :
110 $self->{normalization} =~ /^(?:NF)?KC$/ ? \&NFKC :
111 $self->{normalization} =~ /^(?:NF)?KD$/ ? \&NFKD :
112 croak "$PACKAGE unknown normalization form name: "
113 . $self->{normalization};
117 # If undef is passed explicitly, no file is read.
118 $self->{table} = $KeyFile
119 if ! exists $self->{table};
121 if defined $self->{table};
123 if ($self->{entry}) {
124 $self->parseEntry($_) foreach split /\n/, $self->{entry};
128 $self->{backwards} ||= [ ];
129 $self->{backwards} = [ $self->{backwards} ]
130 if ! ref $self->{backwards};
133 $self->{rearrange} = $DefaultRearrange
134 if ! exists $self->{rearrange};
135 $self->{rearrange} = []
136 if ! defined $self->{rearrange};
137 croak "$PACKAGE: A list for rearrangement must be store in an ARRAYREF"
138 if ! ref $self->{rearrange};
140 # keys of $self->{rearrangeHash} are $self->{rearrange}.
141 $self->{rearrangeHash} = undef;
143 if (@{ $self->{rearrange} }) {
144 @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
152 my $file = $self->{table} ne '' ? $self->{table} : $KeyFile;
154 open my $fk, "<$Path/$file"
155 or croak "File does not exist at $Path/$file";
160 if (/^\@version\s*(\S*)/) {
161 $self->{version} ||= $1;
163 elsif (/^\@alternate\s+(.*)/) {
164 $self->{alternate} ||= $1;
166 elsif (/^\@backwards\s+(.*)/) {
167 push @{ $self->{backwards} }, $1;
169 elsif (/^\@rearrange\s+(.*)/) {
170 push @{ $self->{rearrange} }, _getHexArray($1);
174 $self->parseEntry($_);
181 ## get $line, parse it, and write an entry in $self
187 my($name, $ele, @key);
189 return if $line !~ /^\s*[0-9A-Fa-f]/;
191 # removes comment and gets name
193 if $line =~ s/[#%]\s*(.*)//;
194 return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
197 my($e, $k) = split /;/, $line;
198 croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
201 my @e = _getHexArray($e);
202 $ele = pack('U*', @e);
203 return if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
206 if (defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/ ||
207 defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/)
209 $self->{entries}{$ele} = $self->{ignored}{$ele} = 1;
212 my $combining = 1; # primary = 0, secondary != 0;
214 foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
215 my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
216 push @key, $self->altCE($var, _getHexArray($arr));
217 $combining = 0 unless $key[-1][0] == 0 && $key[-1][1] != 0;
219 $self->{entries}{$ele} = \@key;
220 $self->{combining}{$ele} = 1 if $combining;
222 $self->{maxlength}{ord $ele} = scalar @e if @e > 1;
227 ## arrayref CE = altCE(bool variable?, list[num] weights)
235 $self->{alternate} eq 'blanked' ?
236 $var ? [0,0,0,$c[3]] : \@c :
237 $self->{alternate} eq 'non-ignorable' ?
239 $self->{alternate} eq 'shifted' ?
240 $var ? [0,0,0,$c[0] ] : [ @c[0..2], $c[0]+$c[1]+$c[2] ? 0xFFFF : 0 ] :
241 $self->{alternate} eq 'shift-trimmed' ?
242 $var ? [0,0,0,$c[0] ] : [ @c[0..2], 0 ] :
243 croak "$PACKAGE unknown alternate name: $self->{alternate}";
247 ## string hex_sortkey = viewSortKey(string arg)
252 my $key = $self->getSortKey(@_);
253 my $view = join " ", map sprintf("%04X", $_), unpack 'n*', $key;
254 $view =~ s/ ?0000 ?/|/g;
260 ## list[strings] elements = splitCE(string arg)
265 my $code = $self->{preprocess};
266 my $norm = $self->{UNF};
267 my $ent = $self->{entries};
268 my $max = $self->{maxlength};
269 my $reH = $self->{rearrangeHash};
271 my $str = ref $code ? &$code(shift) : shift;
272 $str = &$norm($str) if ref $norm;
274 my @src = unpack('U*', $str);
279 for (my $i = 0; $i < @src; $i++) {
280 if (exists $reH->{ $src[$i] } && $i + 1 < @src) {
281 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]);
287 for (my $i = 0; $i < @src; $i++) {
292 next unless defined $u;
293 next if $u < 0 || 0x10FFFF < $u # out of range
294 || (0xD800 <= $u && $u <= 0xDFFF); # unpaired surrogates
295 my $four = $u & 0xFFFF;
296 next if $four == 0xFFFE || $four == 0xFFFF;
298 if ($max->{$u}) { # contract
299 for (my $j = $max->{$u}; $j >= 1; $j--) {
300 next unless $i+$j-1 < @src;
301 $ch = pack 'U*', @src[$i .. $i+$j-1];
302 $i += $j-1, last if $ent->{$ch};
308 # with Combining Char (UTS#10, 4.2.1), here requires Unicode::Normalize.
309 if ($getCombinClass && defined $ch) {
310 for (my $j = $i+1; $j < @src; $j++) {
311 next unless defined $src[$j];
312 last unless $getCombinClass->( $src[$j] );
313 my $comb = pack 'U', $src[$j];
314 next if ! $ent->{ $ch.$comb };
321 wantarray ? @buf : \@buf;
326 ## list[arrayrefs] weight = getWt(string element)
332 my $ent = $self->{entries};
333 my $ign = $self->{ignored};
334 my $cjk = $self->{overrideCJK};
335 my $hang = $self->{overrideHangul};
337 return if !defined $ch || $ign->{$ch}; # ignored
338 return @{ $ent->{$ch} } if $ent->{$ch};
339 my $u = unpack('U', $ch);
341 if (0xAC00 <= $u && $u <= 0xD7A3) { # is_Hangul
347 my $ar = $ent->{pack('U', $v)};
348 $ar ? @$ar : map($self->altCE(0,@$_), _derivCE($v));
350 : map($self->altCE(0,@$_), _derivCE($u));
352 elsif (0x3400 <= $u && $u <= 0x4DB5 ||
353 0x4E00 <= $u && $u <= 0x9FA5 ||
354 0x20000 <= $u && $u <= 0x2A6D6) { # is_CJK
357 : defined $cjk && $u <= 0xFFFF
358 ? $self->altCE(0, ($u, 0x20, 0x02, $u))
359 : map($self->altCE(0,@$_), _derivCE($u));
362 return map($self->altCE(0,@$_), _derivCE($u));
367 ## int = index(string, substring)
372 my $lev = $self->{level};
373 my $comb = $self->{combining};
374 my $str = $self->splitCE(shift);
375 my $sub = $self->splitCE(shift);
377 return wantarray ? (0,0) : 0 if ! @$sub;
378 return wantarray ? () : -1 if ! @$str;
380 my @subWt = grep _ignorableAtLevel($_,$lev),
381 map $self->getWt($_), @$sub;
385 for (my $i = 0; $i < @$str; $i++) {
388 my @tmp = grep _ignorableAtLevel($_,$lev), $self->getWt($str->[$i]);
389 $go_ahead += length $str->[$i];
391 # /*XXX*/ still broken.
392 # index("e\x{300}", "e") should be 'no match' at level 2 or higher
393 # as "e\x{300}" is a *single* grapheme cluster and not equal to "e".
395 # go ahead as far as we find a combining character;
396 while ($i + 1 < @$str &&
397 (! defined $str->[$i+1] || $comb->{ $str->[$i+1] }) ) {
399 $go_ahead += length $str->[$i];
400 next if ! defined $str->[$i];
402 grep _ignorableAtLevel($_,$lev), $self->getWt($str->[$i]);
406 push @strPt, ($count) x @tmp;
409 while (@strWt >= @subWt) {
410 if (_eqArray(\@strWt, \@subWt, $lev)) {
412 return wantarray ? ($pos, $count-$pos) : $pos;
418 return wantarray ? () : -1;
422 ## bool _eqArray(arrayref, arrayref, level)
426 my $a = shift; # length $a >= length $b;
429 for my $v (0..$lev-1) {
430 for my $c (0..@$b-1){
431 return if $a->[$c][$v] != $b->[$c][$v];
439 ## bool _ignorableAtLevel(CE, level)
441 sub _ignorableAtLevel($$)
444 return unless defined $ce;
446 return ! grep { ! $ce->[$_] } 0..$lv-1;
451 ## string sortkey = getSortKey(string arg)
456 my $lev = $self->{level};
457 my $rCE = $self->splitCE(shift); # get an arrayref
460 my @buf = grep defined(), map $self->getWt($_), @$rCE;
463 my @ret = ([],[],[],[]);
464 foreach my $v (0..$lev-1) {
465 foreach my $b (@buf) {
466 push @{ $ret[$v] }, $b->[$v] if $b->[$v];
469 foreach (@{ $self->{backwards} }) {
471 @{ $ret[$v] } = reverse @{ $ret[$v] };
474 # modification of tertiary weights
475 if ($self->{upper_before_lower}) {
476 foreach (@{ $ret[2] }) {
477 if (0x8 <= $_ && $_ <= 0xC) { $_ -= 6 } # lower
478 elsif (0x2 <= $_ && $_ <= 0x6) { $_ += 6 } # upper
479 elsif ($_ == 0x1C) { $_ += 1 } # square upper
480 elsif ($_ == 0x1D) { $_ -= 1 } # square lower
483 if ($self->{katakana_before_hiragana}) {
484 foreach (@{ $ret[2] }) {
485 if (0x0F <= $_ && $_ <= 0x13) { $_ -= 2 } # katakana
486 elsif (0x0D <= $_ && $_ <= 0x0E) { $_ += 5 } # hiragana
489 join "\0\0", map pack('n*', @$_), @ret;
##
## Comparison methods.
##
##   int  compare = cmp(string a, string b)
##   bool result  = eq | ne | lt | le | gt | ge (string a, string b)
##
## Each method asks the collator (the invocant) for the sort key of both
## strings via getSortKey() and applies the like-named Perl string operator
## to the two keys.  The strings are handed on as $_[1] / $_[2] directly,
## i.e. without copying them first.
##
sub cmp {
    my $collator = $_[0];
    return $collator->getSortKey($_[1]) cmp $collator->getSortKey($_[2]);
}

sub eq {
    my $collator = $_[0];
    return $collator->getSortKey($_[1]) eq $collator->getSortKey($_[2]);
}

sub ne {
    my $collator = $_[0];
    return $collator->getSortKey($_[1]) ne $collator->getSortKey($_[2]);
}

sub lt {
    my $collator = $_[0];
    return $collator->getSortKey($_[1]) lt $collator->getSortKey($_[2]);
}

sub le {
    my $collator = $_[0];
    return $collator->getSortKey($_[1]) le $collator->getSortKey($_[2]);
}

sub gt {
    my $collator = $_[0];
    return $collator->getSortKey($_[1]) gt $collator->getSortKey($_[2]);
}

sub ge {
    my $collator = $_[0];
    return $collator->getSortKey($_[1]) ge $collator->getSortKey($_[2]);
}
505 ## list[strings] sorted = sort(list[strings] arg)
511 sort{ $a->[0] cmp $b->[0] }
512 map [ $obj->getSortKey($_), $_ ], @_;
516 ## list[arrayrefs] CE = _derivCE(int codepoint)
520 my $a = UNDEFINED + ($code >> 15); # ok
521 my $b = ($code & 0x7FFF) | 0x8000; # ok
522 # my $a = 0xFFC2 + ($code >> 15); # ng
523 # my $b = $code & 0x7FFF | 0x1000; # ng
524 $b ? ([$a,2,1,$code],[$b,0,0,$code]) : [$a,2,1,$code];
##
## list[int] values = _getHexArray(string)
##
## Picks out every run of hexadecimal digits in the string, e.g.
## "hhhh hhhh hhhh", and converts each run to its numeric value,
## giving (dddd, dddd, dddd).  Any other characters act as separators.
##
sub _getHexArray {
    my @hex_runs = $_[0] =~ /([0-9a-fA-F]+)/g;
    return map { hex $_ } @hex_runs;
}
533 # $code must be in Hangul syllable.
534 # Check it before you enter here.
538 my $SIndex = $code - 0xAC00;
539 my $LIndex = int( $SIndex / 588);
540 my $VIndex = int(($SIndex % 588) / 28);
541 my $TIndex = $SIndex % 28;
545 $TIndex ? (0x11A7 + $TIndex) : (),
554 Unicode::Collate - Unicode Collation Algorithm
558 use Unicode::Collate;
561 $Collator = Unicode::Collate->new(%tailoring);
564 @sorted = $Collator->sort(@not_sorted);
567 $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
571 =head2 Constructor and Tailoring
573 The C<new> method returns a collator object.
575 $Collator = Unicode::Collate->new(
576 alternate => $alternate,
577 backwards => $levelNumber, # or \@levelNumbers
579 normalization => $normalization_form,
580 ignoreName => qr/$ignoreName/,
581 ignoreChar => qr/$ignoreChar/,
582 katakana_before_hiragana => $bool,
583 level => $collationLevel,
584 overrideCJK => \&overrideCJK,
585 overrideHangul => \&overrideHangul,
586 preprocess => \&preprocess,
587 rearrange => \@charList,
589 undefName => qr/$undefName/,
590 undefChar => qr/$undefChar/,
591 upper_before_lower => $bool,
593 # if %tailoring is false (i.e. empty),
594 # $Collator should do the default collation.
600 -- see 3.2.2 Alternate Weighting, UTR #10.
602 This key selects the alternate weighting for variable collation elements,
603 which are marked with an ASTERISK in the table
604 (NOTE: Many punctuation marks and symbols are variable in F<allkeys.txt>).
606 alternate => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
608 These names are case-insensitive.
609 By default (if specification is omitted), 'shifted' is adopted.
611 'Blanked' Variable elements are ignorable at levels 1 through 3;
612 considered at the 4th level.
614 'Non-ignorable' Variable elements are not reset to ignorable.
616 'Shifted' Variable elements are ignorable at levels 1 through 3;
617 their level 4 weight is replaced by the old level 1 weight.
618 Level 4 weight for Non-Variable elements is 0xFFFF.
620 'Shift-Trimmed' Same as 'shifted', but all FFFF's at the 4th level
625 -- see 3.1.2 French Accents, UTR #10.
627 backwards => $levelNumber or \@levelNumbers
629 Weights in reverse order; ex. level 2 (diacritic ordering) in French.
630 If omitted, forwards at all the levels.
634 -- see 3.1 Linguistic Features; 3.2.1 File Format, UTR #10.
636 Overrides a default order or defines additional collation elements
638 entry => <<'ENTRIES', # use the UCA file format
639 00E6 ; [.0861.0020.0002.00E6] [.08B1.0020.0002.00E6] # ligature <ae> as <a><e>
640 0063 0068 ; [.0893.0020.0002.0063] # "ch" in traditional Spanish
641 0043 0068 ; [.0893.0020.0008.0043] # "Ch" in traditional Spanish
648 -- see Completely Ignorable, 3.2.2 Alternate Weighting, UTR #10.
650 Makes the entry in the table ignorable.
651 If a collation element is ignorable,
652 it is ignored as if the element had been deleted from there.
654 E.g. when 'a' and 'e' are ignorable,
655 'element' is equal to 'lament' (or 'lmnt').
659 -- see 4.3 Form a sort key for each string, UTR #10.
661 Set the maximum level.
662 Any higher levels than the specified one are ignored.
664 Level 1: alphabetic ordering
665 Level 2: diacritic ordering
666 Level 3: case ordering
667 Level 4: tie-breaking (e.g. in the case when alternate is 'shifted')
671 If omitted, the maximum is the 4th.
675 -- see 4.1 Normalize each input string, UTR #10.
677 If specified, strings are normalized before preparation of sort keys
678 (the normalization is executed after preprocess).
680 As a form name, one of the following names must be used.
682 'C' or 'NFC' for Normalization Form C
683 'D' or 'NFD' for Normalization Form D
684 'KC' or 'NFKC' for Normalization Form KC
685 'KD' or 'NFKD' for Normalization Form KD
687 If omitted, the string is put into Normalization Form D.
689 If C<undef> is passed explicitly as the value for this key,
690 no normalization is carried out (this may make tailoring easier
691 if normalization is not desired).
697 -- see 7.1 Derived Collation Elements, UTR #10.
699 By default, mapping of CJK Unified Ideographs
700 uses the Unicode codepoint order.
701 But the mapping of CJK Unified Ideographs may be overridden.
703 ex. CJK Unified Ideographs in the JIS code point order.
706 my $u = shift; # get a Unicode codepoint
707 my $b = pack('n', $u); # to UTF-16BE
708 my $s = your_unicode_to_sjis_converter($b); # convert
709 my $n = unpack('n', $s); # convert sjis to short
710 [ $n, 0x20, 0x2, $u ]; # return the collation element
713 ex. ignores all CJK Unified Ideographs.
715 overrideCJK => sub {()}, # CODEREF returning empty list
717 # where ->eq("Pe\x{4E00}rl", "Perl") is true
718 # as U+4E00 is a CJK Unified Ideograph and to be ignorable.
720 If C<undef> is passed explicitly as the value for this key,
721 weights for CJK Unified Ideographs are treated as undefined.
722 But assignment of weight for CJK Unified Ideographs
723 in table or L<entry> is still valid.
727 -- see 7.1 Derived Collation Elements, UTR #10.
729 By default, Hangul Syllables are decomposed into Hangul Jamo.
730 But the mapping of Hangul Syllables may be overridden.
732 This tag works like L<overrideCJK>, so see there for examples.
734 If you want to override the mapping of Hangul Syllables,
735 the Normalization Forms D and KD are not appropriate
736 (they will be decomposed before overriding).
738 If C<undef> is passed explicitly as the value for this key,
739 weight for Hangul Syllables is treated as undefined
740 without decomposition into Hangul Jamo.
741 But definition of weight for Hangul Syllables
742 in table or L<entry> is still valid.
746 -- see 5.1 Preprocessing, UTR #10.
748 If specified, the coderef is used to preprocess
749 before the formation of sort keys.
751 ex. dropping English articles, such as "a" or "the".
752 Then, "the pen" is before "a pencil".
756 $str =~ s/\b(?:an?|the)\s+//gi;
762 -- see 3.1.3 Rearrangement, UTR #10.
764 Characters that are not coded in logical order and are to be rearranged.
767 rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
769 If you want to disallow any rearrangement,
770 pass C<undef> or C<[]> (a reference to an empty list)
771 as the value for this key.
775 -- see 3.2 Default Unicode Collation Element Table, UTR #10.
777 You can use another element table if desired.
778 The table file must be in your C<lib/Unicode/Collate> directory.
780 By default, the file C<lib/Unicode/Collate/allkeys.txt> is used.
782 If C<undef> is passed explicitly as the value for this key,
783 no file is read (but you can define collation elements via L<entry>).
785 A typical way to define a collation element table
786 without any file of table:
788 $onlyABC = Unicode::Collate->new(
790 entry => << 'ENTRIES',
791 0061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A
792 0041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A
793 0062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B
794 0042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B
795 0063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C
796 0043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C
804 -- see 6.3.4 Reducing the Repertoire, UTR #10.
806 Undefines the collation element as if it were unassigned in the table.
807 This reduces the size of the table.
808 If an unassigned character appears in the string to be collated,
809 the sort key is made from its codepoint
810 as a single-character collation element,
811 as it is greater than any other assigned collation elements
812 (in the codepoint order among the unassigned characters).
813 But, it'd be better to ignore characters
814 unfamiliar to you and maybe never used.
816 =item katakana_before_hiragana
818 =item upper_before_lower
820 -- see 6.6 Case Comparisons; 7.3.1 Tertiary Weight Table, UTR #10.
822 By default, lowercase is before uppercase
823 and hiragana is before katakana.
825 If the tag is made true, this is reversed.
827 B<NOTE>: These tags simplemindedly assume
828 any lowercase/uppercase or hiragana/katakana distinctions
829 should occur in level 3, and their weights at level 3
830 should be same as those mentioned in 7.3.1, UTR #10.
831 If you define collation elements that violate this,
832 these tags do not work correctly.
836 =head2 Methods for Collation
840 =item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
842 Sorts a list of strings.
844 =item C<$result = $Collator-E<gt>cmp($a, $b)>
846 Returns 1 (when C<$a> is greater than C<$b>)
847 or 0 (when C<$a> is equal to C<$b>)
848 or -1 (when C<$a> is less than C<$b>).
850 =item C<$result = $Collator-E<gt>eq($a, $b)>
852 =item C<$result = $Collator-E<gt>ne($a, $b)>
854 =item C<$result = $Collator-E<gt>lt($a, $b)>
856 =item C<$result = $Collator-E<gt>le($a, $b)>
858 =item C<$result = $Collator-E<gt>gt($a, $b)>
860 =item C<$result = $Collator-E<gt>ge($a, $b)>
862 They work like the operators of the same names.
864 eq : whether $a is equal to $b.
865 ne : whether $a is not equal to $b.
866 lt : whether $a is less than $b.
867 le : whether $a is less than $b or equal to $b.
868 gt : whether $a is greater than $b.
869 ge : whether $a is greater than $b or equal to $b.
871 =item C<$sortKey = $Collator-E<gt>getSortKey($string)>
873 -- see 4.3 Form a sort key for each string, UTR #10.
877 You compare the sort keys using a binary comparison
878 and get the result of the comparison of the strings using UCA.
880 $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
884 $Collator->cmp($a, $b)
886 =item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)>
888 Returns a string formalized to display a sort key.
889 Weights are enclosed with C<'['> and C<']'>
890 and level boundaries are denoted by C<'|'>.
892 use Unicode::Collate;
893 my $c = Unicode::Collate->new();
894 print $c->viewSortKey("Perl"),"\n";
897 # [09B3 08B1 09CB 094F|0020 0020 0020 0020|0008 0002 0002 0002|FFFF FFFF FFFF FFFF]
898 # Level 1 Level 2 Level 3 Level 4
900 =item C<$position = $Collator-E<gt>index($string, $substring)>
902 =item C<($position, $length) = $Collator-E<gt>index($string, $substring)>
904 -- see 6.8 Searching, UTR #10.
906 If C<$substring> matches a part of C<$string>, returns
907 the position of the first occurrence of the matching part in scalar context;
908 in list context, returns a two-element list of
909 the position and the length of the matching part.
911 B<Notice> that the length of the matching part may differ from
912 the length of C<$substring>.
914 B<Note> that the position and the length are counted on the string
915 after the process of preprocess, normalization, and rearrangement.
916 Therefore, in case the specified string is not binary equal to
917 the preprocessed/normalized/rearranged string, the position and the length
918 may differ from those on the specified string. But it is guaranteed
919 that, if matched, it returns a non-negative value as C<$position>.
921 If C<$substring> does not match any part of C<$string>,
922 returns C<-1> in scalar context and
923 an empty list in list context.
927 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
928 my $str = "Ich mu\x{00DF} studieren.";
929 my $sub = "m\x{00FC}ss";
931 if (my($pos,$len) = $Collator->index($str, $sub)) {
932 $match = substr($str, $pos, $len);
935 and get C<"mu\x{00DF}"> in C<$match> since C<"mu>E<223>C<">
936 is primary equal to C<"m>E<252>C<ss">.
946 Returns the version number of Unicode Technical Standard 10
947 this module consults.
949 =item Base_Unicode_Version
951 Returns the version number of the Unicode Standard
952 this module is based on.
962 Unicode::Collate has not been ported to EBCDIC. The code mostly would
963 work just fine but a decision needs to be made: how the module should
964 work in EBCDIC? Should the low 256 characters be understood as
965 Unicode or as EBCDIC code points? Should one be chosen or should
966 there be a way to do either? Or should such translation be left
967 outside the module for the user to do, for example by using
969 (or utf8::unicode_to_native()/utf8::native_to_unicode()?)
973 Use of the C<normalization> parameter requires
974 the B<Unicode::Normalize> module.
976 If you do not need it (say, when you do not need to
977 handle any combining characters),
978 assign C<normalization =E<gt> undef> explicitly.
980 -- see 6.5 Avoiding Normalization, UTR #10.
984 C<index()> is an experimental method and
985 its return value may be unreliable.
986 The correct implementation for C<index()> must be based
987 on Locale-Sensitive Support: Level 3 in UTR #18,
988 F<Unicode Regular Expression Guidelines>.
990 See also 4.2 Locale-Dependent Graphemes in UTR #18.
994 SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
996 http://homepage1.nifty.com/nomenclator/perl/
998 Copyright(C) 2001-2002, SADAHIRO Tomoyuki. Japan. All rights reserved.
1000 This library is free software; you can redistribute it
1001 and/or modify it under the same terms as Perl itself.
1007 =item http://www.unicode.org/unicode/reports/tr10/
1009 Unicode Collation Algorithm - UTR #10
1011 =item http://www.unicode.org/unicode/reports/tr10/allkeys.txt
1013 The Default Unicode Collation Element Table
1015 =item http://www.unicode.org/unicode/reports/tr15/
1017 Unicode Normalization Forms - UAX #15
1019 =item http://www.unicode.org/unicode/reports/tr18
1021 Unicode Regular Expression Guidelines - UTR #18
1023 =item L<Unicode::Normalize>