1 package Unicode::Collate;
5 die "Unicode::Collate not ported to EBCDIC\n";
17 our $VERSION = '0.12';
18 our $PACKAGE = __PACKAGE__;
20 our @ISA = qw(Exporter);
22 our %EXPORT_TAGS = ();
26 (our $Path = $INC{'Unicode/Collate.pm'}) =~ s/\.pm$//;
27 our $KeyFile = "allkeys.txt";
31 eval { require Unicode::UCD };
34 $UNICODE_VERSION = Unicode::UCD::UnicodeVersion();
36 else { # XXX, Perl 5.6.1
38 foreach my $d (@INC) {
40 $f = File::Spec->catfile($d, "unicode", "Unicode.301");
42 $UNICODE_VERSION = '3.0.1';
49 our $getCombinClass; # coderef for combining class from Unicode::Normalize
51 use constant Min2 => 0x20; # minimum weight at level 2
52 use constant Min3 => 0x02; # minimum weight at level 3
53 use constant UNDEFINED => 0xFF80; # special value for undefined CE's
55 our $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
57 sub UCA_Version { "8.0" } # returns the fixed string "8.0" (documented below as the UTS #10 version consulted)
59 sub Base_Unicode_Version { $UNICODE_VERSION || 'unknown' } # returns $UNICODE_VERSION when it is true, otherwise the string 'unknown'
67 my $self = bless { @_ }, $class;
69 # alternate lowercased
71 ! exists $self->{alternate} ? 'shifted' : lc($self->{alternate});
73 croak "$PACKAGE unknown alternate tag name: $self->{alternate}"
74 unless $self->{alternate} eq 'blanked'
75 || $self->{alternate} eq 'non-ignorable'
76 || $self->{alternate} eq 'shifted'
77 || $self->{alternate} eq 'shift-trimmed';
82 croak "Illegal level lower than 1 (passed $self->{level})."
83 if $self->{level} < 1;
84 croak "A level higher than 4 (passed $self->{level}) is not supported."
85 if 4 < $self->{level};
87 # overrideHangul and -CJK
88 # If true: CODEREF used; '': default; undef: derived elements
89 $self->{overrideHangul} = ''
90 if ! exists $self->{overrideHangul};
91 $self->{overrideCJK} = ''
92 if ! exists $self->{overrideCJK};
95 $self->{normalization} = 'D'
96 if ! exists $self->{normalization};
99 if (defined $self->{normalization}) {
100 eval { require Unicode::Normalize };
101 croak "Unicode/Normalize.pm is required to normalize strings: $@"
104 Unicode::Normalize->import();
105 $getCombinClass = \&Unicode::Normalize::getCombinClass
106 if ! $getCombinClass;
109 $self->{normalization} =~ /^(?:NF)?C$/ ? \&NFC :
110 $self->{normalization} =~ /^(?:NF)?D$/ ? \&NFD :
111 $self->{normalization} =~ /^(?:NF)?KC$/ ? \&NFKC :
112 $self->{normalization} =~ /^(?:NF)?KD$/ ? \&NFKD :
113 croak "$PACKAGE unknown normalization form name: "
114 . $self->{normalization};
118 # If undef is passed explicitly, no file is read.
119 $self->{table} = $KeyFile
120 if ! exists $self->{table};
122 if defined $self->{table};
124 if ($self->{entry}) {
125 $self->parseEntry($_) foreach split /\n/, $self->{entry};
129 $self->{backwards} ||= [ ];
130 $self->{backwards} = [ $self->{backwards} ]
131 if ! ref $self->{backwards};
134 $self->{rearrange} = $DefaultRearrange
135 if ! exists $self->{rearrange};
136 $self->{rearrange} = []
137 if ! defined $self->{rearrange};
138 croak "$PACKAGE: A list for rearrangement must be store in an ARRAYREF"
139 if ! ref $self->{rearrange};
141 # keys of $self->{rearrangeHash} are $self->{rearrange}.
142 $self->{rearrangeHash} = undef;
144 if (@{ $self->{rearrange} }) {
145 @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
153 my $file = $self->{table} ne '' ? $self->{table} : $KeyFile;
155 my $filepath = File::Spec->catfile($Path, $file);
156 open my $fk, "<$filepath"
157 or croak "File does not exist at $filepath";
162 if (/^\@version\s*(\S*)/) {
163 $self->{version} ||= $1;
165 elsif (/^\@alternate\s+(.*)/) {
166 $self->{alternate} ||= $1;
168 elsif (/^\@backwards\s+(.*)/) {
169 push @{ $self->{backwards} }, $1;
171 elsif (/^\@rearrange\s+(.*)/) {
172 push @{ $self->{rearrange} }, _getHexArray($1);
176 $self->parseEntry($_);
183 ## get $line, parse it, and write an entry in $self
189 my($name, $ele, @key);
191 return if $line !~ /^\s*[0-9A-Fa-f]/;
193 # removes comment and gets name
195 if $line =~ s/[#%]\s*(.*)//;
196 return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
199 my($e, $k) = split /;/, $line;
200 croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
203 my @e = _getHexArray($e);
204 $ele = pack('U*', @e);
205 return if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
208 if (defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/ ||
209 defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/)
211 $self->{entries}{$ele} = $self->{ignored}{$ele} = 1;
214 my $combining = 1; # primary = 0, secondary != 0;
216 foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
217 my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
218 push @key, $self->altCE($var, _getHexArray($arr));
219 $combining = 0 unless $key[-1][0] == 0 && $key[-1][1] != 0;
221 $self->{entries}{$ele} = \@key;
222 $self->{combining}{$ele} = 1 if $combining;
224 $self->{maxlength}{ord $ele} = scalar @e if @e > 1;
229 ## arrayref CE = altCE(bool variable?, list[num] weights)
237 $self->{alternate} eq 'blanked' ?
238 $var ? [0,0,0,$c[3]] : \@c :
239 $self->{alternate} eq 'non-ignorable' ?
241 $self->{alternate} eq 'shifted' ?
242 $var ? [0,0,0,$c[0] ] : [ @c[0..2], $c[0]+$c[1]+$c[2] ? 0xFFFF : 0 ] :
243 $self->{alternate} eq 'shift-trimmed' ?
244 $var ? [0,0,0,$c[0] ] : [ @c[0..2], 0 ] :
245 croak "$PACKAGE unknown alternate name: $self->{alternate}";
249 ## string hex_sortkey = viewSortKey(string arg)
254 my $key = $self->getSortKey(@_);
255 my $view = join " ", map sprintf("%04X", $_), unpack 'n*', $key;
256 $view =~ s/ ?0000 ?/|/g;
262 ## list[strings] elements = splitCE(string arg)
267 my $code = $self->{preprocess};
268 my $norm = $self->{UNF};
269 my $ent = $self->{entries};
270 my $max = $self->{maxlength};
271 my $reH = $self->{rearrangeHash};
273 my $str = ref $code ? &$code(shift) : shift;
274 $str = &$norm($str) if ref $norm;
276 my @src = unpack('U*', $str);
281 for (my $i = 0; $i < @src; $i++) {
282 if (exists $reH->{ $src[$i] } && $i + 1 < @src) {
283 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]);
289 for (my $i = 0; $i < @src; $i++) {
294 next unless defined $u;
295 next if $u < 0 || 0x10FFFF < $u # out of range
296 || (0xD800 <= $u && $u <= 0xDFFF); # unpaired surrogates
297 my $four = $u & 0xFFFF;
298 next if $four == 0xFFFE || $four == 0xFFFF;
300 if ($max->{$u}) { # contract
301 for (my $j = $max->{$u}; $j >= 1; $j--) {
302 next unless $i+$j-1 < @src;
303 $ch = pack 'U*', @src[$i .. $i+$j-1];
304 $i += $j-1, last if $ent->{$ch};
310 # with Combining Char (UTS#10, 4.2.1), here requires Unicode::Normalize.
311 if ($getCombinClass && defined $ch) {
312 for (my $j = $i+1; $j < @src; $j++) {
313 next unless defined $src[$j];
314 last unless $getCombinClass->( $src[$j] );
315 my $comb = pack 'U', $src[$j];
316 next if ! $ent->{ $ch.$comb };
323 wantarray ? @buf : \@buf;
328 ## list[arrayrefs] weight = getWt(string element)
334 my $ent = $self->{entries};
335 my $ign = $self->{ignored};
336 my $cjk = $self->{overrideCJK};
337 my $hang = $self->{overrideHangul};
339 return if !defined $ch || $ign->{$ch}; # ignored
340 return @{ $ent->{$ch} } if $ent->{$ch};
341 my $u = unpack('U', $ch);
343 if (0xAC00 <= $u && $u <= 0xD7A3) { # is_Hangul
349 my $ar = $ent->{pack('U', $v)};
350 $ar ? @$ar : map($self->altCE(0,@$_), _derivCE($v));
352 : map($self->altCE(0,@$_), _derivCE($u));
354 elsif (0x3400 <= $u && $u <= 0x4DB5 ||
355 0x4E00 <= $u && $u <= 0x9FA5 ||
356 0x20000 <= $u && $u <= 0x2A6D6) { # is_CJK
359 : defined $cjk && $u <= 0xFFFF
360 ? $self->altCE(0, ($u, 0x20, 0x02, $u))
361 : map($self->altCE(0,@$_), _derivCE($u));
364 return map($self->altCE(0,@$_), _derivCE($u));
369 ## int = index(string, substring)
374 my $lev = $self->{level};
375 my $comb = $self->{combining};
376 my $str = $self->splitCE(shift);
377 my $sub = $self->splitCE(shift);
379 return wantarray ? (0,0) : 0 if ! @$sub;
380 return wantarray ? () : -1 if ! @$str;
382 my @subWt = grep _ignorableAtLevel($_,$lev),
383 map $self->getWt($_), @$sub;
387 for (my $i = 0; $i < @$str; $i++) {
390 my @tmp = grep _ignorableAtLevel($_,$lev), $self->getWt($str->[$i]);
391 $go_ahead += length $str->[$i];
393 # /*XXX*/ still broken.
394 # index("e\x{300}", "e") should be 'no match' at level 2 or higher
395 # as "e\x{300}" is a *single* grapheme cluster and not equal to "e".
397 # go ahead as far as we find a combining character;
398 while ($i + 1 < @$str &&
399 (! defined $str->[$i+1] || $comb->{ $str->[$i+1] }) ) {
401 $go_ahead += length $str->[$i];
402 next if ! defined $str->[$i];
404 grep _ignorableAtLevel($_,$lev), $self->getWt($str->[$i]);
408 push @strPt, ($count) x @tmp;
411 while (@strWt >= @subWt) {
412 if (_eqArray(\@strWt, \@subWt, $lev)) {
414 return wantarray ? ($pos, $count-$pos) : $pos;
420 return wantarray ? () : -1;
424 ## bool _eqArray(arrayref, arrayref, level)
428 my $a = shift; # length $a >= length $b;
431 for my $v (0..$lev-1) {
432 for my $c (0..@$b-1){
433 return if $a->[$c][$v] != $b->[$c][$v];
441 ## bool _ignorableAtLevel(CE, level)
443 sub _ignorableAtLevel($$)
446 return unless defined $ce;
448 return ! grep { ! $ce->[$_] } 0..$lv-1;
453 ## string sortkey = getSortKey(string arg)
458 my $lev = $self->{level};
459 my $rCE = $self->splitCE(shift); # get an arrayref
462 my @buf = grep defined(), map $self->getWt($_), @$rCE;
465 my @ret = ([],[],[],[]);
466 foreach my $v (0..$lev-1) {
467 foreach my $b (@buf) {
468 push @{ $ret[$v] }, $b->[$v] if $b->[$v];
471 foreach (@{ $self->{backwards} }) {
473 @{ $ret[$v] } = reverse @{ $ret[$v] };
476 # modification of tertiary weights
477 if ($self->{upper_before_lower}) {
478 foreach (@{ $ret[2] }) {
479 if (0x8 <= $_ && $_ <= 0xC) { $_ -= 6 } # lower
480 elsif (0x2 <= $_ && $_ <= 0x6) { $_ += 6 } # upper
481 elsif ($_ == 0x1C) { $_ += 1 } # square upper
482 elsif ($_ == 0x1D) { $_ -= 1 } # square lower
485 if ($self->{katakana_before_hiragana}) {
486 foreach (@{ $ret[2] }) {
487 if (0x0F <= $_ && $_ <= 0x13) { $_ -= 2 } # katakana
488 elsif (0x0D <= $_ && $_ <= 0x0E) { $_ += 5 } # hiragana
491 join "\0\0", map pack('n*', @$_), @ret;
496 ## int compare = cmp(string a, string b)
498 sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) } # three-way string comparison of the two sort keys: -1, 0, or 1
499 sub eq { $_[0]->getSortKey($_[1]) eq $_[0]->getSortKey($_[2]) } # true if both sort keys are identical strings
500 sub ne { $_[0]->getSortKey($_[1]) ne $_[0]->getSortKey($_[2]) } # true if the sort keys differ
501 sub lt { $_[0]->getSortKey($_[1]) lt $_[0]->getSortKey($_[2]) } # true if the first sort key is stringwise less than the second
502 sub le { $_[0]->getSortKey($_[1]) le $_[0]->getSortKey($_[2]) } # true if the first sort key is stringwise less than or equal to the second
503 sub gt { $_[0]->getSortKey($_[1]) gt $_[0]->getSortKey($_[2]) } # true if the first sort key is stringwise greater than the second
504 sub ge { $_[0]->getSortKey($_[1]) ge $_[0]->getSortKey($_[2]) } # true if the first sort key is stringwise greater than or equal to the second
507 ## list[strings] sorted = sort(list[strings] arg)
513 sort{ $a->[0] cmp $b->[0] }
514 map [ $obj->getSortKey($_), $_ ], @_;
518 ## list[arrayrefs] CE = _derivCE(int codepoint)
522 my $a = UNDEFINED + ($code >> 15); # ok
523 my $b = ($code & 0x7FFF) | 0x8000; # ok
524 # my $a = 0xFFC2 + ($code >> 15); # ng
525 # my $b = $code & 0x7FFF | 0x1000; # ng
526 $b ? ([$a,2,1,$code],[$b,0,0,$code]) : [$a,2,1,$code];
530 ## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
532 sub _getHexArray { map hex, $_[0] =~ /([0-9a-fA-F]+)/g } # every run of hex digits in $_[0] becomes one number, in order; any other characters act as separators
535 # $code must be in Hangul syllable.
536 # Check it before you enter here.
540 my $SIndex = $code - 0xAC00;
541 my $LIndex = int( $SIndex / 588);
542 my $VIndex = int(($SIndex % 588) / 28);
543 my $TIndex = $SIndex % 28;
547 $TIndex ? (0x11A7 + $TIndex) : (),
556 Unicode::Collate - Unicode Collation Algorithm
560 use Unicode::Collate;
563 $Collator = Unicode::Collate->new(%tailoring);
566 @sorted = $Collator->sort(@not_sorted);
569 $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
573 =head2 Constructor and Tailoring
575 The C<new> method returns a collator object.
577 $Collator = Unicode::Collate->new(
578 alternate => $alternate,
579 backwards => $levelNumber, # or \@levelNumbers
581 normalization => $normalization_form,
582 ignoreName => qr/$ignoreName/,
583 ignoreChar => qr/$ignoreChar/,
584 katakana_before_hiragana => $bool,
585 level => $collationLevel,
586 overrideCJK => \&overrideCJK,
587 overrideHangul => \&overrideHangul,
588 preprocess => \&preprocess,
589 rearrange => \@charList,
591 undefName => qr/$undefName/,
592 undefChar => qr/$undefChar/,
593 upper_before_lower => $bool,
595 # if %tailoring is false (i.e. empty),
596 # $Collator should do the default collation.
602 -- see 3.2.2 Alternate Weighting, UTR #10.
604 This key selects the alternate weighting method for variable collation elements,
605 which are marked with an ASTERISK in the table
606 (NOTE: Many punctuation marks and symbols are variable in F<allkeys.txt>).
608 alternate => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
610 These names are case-insensitive.
611 By default (if specification is omitted), 'shifted' is adopted.
613 'Blanked' Variable elements are ignorable at levels 1 through 3;
614 considered at the 4th level.
616 'Non-ignorable' Variable elements are not reset to ignorable.
618 'Shifted' Variable elements are ignorable at levels 1 through 3;
619 their level 4 weight is replaced by the old level 1 weight.
620 Level 4 weight for Non-Variable elements is 0xFFFF.
622 'Shift-Trimmed' Same as 'shifted', but all FFFF's at the 4th level
627 -- see 3.1.2 French Accents, UTR #10.
629 backwards => $levelNumber or \@levelNumbers
631 Weights in reverse order; ex. level 2 (diacritic ordering) in French.
632 If omitted, forwards at all the levels.
636 -- see 3.1 Linguistic Features; 3.2.1 File Format, UTR #10.
638 Overrides a default order or defines additional collation elements
640 entry => <<'ENTRIES', # use the UCA file format
641 00E6 ; [.0861.0020.0002.00E6] [.08B1.0020.0002.00E6] # ligature <ae> as <a><e>
642 0063 0068 ; [.0893.0020.0002.0063] # "ch" in traditional Spanish
643 0043 0068 ; [.0893.0020.0008.0043] # "Ch" in traditional Spanish
650 -- see Completely Ignorable, 3.2.2 Alternate Weighting, UTR #10.
652 Makes the entry in the table ignorable.
653 If a collation element is ignorable,
654 it is ignored as if the element had been deleted from there.
656 E.g. when 'a' and 'e' are ignorable,
657 'element' is equal to 'lament' (or 'lmnt').
661 -- see 4.3 Form a sort key for each string, UTR #10.
663 Set the maximum level.
664 Any higher levels than the specified one are ignored.
666 Level 1: alphabetic ordering
667 Level 2: diacritic ordering
668 Level 3: case ordering
669 Level 4: tie-breaking (e.g. in the case when alternate is 'shifted')
673 If omitted, the maximum is the 4th.
677 -- see 4.1 Normalize each input string, UTR #10.
679 If specified, strings are normalized before preparation of sort keys
680 (the normalization is executed after preprocess).
682 As a form name, one of the following names must be used.
684 'C' or 'NFC' for Normalization Form C
685 'D' or 'NFD' for Normalization Form D
686 'KC' or 'NFKC' for Normalization Form KC
687 'KD' or 'NFKD' for Normalization Form KD
689 If omitted, the string is put into Normalization Form D.
691 If C<undef> is passed explicitly as the value for this key,
692 no normalization is carried out (this may make tailoring easier
693 if normalization is not desired).
699 -- see 7.1 Derived Collation Elements, UTR #10.
701 By default, mapping of CJK Unified Ideographs
702 uses the Unicode codepoint order.
703 But the mapping of CJK Unified Ideographs may be overridden.
705 ex. CJK Unified Ideographs in the JIS code point order.
708 my $u = shift; # get a Unicode codepoint
709 my $b = pack('n', $u); # to UTF-16BE
710 my $s = your_unicode_to_sjis_converter($b); # convert
711 my $n = unpack('n', $s); # convert sjis to short
712 [ $n, 0x20, 0x2, $u ]; # return the collation element
715 ex. ignores all CJK Unified Ideographs.
717 overrideCJK => sub {()}, # CODEREF returning empty list
719 # where ->eq("Pe\x{4E00}rl", "Perl") is true
720 # as U+4E00 is a CJK Unified Ideograph and to be ignorable.
722 If C<undef> is passed explicitly as the value for this key,
723 weights for CJK Unified Ideographs are treated as undefined.
724 But assignment of weight for CJK Unified Ideographs
725 in table or L<entry> is still valid.
729 -- see 7.1 Derived Collation Elements, UTR #10.
731 By default, Hangul Syllables are decomposed into Hangul Jamo.
732 But the mapping of Hangul Syllables may be overridden.
734 This tag works like L<overrideCJK>, so see there for examples.
736 If you want to override the mapping of Hangul Syllables,
737 the Normalization Forms D and KD are not appropriate
738 (they will be decomposed before overriding).
740 If C<undef> is passed explicitly as the value for this key,
741 weight for Hangul Syllables is treated as undefined
742 without decomposition into Hangul Jamo.
743 But definition of weight for Hangul Syllables
744 in table or L<entry> is still valid.
748 -- see 5.1 Preprocessing, UTR #10.
750 If specified, the coderef is used to preprocess
751 before the formation of sort keys.
753 ex. dropping English articles, such as "a" or "the".
754 Then, "the pen" is before "a pencil".
758 $str =~ s/\b(?:an?|the)\s+//gi;
764 -- see 3.1.3 Rearrangement, UTR #10.
766 Characters that are not coded in logical order and are to be rearranged.
769 rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
771 If you want to disallow any rearrangement,
772 pass C<undef> or C<[]> (a reference to an empty list)
773 as the value for this key.
777 -- see 3.2 Default Unicode Collation Element Table, UTR #10.
779 You can use another element table if desired.
780 The table file must be in your C<lib/Unicode/Collate> directory.
782 By default, the file C<lib/Unicode/Collate/allkeys.txt> is used.
784 If C<undef> is passed explicitly as the value for this key,
785 no file is read (but you can define collation elements via L<entry>).
787 A typical way to define a collation element table
788 without any file of table:
790 $onlyABC = Unicode::Collate->new(
792 entry => << 'ENTRIES',
793 0061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A
794 0041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A
795 0062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B
796 0042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B
797 0063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C
798 0043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C
806 -- see 6.3.4 Reducing the Repertoire, UTR #10.
808 Undefines the collation element as if it were unassigned in the table.
809 This reduces the size of the table.
810 If an unassigned character appears in the string to be collated,
811 the sort key is made from its codepoint
812 as a single-character collation element,
813 as it is greater than any other assigned collation elements
814 (in the codepoint order among the unassigned characters).
815 But, it'd be better to ignore characters
816 unfamiliar to you and maybe never used.
818 =item katakana_before_hiragana
820 =item upper_before_lower
822 -- see 6.6 Case Comparisons; 7.3.1 Tertiary Weight Table, UTR #10.
824 By default, lowercase is before uppercase
825 and hiragana is before katakana.
827 If the tag is made true, this is reversed.
829 B<NOTE>: These tags simplemindedly assume
830 any lowercase/uppercase or hiragana/katakana distinctions
831 should occur in level 3, and their weights at level 3
832 should be same as those mentioned in 7.3.1, UTR #10.
833 If you define collation elements that violate this,
834 these tags do not work correctly.
838 =head2 Methods for Collation
842 =item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
844 Sorts a list of strings.
846 =item C<$result = $Collator-E<gt>cmp($a, $b)>
848 Returns 1 (when C<$a> is greater than C<$b>)
849 or 0 (when C<$a> is equal to C<$b>)
850 or -1 (when C<$a> is less than C<$b>).
852 =item C<$result = $Collator-E<gt>eq($a, $b)>
854 =item C<$result = $Collator-E<gt>ne($a, $b)>
856 =item C<$result = $Collator-E<gt>lt($a, $b)>
858 =item C<$result = $Collator-E<gt>le($a, $b)>
860 =item C<$result = $Collator-E<gt>gt($a, $b)>
862 =item C<$result = $Collator-E<gt>ge($a, $b)>
864 They work like the Perl operators of the same names.
866 eq : whether $a is equal to $b.
867 ne : whether $a is not equal to $b.
868 lt : whether $a is less than $b.
869 le : whether $a is less than $b or equal to $b.
870 gt : whether $a is greater than $b.
871 ge : whether $a is greater than $b or equal to $b.
873 =item C<$sortKey = $Collator-E<gt>getSortKey($string)>
875 -- see 4.3 Form a sort key for each string, UTR #10.
879 You compare the sort keys using a binary comparison
880 and get the result of the comparison of the strings using UCA.
882 $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
886 $Collator->cmp($a, $b)
888 =item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)>
890 Returns a string formalized to display a sort key.
891 Weights are enclosed with C<'['> and C<']'>
892 and level boundaries are denoted by C<'|'>.
894 use Unicode::Collate;
895 my $c = Unicode::Collate->new();
896 print $c->viewSortKey("Perl"),"\n";
899 # [09B3 08B1 09CB 094F|0020 0020 0020 0020|0008 0002 0002 0002|FFFF FFFF FFFF FFFF]
900 # Level 1 Level 2 Level 3 Level 4
902 =item C<$position = $Collator-E<gt>index($string, $substring)>
904 =item C<($position, $length) = $Collator-E<gt>index($string, $substring)>
906 -- see 6.8 Searching, UTR #10.
908 If C<$substring> matches a part of C<$string>, returns
909 the position of the first occurrence of the matching part in scalar context;
910 in list context, returns a two-element list of
911 the position and the length of the matching part.
913 B<Notice> that the length of the matching part may differ from
914 the length of C<$substring>.
916 B<Note> that the position and the length are counted on the string
917 after the process of preprocess, normalization, and rearrangement.
918 Therefore, in case the specified string is not binary equal to
919 the preprocessed/normalized/rearranged string, the position and the length
920 may differ from those on the specified string. But it is guaranteed
921 that, if matched, it returns a non-negative value as C<$position>.
923 If C<$substring> does not match any part of C<$string>,
924 returns C<-1> in scalar context and
925 an empty list in list context.
929 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
930 my $str = "Ich mu\x{00DF} studieren.";
931 my $sub = "m\x{00FC}ss";
933 if (my($pos,$len) = $Collator->index($str, $sub)) {
934 $match = substr($str, $pos, $len);
937 and get C<"mu\x{00DF}"> in C<$match> since C<"mu>E<223>C<">
938 is primary equal to C<"m>E<252>C<ss">.
948 Returns the version number of Unicode Technical Standard 10
949 this module consults.
951 =item Base_Unicode_Version
953 Returns the version number of the Unicode Standard
954 this module is based on.
964 Unicode::Collate has not been ported to EBCDIC. The code mostly would
965 work just fine but a decision needs to be made: how the module should
966 work in EBCDIC? Should the low 256 characters be understood as
967 Unicode or as EBCDIC code points? Should one be chosen or should
968 there be a way to do either? Or should such translation be left
969 outside the module for the user to do, for example by using
971 (or utf8::unicode_to_native()/utf8::native_to_unicode()?)
975 Use of the C<normalization> parameter requires
976 the B<Unicode::Normalize> module.
978 If you do not need it (say, when you do not need to
979 handle any combining characters),
980 assign C<normalization =E<gt> undef> explicitly.
982 -- see 6.5 Avoiding Normalization, UTR #10.
986 C<index()> is an experimental method and
987 its return value may be unreliable.
988 The correct implementation for C<index()> must be based
989 on Locale-Sensitive Support: Level 3 in UTR #18,
990 F<Unicode Regular Expression Guidelines>.
992 See also 4.2 Locale-Dependent Graphemes in UTR #18.
996 SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
998 http://homepage1.nifty.com/nomenclator/perl/
1000 Copyright(C) 2001-2002, SADAHIRO Tomoyuki. Japan. All rights reserved.
1002 This library is free software; you can redistribute it
1003 and/or modify it under the same terms as Perl itself.
1009 =item http://www.unicode.org/unicode/reports/tr10/
1011 Unicode Collation Algorithm - UTR #10
1013 =item http://www.unicode.org/unicode/reports/tr10/allkeys.txt
1015 The Default Unicode Collation Element Table
1017 =item http://www.unicode.org/unicode/reports/tr15/
1019 Unicode Normalization Forms - UAX #15
1021 =item http://www.unicode.org/unicode/reports/tr18
1023 Unicode Regular Expression Guidelines - UTR #18
1025 =item L<Unicode::Normalize>