1 package Unicode::Collate;
5 die "Unicode::Collate not ported to EBCDIC\n";
16 our $VERSION = '0.11';
17 our $PACKAGE = __PACKAGE__;
19 our @ISA = qw(Exporter);
21 our %EXPORT_TAGS = ();
25 (our $Path = $INC{'Unicode/Collate.pm'}) =~ s/\.pm$//;
26 our $KeyFile = "allkeys.txt";
32 foreach my $d (@INC) {
34 $f = File::Spec->catfile($d, "unicore", "version");
36 chomp($UNICODE_VERSION = <$fh>);
42 $f = File::Spec->catfile($d, "unicode", "Unicode.301");
44 $UNICODE_VERSION = "3.0.1";
51 our $getCombinClass; # coderef for combining class from Unicode::Normalize
53 use constant Min2 => 0x20; # minimum weight at level 2
54 use constant Min3 => 0x02; # minimum weight at level 3
55 use constant UNDEFINED => 0xFF80; # special value for undefined CE's
57 our $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
59 sub UCA_Version { "8.0" } # version number of UTS #10 (the collation algorithm) this module consults
61 sub Base_Unicode_Version { $UNICODE_VERSION } # version of the Unicode Standard this module is based on
69 my $self = bless { @_ }, $class;
71 # alternate lowercased
73 ! exists $self->{alternate} ? 'shifted' : lc($self->{alternate});
75 croak "$PACKAGE unknown alternate tag name: $self->{alternate}"
76 unless $self->{alternate} eq 'blanked'
77 || $self->{alternate} eq 'non-ignorable'
78 || $self->{alternate} eq 'shifted'
79 || $self->{alternate} eq 'shift-trimmed';
84 croak "Illegal level lower than 1 (passed $self->{level})."
85 if $self->{level} < 1;
86 croak "A level higher than 4 (passed $self->{level}) is not supported."
87 if 4 < $self->{level};
89 # overrideHangul and -CJK
90 # If true: CODEREF used; '': default; undef: derived elements
91 $self->{overrideHangul} = ''
92 if ! exists $self->{overrideHangul};
93 $self->{overrideCJK} = ''
94 if ! exists $self->{overrideCJK};
97 $self->{normalization} = 'D'
98 if ! exists $self->{normalization};
101 if (defined $self->{normalization}) {
102 eval { require Unicode::Normalize };
103 croak "Unicode/Normalize.pm is required to normalize strings: $@"
106 Unicode::Normalize->import();
107 $getCombinClass = \&Unicode::Normalize::getCombinClass
108 if ! $getCombinClass;
111 $self->{normalization} =~ /^(?:NF)?C$/ ? \&NFC :
112 $self->{normalization} =~ /^(?:NF)?D$/ ? \&NFD :
113 $self->{normalization} =~ /^(?:NF)?KC$/ ? \&NFKC :
114 $self->{normalization} =~ /^(?:NF)?KD$/ ? \&NFKD :
115 croak "$PACKAGE unknown normalization form name: "
116 . $self->{normalization};
120 # If undef is passed explicitly, no file is read.
121 $self->{table} = $KeyFile
122 if ! exists $self->{table};
124 if defined $self->{table};
126 if ($self->{entry}) {
127 $self->parseEntry($_) foreach split /\n/, $self->{entry};
131 $self->{backwards} ||= [ ];
132 $self->{backwards} = [ $self->{backwards} ]
133 if ! ref $self->{backwards};
136 $self->{rearrange} = $DefaultRearrange
137 if ! exists $self->{rearrange};
138 $self->{rearrange} = []
139 if ! defined $self->{rearrange};
140 croak "$PACKAGE: A list for rearrangement must be store in an ARRAYREF"
141 if ! ref $self->{rearrange};
143 # keys of $self->{rearrangeHash} are $self->{rearrange}.
144 $self->{rearrangeHash} = undef;
146 if (@{ $self->{rearrange} }) {
147 @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
155 my $file = $self->{table} ne '' ? $self->{table} : $KeyFile;
157 open my $fk, "<$Path/$file"
158 or croak "File does not exist at $Path/$file";
163 if (/^\@version\s*(\S*)/) {
164 $self->{version} ||= $1;
166 elsif (/^\@alternate\s+(.*)/) {
167 $self->{alternate} ||= $1;
169 elsif (/^\@backwards\s+(.*)/) {
170 push @{ $self->{backwards} }, $1;
172 elsif (/^\@rearrange\s+(.*)/) {
173 push @{ $self->{rearrange} }, _getHexArray($1);
177 $self->parseEntry($_);
184 ## get $line, parse it, and write an entry in $self
190 my($name, $ele, @key);
192 return if $line !~ /^\s*[0-9A-Fa-f]/;
194 # removes comment and gets name
196 if $line =~ s/[#%]\s*(.*)//;
197 return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
200 my($e, $k) = split /;/, $line;
201 croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
204 my @e = _getHexArray($e);
205 $ele = pack('U*', @e);
206 return if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
209 if (defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/ ||
210 defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/)
212 $self->{entries}{$ele} = $self->{ignored}{$ele} = 1;
215 my $combining = 1; # primary = 0, secondary != 0;
217 foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
218 my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
219 push @key, $self->altCE($var, _getHexArray($arr));
220 $combining = 0 unless $key[-1][0] == 0 && $key[-1][1] != 0;
222 $self->{entries}{$ele} = \@key;
223 $self->{combining}{$ele} = 1 if $combining;
225 $self->{maxlength}{ord $ele} = scalar @e if @e > 1;
230 ## arrayref CE = altCE(bool variable?, list[num] weights)
238 $self->{alternate} eq 'blanked' ?
239 $var ? [0,0,0,$c[3]] : \@c :
240 $self->{alternate} eq 'non-ignorable' ?
242 $self->{alternate} eq 'shifted' ?
243 $var ? [0,0,0,$c[0] ] : [ @c[0..2], $c[0]+$c[1]+$c[2] ? 0xFFFF : 0 ] :
244 $self->{alternate} eq 'shift-trimmed' ?
245 $var ? [0,0,0,$c[0] ] : [ @c[0..2], 0 ] :
246 croak "$PACKAGE unknown alternate name: $self->{alternate}";
250 ## string hex_sortkey = viewSortKey(string arg)
255 my $key = $self->getSortKey(@_);
256 my $view = join " ", map sprintf("%04X", $_), unpack 'n*', $key;
257 $view =~ s/ ?0000 ?/|/g;
263 ## list[strings] elements = splitCE(string arg)
268 my $code = $self->{preprocess};
269 my $norm = $self->{UNF};
270 my $ent = $self->{entries};
271 my $max = $self->{maxlength};
272 my $reH = $self->{rearrangeHash};
274 my $str = ref $code ? &$code(shift) : shift;
275 $str = &$norm($str) if ref $norm;
277 my @src = unpack('U*', $str);
282 for (my $i = 0; $i < @src; $i++) {
283 if (exists $reH->{ $src[$i] } && $i + 1 < @src) {
284 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]);
290 for (my $i = 0; $i < @src; $i++) {
295 next unless defined $u;
296 next if $u < 0 || 0x10FFFF < $u # out of range
297 || (0xD800 <= $u && $u <= 0xDFFF); # unpaired surrogates
298 my $four = $u & 0xFFFF;
299 next if $four == 0xFFFE || $four == 0xFFFF;
301 if ($max->{$u}) { # contract
302 for (my $j = $max->{$u}; $j >= 1; $j--) {
303 next unless $i+$j-1 < @src;
304 $ch = pack 'U*', @src[$i .. $i+$j-1];
305 $i += $j-1, last if $ent->{$ch};
311 # with Combining Char (UTS#10, 4.2.1), here requires Unicode::Normalize.
312 if ($getCombinClass && defined $ch) {
313 for (my $j = $i+1; $j < @src; $j++) {
314 next unless defined $src[$j];
315 last unless $getCombinClass->( $src[$j] );
316 my $comb = pack 'U', $src[$j];
317 next if ! $ent->{ $ch.$comb };
324 wantarray ? @buf : \@buf;
329 ## list[arrayrefs] weight = getWt(string element)
335 my $ent = $self->{entries};
336 my $ign = $self->{ignored};
337 my $cjk = $self->{overrideCJK};
338 my $hang = $self->{overrideHangul};
340 return if !defined $ch || $ign->{$ch}; # ignored
341 return @{ $ent->{$ch} } if $ent->{$ch};
342 my $u = unpack('U', $ch);
344 if (0xAC00 <= $u && $u <= 0xD7A3) { # is_Hangul
350 my $ar = $ent->{pack('U', $v)};
351 $ar ? @$ar : map($self->altCE(0,@$_), _derivCE($v));
353 : map($self->altCE(0,@$_), _derivCE($u));
355 elsif (0x3400 <= $u && $u <= 0x4DB5 ||
356 0x4E00 <= $u && $u <= 0x9FA5 ||
357 0x20000 <= $u && $u <= 0x2A6D6) { # is_CJK
360 : defined $cjk && $u <= 0xFFFF
361 ? $self->altCE(0, ($u, 0x20, 0x02, $u))
362 : map($self->altCE(0,@$_), _derivCE($u));
365 return map($self->altCE(0,@$_), _derivCE($u));
370 ## int = index(string, substring)
375 my $lev = $self->{level};
376 my $comb = $self->{combining};
377 my $str = $self->splitCE(shift);
378 my $sub = $self->splitCE(shift);
380 return wantarray ? (0,0) : 0 if ! @$sub;
381 return wantarray ? () : -1 if ! @$str;
383 my @subWt = grep _ignorableAtLevel($_,$lev),
384 map $self->getWt($_), @$sub;
388 for (my $i = 0; $i < @$str; $i++) {
391 my @tmp = grep _ignorableAtLevel($_,$lev), $self->getWt($str->[$i]);
392 $go_ahead += length $str->[$i];
394 # /*XXX*/ still broken.
395 # index("e\x{300}", "e") should be 'no match' at level 2 or higher
396 # as "e\x{300}" is a *single* grapheme cluster and not equal to "e".
398 # go ahead as far as we find a combining character;
399 while ($i + 1 < @$str &&
400 (! defined $str->[$i+1] || $comb->{ $str->[$i+1] }) ) {
402 $go_ahead += length $str->[$i];
403 next if ! defined $str->[$i];
405 grep _ignorableAtLevel($_,$lev), $self->getWt($str->[$i]);
409 push @strPt, ($count) x @tmp;
412 while (@strWt >= @subWt) {
413 if (_eqArray(\@strWt, \@subWt, $lev)) {
415 return wantarray ? ($pos, $count-$pos) : $pos;
421 return wantarray ? () : -1;
425 ## bool _eqArray(arrayref, arrayref, level)
429 my $a = shift; # length $a >= length $b;
432 for my $v (0..$lev-1) {
433 for my $c (0..@$b-1){
434 return if $a->[$c][$v] != $b->[$c][$v];
442 ## bool _ignorableAtLevel(CE, level)
444 sub _ignorableAtLevel($$)
447 return unless defined $ce;
449 return ! grep { ! $ce->[$_] } 0..$lv-1;
454 ## string sortkey = getSortKey(string arg)
459 my $lev = $self->{level};
460 my $rCE = $self->splitCE(shift); # get an arrayref
463 my @buf = grep defined(), map $self->getWt($_), @$rCE;
466 my @ret = ([],[],[],[]);
467 foreach my $v (0..$lev-1) {
468 foreach my $b (@buf) {
469 push @{ $ret[$v] }, $b->[$v] if $b->[$v];
472 foreach (@{ $self->{backwards} }) {
474 @{ $ret[$v] } = reverse @{ $ret[$v] };
477 # modification of tertiary weights
478 if ($self->{upper_before_lower}) {
479 foreach (@{ $ret[2] }) {
480 if (0x8 <= $_ && $_ <= 0xC) { $_ -= 6 } # lower
481 elsif (0x2 <= $_ && $_ <= 0x6) { $_ += 6 } # upper
482 elsif ($_ == 0x1C) { $_ += 1 } # square upper
483 elsif ($_ == 0x1D) { $_ -= 1 } # square lower
486 if ($self->{katakana_before_hiragana}) {
487 foreach (@{ $ret[2] }) {
488 if (0x0F <= $_ && $_ <= 0x13) { $_ -= 2 } # katakana
489 elsif (0x0D <= $_ && $_ <= 0x0E) { $_ += 5 } # hiragana
492 join "\0\0", map pack('n*', @$_), @ret;
497 ## int compare = cmp(string a, string b)
499 sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) } # three-way string comparison of the two sort keys: -1, 0 or 1
500 sub eq { $_[0]->getSortKey($_[1]) eq $_[0]->getSortKey($_[2]) } # true if both strings produce the same sort key
501 sub ne { $_[0]->getSortKey($_[1]) ne $_[0]->getSortKey($_[2]) } # true if the two sort keys differ
502 sub lt { $_[0]->getSortKey($_[1]) lt $_[0]->getSortKey($_[2]) } # true if the first sort key is stringwise less than the second
503 sub le { $_[0]->getSortKey($_[1]) le $_[0]->getSortKey($_[2]) } # true if the first sort key is stringwise less than or equal to the second
504 sub gt { $_[0]->getSortKey($_[1]) gt $_[0]->getSortKey($_[2]) } # true if the first sort key is stringwise greater than the second
505 sub ge { $_[0]->getSortKey($_[1]) ge $_[0]->getSortKey($_[2]) } # true if the first sort key is stringwise greater than or equal to the second
508 ## list[strings] sorted = sort(list[strings] arg)
514 sort{ $a->[0] cmp $b->[0] }
515 map [ $obj->getSortKey($_), $_ ], @_;
519 ## list[arrayrefs] CE = _derivCE(int codepoint)
523 my $a = UNDEFINED + ($code >> 15); # ok
524 my $b = ($code & 0x7FFF) | 0x8000; # ok
525 # my $a = 0xFFC2 + ($code >> 15); # ng
526 # my $b = $code & 0x7FFF | 0x1000; # ng
527 $b ? ([$a,2,1,$code],[$b,0,0,$code]) : [$a,2,1,$code];
531 ## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
533 sub _getHexArray { map hex, $_[0] =~ /([0-9a-fA-F]+)/g } # every run of hex digits in the argument, converted with hex(); other characters act as separators
536 # $code must be in Hangul syllable.
537 # Check it before you enter here.
541 my $SIndex = $code - 0xAC00;
542 my $LIndex = int( $SIndex / 588);
543 my $VIndex = int(($SIndex % 588) / 28);
544 my $TIndex = $SIndex % 28;
548 $TIndex ? (0x11A7 + $TIndex) : (),
557 Unicode::Collate - Unicode Collation Algorithm
561 use Unicode::Collate;
564 $Collator = Unicode::Collate->new(%tailoring);
567 @sorted = $Collator->sort(@not_sorted);
570 $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
574 =head2 Constructor and Tailoring
576 The C<new> method returns a collator object.
578 $Collator = Unicode::Collate->new(
579 alternate => $alternate,
580 backwards => $levelNumber, # or \@levelNumbers
582 normalization => $normalization_form,
583 ignoreName => qr/$ignoreName/,
584 ignoreChar => qr/$ignoreChar/,
585 katakana_before_hiragana => $bool,
586 level => $collationLevel,
587 overrideCJK => \&overrideCJK,
588 overrideHangul => \&overrideHangul,
589 preprocess => \&preprocess,
590 rearrange => \@charList,
592 undefName => qr/$undefName/,
593 undefChar => qr/$undefChar/,
594 upper_before_lower => $bool,
596 # if %tailoring is false (i.e. empty),
597 # $Collator should do the default collation.
603 -- see 3.2.2 Alternate Weighting, UTR #10.
605 This key selects the alternate weighting applied to variable collation elements,
606 which are marked with an ASTERISK in the table
607 (NOTE: Many punctuation marks and symbols are variable in F<allkeys.txt>).
609 alternate => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
611 These names are case-insensitive.
612 By default (if specification is omitted), 'shifted' is adopted.
614 'Blanked' Variable elements are ignorable at levels 1 through 3;
615 considered at the 4th level.
617 'Non-ignorable' Variable elements are not reset to ignorable.
619 'Shifted' Variable elements are ignorable at levels 1 through 3;
620 their level 4 weight is replaced by the old level 1 weight.
621 Level 4 weight for Non-Variable elements is 0xFFFF.
623 'Shift-Trimmed' Same as 'shifted', but all FFFF's at the 4th level
628 -- see 3.1.2 French Accents, UTR #10.
630 backwards => $levelNumber or \@levelNumbers
632 Weights in reverse order; ex. level 2 (diacritic ordering) in French.
633 If omitted, forwards at all the levels.
637 -- see 3.1 Linguistic Features; 3.2.1 File Format, UTR #10.
639 Overrides a default order or defines additional collation elements
641 entry => <<'ENTRIES', # use the UCA file format
642 00E6 ; [.0861.0020.0002.00E6] [.08B1.0020.0002.00E6] # ligature <ae> as <a><e>
643 0063 0068 ; [.0893.0020.0002.0063] # "ch" in traditional Spanish
644 0043 0068 ; [.0893.0020.0008.0043] # "Ch" in traditional Spanish
651 -- see Completely Ignorable, 3.2.2 Alternate Weighting, UTR #10.
653 Makes the entry in the table ignorable.
654 If a collation element is ignorable,
655 it is ignored as if the element had been deleted from there.
657 E.g. when 'a' and 'e' are ignorable,
658 'element' is equal to 'lament' (or 'lmnt').
662 -- see 4.3 Form a sort key for each string, UTR #10.
664 Set the maximum level.
665 Any higher levels than the specified one are ignored.
667 Level 1: alphabetic ordering
668 Level 2: diacritic ordering
669 Level 3: case ordering
670 Level 4: tie-breaking (e.g. in the case when alternate is 'shifted')
674 If omitted, the maximum is the 4th.
678 -- see 4.1 Normalize each input string, UTR #10.
680 If specified, strings are normalized before preparation of sort keys
681 (the normalization is executed after preprocess).
683 As a form name, one of the following names must be used.
685 'C' or 'NFC' for Normalization Form C
686 'D' or 'NFD' for Normalization Form D
687 'KC' or 'NFKC' for Normalization Form KC
688 'KD' or 'NFKD' for Normalization Form KD
690 If omitted, the string is put into Normalization Form D.
692 If C<undef> is passed explicitly as the value for this key,
693 no normalization is carried out (this may make tailoring easier
694 if normalization is not desired).
700 -- see 7.1 Derived Collation Elements, UTR #10.
702 By default, mapping of CJK Unified Ideographs
703 uses the Unicode codepoint order.
704 But the mapping of CJK Unified Ideographs may be overridden.
706 ex. CJK Unified Ideographs in the JIS code point order.
709 my $u = shift; # get a Unicode codepoint
710 my $b = pack('n', $u); # to UTF-16BE
711 my $s = your_unicode_to_sjis_converter($b); # convert
712 my $n = unpack('n', $s); # convert sjis to short
713 [ $n, 0x20, 0x2, $u ]; # return the collation element
716 ex. ignores all CJK Unified Ideographs.
718 overrideCJK => sub {()}, # CODEREF returning empty list
720 # where ->eq("Pe\x{4E00}rl", "Perl") is true
721 # as U+4E00 is a CJK Unified Ideograph and is therefore ignored.
723 If C<undef> is passed explicitly as the value for this key,
724 weights for CJK Unified Ideographs are treated as undefined.
725 But assignment of weight for CJK Unified Ideographs
726 in table or L<entry> is still valid.
730 -- see 7.1 Derived Collation Elements, UTR #10.
732 By default, Hangul Syllables are decomposed into Hangul Jamo.
733 But the mapping of Hangul Syllables may be overridden.
735 This tag works like L<overrideCJK>, so see there for examples.
737 If you want to override the mapping of Hangul Syllables,
738 the Normalization Forms D and KD are not appropriate
739 (they will be decomposed before overriding).
741 If C<undef> is passed explicitly as the value for this key,
742 weight for Hangul Syllables is treated as undefined
743 without decomposition into Hangul Jamo.
744 But definition of weight for Hangul Syllables
745 in table or L<entry> is still valid.
749 -- see 5.1 Preprocessing, UTR #10.
751 If specified, the coderef is used to preprocess
752 before the formation of sort keys.
754 ex. dropping English articles, such as "a" or "the".
755 Then, "the pen" is before "a pencil".
759 $str =~ s/\b(?:an?|the)\s+//gi;
765 -- see 3.1.3 Rearrangement, UTR #10.
767 Characters that are not coded in logical order and are to be rearranged.
770 rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
772 If you want to disallow any rearrangement,
773 pass C<undef> or C<[]> (a reference to an empty list)
774 as the value for this key.
778 -- see 3.2 Default Unicode Collation Element Table, UTR #10.
780 You can use another element table if desired.
781 The table file must be in your C<lib/Unicode/Collate> directory.
783 By default, the file C<lib/Unicode/Collate/allkeys.txt> is used.
785 If C<undef> is passed explicitly as the value for this key,
786 no file is read (but you can define collation elements via L<entry>).
788 A typical way to define a collation element table
789 without any file of table:
791 $onlyABC = Unicode::Collate->new(
793 entry => << 'ENTRIES',
794 0061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A
795 0041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A
796 0062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B
797 0042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B
798 0063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C
799 0043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C
807 -- see 6.3.4 Reducing the Repertoire, UTR #10.
809 Undefines the collation element as if it were unassigned in the table.
810 This reduces the size of the table.
811 If an unassigned character appears in the string to be collated,
812 the sort key is made from its codepoint
813 as a single-character collation element,
814 as it is greater than any other assigned collation elements
815 (in the codepoint order among the unassigned characters).
816 But, it'd be better to ignore characters
817 unfamiliar to you and maybe never used.
819 =item katakana_before_hiragana
821 =item upper_before_lower
823 -- see 6.6 Case Comparisons; 7.3.1 Tertiary Weight Table, UTR #10.
825 By default, lowercase is before uppercase
826 and hiragana is before katakana.
828 If the tag is made true, this is reversed.
830 B<NOTE>: These tags simplemindedly assume
831 any lowercase/uppercase or hiragana/katakana distinctions
832 should occur in level 3, and their weights at level 3
833 should be same as those mentioned in 7.3.1, UTR #10.
834 If you define collation elements that violate this,
835 these tags do not work correctly.
839 =head2 Methods for Collation
843 =item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
845 Sorts a list of strings.
847 =item C<$result = $Collator-E<gt>cmp($a, $b)>
849 Returns 1 (when C<$a> is greater than C<$b>)
850 or 0 (when C<$a> is equal to C<$b>)
851 or -1 (when C<$a> is less than C<$b>).
853 =item C<$result = $Collator-E<gt>eq($a, $b)>
855 =item C<$result = $Collator-E<gt>ne($a, $b)>
857 =item C<$result = $Collator-E<gt>lt($a, $b)>
859 =item C<$result = $Collator-E<gt>le($a, $b)>
861 =item C<$result = $Collator-E<gt>gt($a, $b)>
863 =item C<$result = $Collator-E<gt>ge($a, $b)>
865 They work like the operators of the same name.
867 eq : whether $a is equal to $b.
868 ne : whether $a is not equal to $b.
869 lt : whether $a is less than $b.
870 le : whether $a is less than $b or equal to $b.
871 gt : whether $a is greater than $b.
872 ge : whether $a is greater than $b or equal to $b.
874 =item C<$sortKey = $Collator-E<gt>getSortKey($string)>
876 -- see 4.3 Form a sort key for each string, UTR #10.
880 You compare the sort keys using a binary comparison
881 and get the result of the comparison of the strings using UCA.
883 $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
887 $Collator->cmp($a, $b)
889 =item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)>
891 Returns a string formatted to display a sort key.
892 Weights are enclosed with C<'['> and C<']'>
893 and level boundaries are denoted by C<'|'>.
895 use Unicode::Collate;
896 my $c = Unicode::Collate->new();
897 print $c->viewSortKey("Perl"),"\n";
900 # [09B3 08B1 09CB 094F|0020 0020 0020 0020|0008 0002 0002 0002|FFFF FFFF FFFF FFFF]
901 # Level 1 Level 2 Level 3 Level 4
903 =item C<$position = $Collator-E<gt>index($string, $substring)>
905 =item C<($position, $length) = $Collator-E<gt>index($string, $substring)>
907 -- see 6.8 Searching, UTR #10.
909 If C<$substring> matches a part of C<$string>, returns
910 the position of the first occurrence of the matching part in scalar context;
911 in list context, returns a two-element list of
912 the position and the length of the matching part.
914 B<Notice> that the length of the matching part may differ from
915 the length of C<$substring>.
917 B<Note> that the position and the length are counted on the string
918 after the process of preprocess, normalization, and rearrangement.
919 Therefore, in case the specified string is not binary equal to
920 the preprocessed/normalized/rearranged string, the position and the length
921 may differ from those on the specified string. But it is guaranteed
922 that, if matched, it returns a non-negative value as C<$position>.
924 If C<$substring> does not match any part of C<$string>,
925 returns C<-1> in scalar context and
926 an empty list in list context.
930 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
931 my $str = "Ich mu\x{00DF} studieren.";
932 my $sub = "m\x{00FC}ss";
934 if (my($pos,$len) = $Collator->index($str, $sub)) {
935 $match = substr($str, $pos, $len);
938 and get C<"mu\x{00DF}"> in C<$match> since C<"mu>E<223>C<">
939 is primary equal to C<"m>E<252>C<ss">.
949 Returns the version number of Unicode Technical Standard 10
950 this module consults.
952 =item Base_Unicode_Version
954 Returns the version number of the Unicode Standard
955 this module is based on.
965 Unicode::Collate has not been ported to EBCDIC. The code mostly would
966 work just fine but a decision needs to be made: how the module should
967 work in EBCDIC? Should the low 256 characters be understood as
968 Unicode or as EBCDIC code points? Should one be chosen or should
969 there be a way to do either? Or should such translation be left
970 outside the module for the user to do, for example by using
972 (or utf8::unicode_to_native()/utf8::native_to_unicode()?)
976 Use of the C<normalization> parameter requires
977 the B<Unicode::Normalize> module.
979 If you do not need it (say, when you do not have to
980 handle any combining characters),
981 assign C<normalization =E<gt> undef> explicitly.
983 -- see 6.5 Avoiding Normalization, UTR #10.
987 C<index()> is an experimental method and
988 its return value may be unreliable.
989 The correct implementation for C<index()> must be based
990 on Locale-Sensitive Support: Level 3 in UTR #18,
991 F<Unicode Regular Expression Guidelines>.
993 See also 4.2 Locale-Dependent Graphemes in UTR #18.
997 SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
999 http://homepage1.nifty.com/nomenclator/perl/
1001 Copyright(C) 2001-2002, SADAHIRO Tomoyuki. Japan. All rights reserved.
1003 This library is free software; you can redistribute it
1004 and/or modify it under the same terms as Perl itself.
1010 =item http://www.unicode.org/unicode/reports/tr10/
1012 Unicode Collation Algorithm - UTR #10
1014 =item http://www.unicode.org/unicode/reports/tr10/allkeys.txt
1016 The Default Unicode Collation Element Table
1018 =item http://www.unicode.org/unicode/reports/tr15/
1020 Unicode Normalization Forms - UAX #15
1022 =item http://www.unicode.org/unicode/reports/tr18
1024 Unicode Regular Expression Guidelines - UTR #18
1026 =item L<Unicode::Normalize>