AUTHORS updates.
[p5sagit/p5-mst-13.2.git] / lib / Unicode / Collate.pm
CommitLineData
45394607 1package Unicode::Collate;
2
3use 5.006;
4use strict;
5use warnings;
6use Carp;
5398038e 7
45394607 8require Exporter;
9
905aa9f0 10our $VERSION = '0.10';
45394607 11our $PACKAGE = __PACKAGE__;
12
13our @ISA = qw(Exporter);
14
15our %EXPORT_TAGS = ();
16our @EXPORT_OK = ();
17our @EXPORT = ();
18
19(our $Path = $INC{'Unicode/Collate.pm'}) =~ s/\.pm$//;
20our $KeyFile = "allkeys.txt";
21
5398038e 22our $getCombinClass; # coderef for combining class from Unicode::Normalize
45394607 23
24use constant Min2 => 0x20; # minimum weight at level 2
25use constant Min3 => 0x02; # minimum weight at level 3
26use constant UNDEFINED => 0xFF80; # special value for undefined CE
27
28##
29## constructor
30##
31sub new
32{
33 my $class = shift;
34 my $self = bless { @_ }, $class;
35
36 # alternate
37 $self->{alternate} =
38 ! exists $self->{alternate} ? 'shifted' :
39 ! defined $self->{alternate} ? '' : $self->{alternate};
40
41 # collation level
5398038e 42 $self->{level} ||= ($self->{alternate} =~ /^shift/ ? 4 : 3);
45394607 43
44 # normalization form
45 $self->{normalization} = 'D' if ! exists $self->{normalization};
46
5398038e 47 if(defined $self->{normalization}){
48 eval "use Unicode::Normalize;";
49 croak "you'd install Unicode::Normalize for normalization forms: $@"
50 if $@;
51 $getCombinClass = \&Unicode::Normalize::getCombinClass
52 if ! $getCombinClass;
53 }
45394607 54
5398038e 55 $self->{UNF} =
45394607 56 ! defined $self->{normalization} ? undef :
57 $self->{normalization} =~ /^(?:NF)?C$/ ? \&NFC :
58 $self->{normalization} =~ /^(?:NF)?D$/ ? \&NFD :
59 $self->{normalization} =~ /^(?:NF)?KC$/ ? \&NFKC :
60 $self->{normalization} =~ /^(?:NF)?KD$/ ? \&NFKD :
61 croak "$PACKAGE unknown normalization form name: $self->{normalization}";
62
45394607 63 # backwards
64 $self->{backwards} ||= [];
65 $self->{backwards} = [ $self->{backwards} ] if ! ref $self->{backwards};
66
67 # rearrange
68 $self->{rearrange} ||= []; # maybe not U+0000 (an ASCII)
69 $self->{rearrange} = [ $self->{rearrange} ] if ! ref $self->{rearrange};
70
905aa9f0 71 # open a table file.
72 # if undef is passed explicitly, no file is read.
73 $self->{table} = $KeyFile unless exists $self->{table};
74 $self->read_table if defined $self->{table};
75
76 if($self->{entry}){
77 $self->parseEntry($_) foreach split /\n/, $self->{entry};
78 }
79
80 # keys of $self->{rearrangeHash} are $self->{rearrange}.
81 $self->{rearrangeHash} = {};
82 @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
83
84 return $self;
85}
86
87
88sub read_table {
89 my $self = shift;
90 my $file = $self->{table} ne '' ? $self->{table} : $KeyFile;
45394607 91 open my $fk, "<$Path/$file" or croak "File does not exist at $Path/$file";
92
93 while(<$fk>){
94 next if /^\s*#/;
95 if(/^\s*\@/){
96 if(/^\@version\s*(\S*)/){
97 $self->{version} ||= $1;
98 }
99 elsif(/^\@alternate\s+(.*)/){
100 $self->{alternate} ||= $1;
101 }
102 elsif(/^\@backwards\s+(.*)/){
103 push @{ $self->{backwards} }, $1;
104 }
105 elsif(/^\@rearrange\s+(.*)/){
106 push @{ $self->{rearrange} }, _getHexArray($1);
107 }
108 next;
109 }
110 $self->parseEntry($_);
111 }
112 close $fk;
45394607 113}
114
905aa9f0 115
45394607 116##
117## get $line, parse it, and write an entry in $self
118##
119sub parseEntry
120{
121 my $self = shift;
122 my $line = shift;
123 my($name, $ele, @key);
124
125 return if $line !~ /^\s*[0-9A-Fa-f]/;
126
127 # get name
128 $name = $1 if $line =~ s/#\s*(.*)//;
129 return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
130
131 # get element
132 my($e, $k) = split /;/, $line;
133 my @e = _getHexArray($e);
134 $ele = pack('U*', @e);
135 return if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
136
137 # get sort key
138 if(
139 defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/ ||
140 defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/
141 )
142 {
d16e9e3d 143 $self->{entries}{$ele} = $self->{ignored}{$ele} = 1;
45394607 144 }
145 else
146 {
147 foreach my $arr ($k =~ /\[(\S+)\]/g) {
148 my $var = $arr =~ /\*/;
d16e9e3d 149 push @key, $self->altCE( $var, _getHexArray($arr) );
45394607 150 }
151 $self->{entries}{$ele} = \@key;
152 }
153 $self->{maxlength}{ord $ele} = scalar @e if @e > 1;
154}
155
156
157##
d16e9e3d 158## arrayref CE = altCE(bool variable?, list[num] weights)
45394607 159##
d16e9e3d 160sub altCE
45394607 161{
162 my $self = shift;
163 my $var = shift;
164 my @c = @_;
165
166 $self->{alternate} eq 'blanked' ?
d16e9e3d 167 $var ? [0,0,0] : [ @c[0..2] ] :
168 $self->{alternate} eq 'non-ignorable' ?
169 [ @c[0..2] ] :
45394607 170 $self->{alternate} eq 'shifted' ?
171 $var ? [0,0,0,$c[0] ] : [ @c[0..2], $c[0]+$c[1]+$c[2] ? 0xFFFF : 0 ] :
172 $self->{alternate} eq 'shift-trimmed' ?
173 $var ? [0,0,0,$c[0] ] : [ @c[0..2], 0 ] :
174 \@c;
175}
176
177##
d16e9e3d 178## string hex_sortkey = splitCE(string arg)
45394607 179##
180sub viewSortKey
181{
182 my $self = shift;
183 my $key = $self->getSortKey(@_);
184 my $view = join " ", map sprintf("%04X", $_), unpack 'n*', $key;
185 $view =~ s/ ?0000 ?/|/g;
186 "[$view]";
187}
188
d16e9e3d 189
45394607 190##
d16e9e3d 191## list[strings] elements = splitCE(string arg)
45394607 192##
d16e9e3d 193sub splitCE
45394607 194{
195 my $self = shift;
196 my $code = $self->{preprocess};
5398038e 197 my $norm = $self->{UNF};
45394607 198 my $ent = $self->{entries};
45394607 199 my $max = $self->{maxlength};
45394607 200 my $rear = $self->{rearrangeHash};
201
202 my $str = ref $code ? &$code(shift) : shift;
203 $str = &$norm($str) if ref $norm;
204
205 my(@src, @buf);
206 @src = unpack('U*', $str);
207
208 # rearrangement
209 for(my $i = 0; $i < @src; $i++)
210 {
211 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i])
212 if $rear->{ $src[$i] };
213 $i++;
214 }
215
216 for(my $i = 0; $i < @src; $i++)
217 {
218 my $ch;
219 my $u = $src[$i];
220
221 # non-characters
905aa9f0 222 next unless defined $u;
45394607 223 next if $u < 0 || 0x10FFFF < $u # out of range
224 || 0xD800 < $u && $u < 0xDFFF; # unpaired surrogates
225 my $four = $u & 0xFFFF;
226 next if $four == 0xFFFE || $four == 0xFFFF;
227
228 if($max->{$u}) # contract
229 {
230 for(my $j = $max->{$u}; $j >= 1; $j--)
231 {
232 next unless $i+$j-1 < @src;
233 $ch = pack 'U*', @src[$i .. $i+$j-1];
234 $i += $j-1, last if $ent->{$ch};
235 }
236 }
237 else { $ch = pack('U', $u) }
238
5398038e 239 # with Combining Char (UTS#10, 4.2.1), here requires Unicode::Normalize.
240 if($getCombinClass && defined $ch)
45394607 241 {
905aa9f0 242 for(my $j = $i+1; $j < @src; $j++)
45394607 243 {
905aa9f0 244 next unless defined $src[$j];
245 last unless $getCombinClass->( $src[$j] );
45394607 246 my $comb = pack 'U', $src[$j];
247 next if ! $ent->{ $ch.$comb };
248 $ch .= $comb;
905aa9f0 249 $src[$j] = undef;
45394607 250 }
251 }
d16e9e3d 252 push @buf, $ch;
253 }
254 wantarray ? @buf : \@buf;
255}
45394607 256
d16e9e3d 257
258##
259## list[arrayrefs] weight = getWt(string element)
260##
261sub getWt
262{
263 my $self = shift;
264 my $ch = shift;
265 my $ent = $self->{entries};
266 my $ign = $self->{ignored};
267 my $cjk = $self->{overrideCJK};
268 my $hang = $self->{overrideHangul};
269 return if !defined $ch || $ign->{$ch}; # ignored
270 return @{ $ent->{$ch} } if $ent->{$ch};
271 my $u = unpack('U', $ch);
272 return
273 _isHangul($u)
274 ? $hang
275 ? &$hang($u)
5398038e 276 : map(@{ $ent->{pack('U', $_)} }, _decompHangul($u))
d16e9e3d 277 : _isCJK($u)
278 ? $cjk ? &$cjk($u) : map($self->altCE(0,@$_), _CJK($u))
279 : map($self->altCE(0,@$_), _derivCE($u));
280}
281
282##
283## int = index(string, substring)
284##
285sub index
286{
287 my $self = shift;
288 my $lev = $self->{level};
289 my $str = $self->splitCE(shift);
290 my $sub = $self->splitCE(shift);
291
292 return wantarray ? (0,0) : 0 if ! @$sub;
293 return wantarray ? () : -1 if ! @$str;
294
295 my @subWt = grep _ignorableAtLevel($_,$lev),
296 map $self->getWt($_), @$sub;
297
298 my(@strWt,@strPt);
299 my $count = 0;
300 for my $e (@$str){
301 my @tmp = grep _ignorableAtLevel($_,$lev), $self->getWt($e);
302 push @strWt, @tmp;
303 push @strPt, ($count) x @tmp;
304 $count += length $e;
305 while(@strWt >= @subWt){
306 if(_eqArray(\@strWt, \@subWt, $lev)){
307 my $pos = $strPt[0];
308 return wantarray ? ($pos, $count-$pos) : $pos;
309 }
310 shift @strWt;
311 shift @strPt;
312 }
313 }
314 return wantarray ? () : -1;
315}
316
317##
318## bool _eqArray(arrayref, arrayref, level)
319##
320sub _eqArray($$$)
321{
322 my $a = shift; # length $a >= length $b;
323 my $b = shift;
324 my $lev = shift;
325 for my $v (0..$lev-1){
326 for my $c (0..@$b-1){
327 return if $a->[$c][$v] != $b->[$c][$v];
328 }
45394607 329 }
d16e9e3d 330 return 1;
331}
332
333
334##
335## bool _ignorableAtLevel(CE, level)
336##
337sub _ignorableAtLevel($$)
338{
339 my $ce = shift;
340 return if ! defined $ce;
341 my $lv = shift;
342 ! grep { ! $ce->[$_] } 0..$lv-1;
343}
344
345
346##
347## string sortkey = getSortKey(string arg)
348##
349sub getSortKey
350{
351 my $self = shift;
352 my $lev = $self->{level};
353 my $rCE = $self->splitCE(shift); # get an arrayref
354
355 # weight arrays
356 my @buf = grep defined(), map $self->getWt($_), @$rCE;
45394607 357
358 # make sort key
359 my @ret = ([],[],[],[]);
360 foreach my $v (0..$lev-1){
361 foreach my $b (@buf){
362 push @{ $ret[$v] }, $b->[$v] if $b->[$v];
363 }
364 }
365 foreach (@{ $self->{backwards} }){
366 my $v = $_ - 1;
367 @{ $ret[$v] } = reverse @{ $ret[$v] };
368 }
369
370 # modification of tertiary weights
371 if($self->{upper_before_lower}){
372 foreach (@{ $ret[2] }){
373 if (0x8 <= $_ && $_ <= 0xC){ $_ -= 6 } # lower
374 elsif(0x2 <= $_ && $_ <= 0x6){ $_ += 6 } # upper
375 elsif($_ == 0x1C) { $_ += 1 } # square upper
376 elsif($_ == 0x1D) { $_ -= 1 } # square lower
377 }
378 }
379 if($self->{katakana_before_hiragana}){
380 foreach (@{ $ret[2] }){
381 if (0x0F <= $_ && $_ <= 0x13){ $_ -= 2 } # katakana
382 elsif(0x0D <= $_ && $_ <= 0x0E){ $_ += 5 } # hiragana
383 }
384 }
385 join "\0\0", map pack('n*', @$_), @ret;
386}
387
388
389##
d16e9e3d 390## int compare = cmp(string a, string b)
45394607 391##
5398038e 392sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
393sub eq { $_[0]->getSortKey($_[1]) eq $_[0]->getSortKey($_[2]) }
394sub ne { $_[0]->getSortKey($_[1]) ne $_[0]->getSortKey($_[2]) }
395sub lt { $_[0]->getSortKey($_[1]) lt $_[0]->getSortKey($_[2]) }
396sub le { $_[0]->getSortKey($_[1]) le $_[0]->getSortKey($_[2]) }
397sub gt { $_[0]->getSortKey($_[1]) gt $_[0]->getSortKey($_[2]) }
398sub ge { $_[0]->getSortKey($_[1]) ge $_[0]->getSortKey($_[2]) }
45394607 399
400##
d16e9e3d 401## list[strings] sorted = sort(list[strings] arg)
45394607 402##
403sub sort
404{
405 my $obj = shift;
406
407 map { $_->[1] }
408 sort{ $a->[0] cmp $b->[0] }
409 map [ $obj->getSortKey($_), $_ ], @_;
410}
411
412##
d16e9e3d 413## list[arrayrefs] CE = _derivCE(int codepoint)
45394607 414##
415sub _derivCE
416{
417 my $code = shift;
418 my $a = UNDEFINED + ($code >> 15); # ok
419 my $b = ($code & 0x7FFF) | 0x8000; # ok
420# my $a = 0xFFC2 + ($code >> 15); # ng
421# my $b = $code & 0x7FFF | 0x1000; # ng
422 $b ? ([$a,2,1,$code],[$b,0,0,$code]) : [$a,2,1,$code];
423}
424
425##
426## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
427##
5398038e 428sub _getHexArray { map hex(), $_[0] =~ /([0-9a-fA-F]+)/g }
45394607 429
430##
d16e9e3d 431## bool is_a_CJK_Unified_Ideograph = _isCJK(int codepoint)
45394607 432##
433sub _isCJK
434{
435 my $u = shift;
436 return 0x3400 <= $u && $u <= 0x4DB5
437 || 0x4E00 <= $u && $u <= 0x9FA5
438# || 0x20000 <= $u && $u <= 0x2A6D6;
439}
440
441##
d16e9e3d 442## list[arrayref] CE = _CJK(int codepoint_of_CJK)
45394607 443##
5398038e 444sub _CJK { $_[0] > 0xFFFF ? _derivCE($_[0]) : [$_[0], 0x20, 0x02, $_[0]] }
45394607 445
446##
d16e9e3d 447## bool is_a_Hangul_Syllable = _isHangul(int codepoint)
45394607 448##
5398038e 449sub _isHangul { return 0xAC00 <= $_[0] && $_[0] <= 0xD7A3 }
450
451sub _decompHangul {
452 my $code = shift;
453 # $code must be in Hangul syllable. check it before you enter here.
454 my $SIndex = $code - 0xAC00;
455 my $LIndex = int( $SIndex / 588);
456 my $VIndex = int(($SIndex % 588) / 28);
457 my $TIndex = $SIndex % 28;
458 return (
459 0x1100 + $LIndex,
460 0x1161 + $VIndex,
461 $TIndex ? (0x11A7 + $TIndex) : (),
462 );
45394607 463}
464
4651;
466__END__
467
468=head1 NAME
469
470Unicode::Collate - use UCA (Unicode Collation Algorithm)
471
472=head1 SYNOPSIS
473
474 use Unicode::Collate;
475
476 #construct
5398038e 477 $Collator = Unicode::Collate->new(%tailoring);
45394607 478
479 #sort
5398038e 480 @sorted = $Collator->sort(@not_sorted);
45394607 481
482 #compare
5398038e 483 $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
45394607 484
485=head1 DESCRIPTION
486
487=head2 Constructor and Tailoring
488
d16e9e3d 489The C<new> method returns a collator object.
490
5398038e 491 $Collator = Unicode::Collate->new(
45394607 492 alternate => $alternate,
493 backwards => $levelNumber, # or \@levelNumbers
494 entry => $element,
495 normalization => $normalization_form,
496 ignoreName => qr/$ignoreName/,
497 ignoreChar => qr/$ignoreChar/,
498 katakana_before_hiragana => $bool,
499 level => $collationLevel,
500 overrideCJK => \&overrideCJK,
501 overrideHangul => \&overrideHangul,
502 preprocess => \&preprocess,
503 rearrange => \@charList,
504 table => $filename,
505 undefName => qr/$undefName/,
506 undefChar => qr/$undefChar/,
507 upper_before_lower => $bool,
508 );
509 # if %tailoring is false (empty),
5398038e 510 # $Collator should do the default collation.
45394607 511
512=over 4
513
514=item alternate
515
516-- see 3.2.2 Alternate Weighting, UTR #10.
517
518 alternate => 'shifted', 'blanked', 'non-ignorable', or 'shift-trimmed'.
519
520By default (if specification is omitted), 'shifted' is adopted.
521
522=item backwards
523
524-- see 3.1.2 French Accents, UTR #10.
525
526 backwards => $levelNumber or \@levelNumbers
527
528Weights in reverse order; ex. level 2 (diacritic ordering) in French.
529If omitted, forwards at all the levels.
530
531=item entry
532
533-- see 3.1 Linguistic Features; 3.2.1 File Format, UTR #10.
534
905aa9f0 535Overrides a default order or adds a new collation element
45394607 536
537 entry => <<'ENTRIES', # use the UCA file format
53800E6 ; [.0861.0020.0002.00E6] [.08B1.0020.0002.00E6] # ligature <ae> as <a e>
5390063 0068 ; [.0893.0020.0002.0063] # "ch" in traditional Spanish
5400043 0068 ; [.0893.0020.0008.0043] # "Ch" in traditional Spanish
541ENTRIES
542
543=item ignoreName
544
545=item ignoreChar
546
547-- see Completely Ignorable, 3.2.2 Alternate Weighting, UTR #10.
548
549Ignores the entry in the table.
550If an ignored collation element appears in the string to be collated,
551it is ignored as if the element had been deleted from there.
552
553E.g. when 'a' and 'e' are ignored,
554'element' is equal to 'lament' (or 'lmnt').
555
556=item level
557
558-- see 4.3 Form a sort key for each string, UTR #10.
559
560Set the maximum level.
561Any higher levels than the specified one are ignored.
562
563 Level 1: alphabetic ordering
564 Level 2: diacritic ordering
565 Level 3: case ordering
566 Level 4: tie-breaking (e.g. in the case when alternate is 'shifted')
567
568 ex.level => 2,
569
570=item normalization
571
572-- see 4.1 Normalize each input string, UTR #10.
573
905aa9f0 574If specified, strings are normalized before preparation of sort keys
45394607 575(the normalization is executed after preprocess).
576
577As a form name, one of the following names must be used.
578
579 'C' or 'NFC' for Normalization Form C
580 'D' or 'NFD' for Normalization Form D
581 'KC' or 'NFKC' for Normalization Form KC
582 'KD' or 'NFKD' for Normalization Form KD
583
584If omitted, the string is put into Normalization Form D.
585
586If undefined explicitly (as C<normalization =E<gt> undef>),
587any normalization is not carried out (this may make tailoring easier
588if any normalization is not desired).
589
590see B<CAVEAT>.
591
592=item overrideCJK
593
594=item overrideHangul
595
596-- see 7.1 Derived Collation Elements, UTR #10.
597
598By default, mapping of CJK Unified Ideographs
599uses the Unicode codepoint order
600and Hangul Syllables are decomposed into Hangul Jamo.
601
602The mapping of CJK Unified Ideographs
603or Hangul Syllables may be overrided.
604
605ex. CJK Unified Ideographs in the JIS codepoint order.
606
607 overrideCJK => sub {
608 my $u = shift; # get unicode codepoint
609 my $b = pack('n', $u); # to UTF-16BE
610 my $s = your_unicode_to_sjis_converter($b); # convert
611 my $n = unpack('n', $s); # convert sjis to short
612 [ $n, 1, 1 ]; # return collation element
613 },
614
615If you want to override the mapping of Hangul Syllables,
616the Normalization Forms D and KD are not appropriate
617(they will be decomposed before overriding).
618
619=item preprocess
620
621-- see 5.1 Preprocessing, UTR #10.
622
623If specified, the coderef is used to preprocess
624before the formation of sort keys.
625
626ex. dropping English articles, such as "a" or "the".
627Then, "the pen" is before "a pencil".
628
629 preprocess => sub {
630 my $str = shift;
631 $str =~ s/\b(?:an?|the)\s+//g;
632 $str;
633 },
634
635=item rearrange
636
637-- see 3.1.3 Rearrangement, UTR #10.
638
639Characters that are not coded in logical order and to be rearranged.
640By default,
641
642 rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
643
644=item table
645
646-- see 3.2 Default Unicode Collation Element Table, UTR #10.
647
648You can use another element table if desired.
649The table file must be in your C<lib/Unicode/Collate> directory.
650
651By default, the file C<lib/Unicode/Collate/allkeys.txt> is used.
652
905aa9f0 653If undefined explicitly (as C<table =E<gt> undef>),
654no file is read (you'd define collation elements using L<entry>).
655
45394607 656=item undefName
657
658=item undefChar
659
660-- see 6.3.4 Reducing the Repertoire, UTR #10.
661
662Undefines the collation element as if it were unassigned in the table.
663This reduces the size of the table.
664If an unassigned character appears in the string to be collated,
665the sort key is made from its codepoint
666as a single-character collation element,
667as it is greater than any other assigned collation elements
668(in the codepoint order among the unassigned characters).
669But, it'd be better to ignore characters
670unfamiliar to you and maybe never used.
671
672=item katakana_before_hiragana
673
674=item upper_before_lower
675
676-- see 6.6 Case Comparisons; 7.3.1 Tertiary Weight Table, UTR #10.
677
678By default, lowercase is before uppercase
679and hiragana is before katakana.
680
681If the parameter is true, this is reversed.
682
683=back
684
685=head2 Other methods
686
687=over 4
688
5398038e 689=item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
45394607 690
691Sorts a list of strings.
692
5398038e 693=item C<$result = $Collator-E<gt>cmp($a, $b)>
45394607 694
695Returns 1 (when C<$a> is greater than C<$b>)
696or 0 (when C<$a> is equal to C<$b>)
697or -1 (when C<$a> is lesser than C<$b>).
698
5398038e 699=item C<$result = $Collator-E<gt>eq($a, $b)>
700
701=item C<$result = $Collator-E<gt>ne($a, $b)>
702
703=item C<$result = $Collator-E<gt>lt($a, $b)>
704
705=item C<$result = $Collator-E<gt>le($a, $b)>
706
707=item C<$result = $Collator-E<gt>gt($a, $b)>
708
709=item C<$result = $Collator-E<gt>ge($a, $b)>
710
711They works like the same name operators as theirs.
712
713 eq : whether $a is equal to $b.
714 ne : whether $a is not equal to $b.
715 lt : whether $a is lesser than $b.
716 le : whether $a is lesser than $b or equal to $b.
717 gt : whether $a is greater than $b.
718 ge : whether $a is greater than $b or equal to $b.
719
720=item C<$sortKey = $Collator-E<gt>getSortKey($string)>
45394607 721
722-- see 4.3 Form a sort key for each string, UTR #10.
723
724Returns a sort key.
725
726You compare the sort keys using a binary comparison
727and get the result of the comparison of the strings using UCA.
728
5398038e 729 $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
45394607 730
731 is equivalent to
732
5398038e 733 $Collator->cmp($a, $b)
45394607 734
5398038e 735=item C<$position = $Collator-E<gt>index($string, $substring)>
d16e9e3d 736
5398038e 737=item C<($position, $length) = $Collator-E<gt>index($string, $substring)>
d16e9e3d 738
739-- see 6.8 Searching, UTR #10.
740
741If C<$substring> matches a part of C<$string>, returns
742the position of the first occurrence of the matching part in scalar context;
743in list context, returns a two-element list of
744the position and the length of the matching part.
745
746B<Notice> that the length of the matching part may differ from
747the length of C<$substring>.
748
749B<Note> that the position and the length are counted on the string
750after the process of preprocess, normalization, and rearrangement.
751Therefore, in case the specified string is not binary equal to
752the preprocessed/normalized/rearranged string, the position and the length
753may differ form those on the specified string. But it is guaranteed
754that, if matched, it returns a non-negative value as C<$position>.
755
756If C<$substring> does not match any part of C<$string>,
757returns C<-1> in scalar context and
758an empty list in list context.
759
760e.g. you say
761
5398038e 762 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
d16e9e3d 763 my $str = "Ich mu\x{00DF} studieren.";
764 my $sub = "m\x{00FC}ss";
765 my $match;
5398038e 766 if(my($pos,$len) = $Collator->index($str, $sub)){
767 $match = substr($str, $pos, $len);
d16e9e3d 768 }
769
770and get C<"mu\x{00DF}"> in C<$match> since C<"mu>E<223>C<">
771is primary equal to C<"m>E<252>C<ss">.
772
45394607 773=back
774
775=head2 EXPORT
776
777None by default.
778
779=head2 CAVEAT
780
781Use of the C<normalization> parameter requires
782the B<Unicode::Normalize> module.
783
5398038e 784If you need not it (say, in the case when you need not
45394607 785handle any combining characters),
786assign C<normalization =E<gt> undef> explicitly.
787
5398038e 788-- see 6.5 Avoiding Normalization, UTR #10.
789
45394607 790=head1 AUTHOR
791
792SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
793
794 http://homepage1.nifty.com/nomenclator/perl/
795
796 Copyright(C) 2001, SADAHIRO Tomoyuki. Japan. All rights reserved.
797
798 This program is free software; you can redistribute it and/or
799 modify it under the same terms as Perl itself.
800
801=head1 SEE ALSO
802
803=over 4
804
5398038e 805=item Unicode Collation Algorithm - Unicode TR #10
45394607 806
5398038e 807http://www.unicode.org/unicode/reports/tr10/
45394607 808
809=item L<Unicode::Normalize>
810
811normalized forms of Unicode text
812
45394607 813=back
814
815=cut