Integrate mainline
[p5sagit/p5-mst-13.2.git] / lib / Unicode / Collate.pm
CommitLineData
45394607 1package Unicode::Collate;
2
4a2e806c 3BEGIN {
4 if (ord("A") == 193) {
5 die "Unicode::Collate not ported to EBCDIC\n";
6 }
7}
8
45394607 9use 5.006;
10use strict;
11use warnings;
12use Carp;
5398038e 13
45394607 14require Exporter;
15
905aa9f0 16our $VERSION = '0.10';
45394607 17our $PACKAGE = __PACKAGE__;
18
19our @ISA = qw(Exporter);
20
21our %EXPORT_TAGS = ();
22our @EXPORT_OK = ();
23our @EXPORT = ();
24
25(our $Path = $INC{'Unicode/Collate.pm'}) =~ s/\.pm$//;
26our $KeyFile = "allkeys.txt";
27
5398038e 28our $getCombinClass; # coderef for combining class from Unicode::Normalize
45394607 29
30use constant Min2 => 0x20; # minimum weight at level 2
31use constant Min3 => 0x02; # minimum weight at level 3
32use constant UNDEFINED => 0xFF80; # special value for undefined CE
33
34##
35## constructor
36##
37sub new
38{
39 my $class = shift;
40 my $self = bless { @_ }, $class;
41
42 # alternate
43 $self->{alternate} =
44 ! exists $self->{alternate} ? 'shifted' :
45 ! defined $self->{alternate} ? '' : $self->{alternate};
46
47 # collation level
5398038e 48 $self->{level} ||= ($self->{alternate} =~ /^shift/ ? 4 : 3);
45394607 49
50 # normalization form
51 $self->{normalization} = 'D' if ! exists $self->{normalization};
52
5398038e 53 if(defined $self->{normalization}){
54 eval "use Unicode::Normalize;";
55 croak "you'd install Unicode::Normalize for normalization forms: $@"
56 if $@;
57 $getCombinClass = \&Unicode::Normalize::getCombinClass
58 if ! $getCombinClass;
59 }
45394607 60
5398038e 61 $self->{UNF} =
45394607 62 ! defined $self->{normalization} ? undef :
63 $self->{normalization} =~ /^(?:NF)?C$/ ? \&NFC :
64 $self->{normalization} =~ /^(?:NF)?D$/ ? \&NFD :
65 $self->{normalization} =~ /^(?:NF)?KC$/ ? \&NFKC :
66 $self->{normalization} =~ /^(?:NF)?KD$/ ? \&NFKD :
67 croak "$PACKAGE unknown normalization form name: $self->{normalization}";
68
45394607 69 # backwards
70 $self->{backwards} ||= [];
71 $self->{backwards} = [ $self->{backwards} ] if ! ref $self->{backwards};
72
73 # rearrange
74 $self->{rearrange} ||= []; # maybe not U+0000 (an ASCII)
75 $self->{rearrange} = [ $self->{rearrange} ] if ! ref $self->{rearrange};
76
905aa9f0 77 # open a table file.
78 # if undef is passed explicitly, no file is read.
79 $self->{table} = $KeyFile unless exists $self->{table};
80 $self->read_table if defined $self->{table};
81
82 if($self->{entry}){
83 $self->parseEntry($_) foreach split /\n/, $self->{entry};
84 }
85
86 # keys of $self->{rearrangeHash} are $self->{rearrange}.
87 $self->{rearrangeHash} = {};
88 @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
89
90 return $self;
91}
92
93
94sub read_table {
95 my $self = shift;
96 my $file = $self->{table} ne '' ? $self->{table} : $KeyFile;
45394607 97 open my $fk, "<$Path/$file" or croak "File does not exist at $Path/$file";
98
99 while(<$fk>){
100 next if /^\s*#/;
101 if(/^\s*\@/){
102 if(/^\@version\s*(\S*)/){
103 $self->{version} ||= $1;
104 }
105 elsif(/^\@alternate\s+(.*)/){
106 $self->{alternate} ||= $1;
107 }
108 elsif(/^\@backwards\s+(.*)/){
109 push @{ $self->{backwards} }, $1;
110 }
111 elsif(/^\@rearrange\s+(.*)/){
112 push @{ $self->{rearrange} }, _getHexArray($1);
113 }
114 next;
115 }
116 $self->parseEntry($_);
117 }
118 close $fk;
45394607 119}
120
905aa9f0 121
45394607 122##
123## get $line, parse it, and write an entry in $self
124##
125sub parseEntry
126{
127 my $self = shift;
128 my $line = shift;
129 my($name, $ele, @key);
130
131 return if $line !~ /^\s*[0-9A-Fa-f]/;
132
133 # get name
134 $name = $1 if $line =~ s/#\s*(.*)//;
135 return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
136
137 # get element
138 my($e, $k) = split /;/, $line;
139 my @e = _getHexArray($e);
80a5d8e7 140 $ele = pack('U*', @e);
45394607 141 return if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
142
143 # get sort key
80a5d8e7 144 if(
45394607 145 defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/ ||
146 defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/
80a5d8e7 147 )
148 {
149 $self->{entries}{$ele} = $self->{ignored}{$ele} = 1;
150 }
151 else
152 {
153 foreach my $arr ($k =~ /\[(\S+)\]/g) {
154 my $var = $arr =~ /\*/;
155 push @key, $self->altCE( $var, _getHexArray($arr) );
45394607 156 }
80a5d8e7 157 $self->{entries}{$ele} = \@key;
45394607 158 }
159 $self->{maxlength}{ord $ele} = scalar @e if @e > 1;
160}
161
162
163##
d16e9e3d 164## arrayref CE = altCE(bool variable?, list[num] weights)
45394607 165##
d16e9e3d 166sub altCE
45394607 167{
168 my $self = shift;
169 my $var = shift;
170 my @c = @_;
171
172 $self->{alternate} eq 'blanked' ?
d16e9e3d 173 $var ? [0,0,0] : [ @c[0..2] ] :
174 $self->{alternate} eq 'non-ignorable' ?
175 [ @c[0..2] ] :
45394607 176 $self->{alternate} eq 'shifted' ?
177 $var ? [0,0,0,$c[0] ] : [ @c[0..2], $c[0]+$c[1]+$c[2] ? 0xFFFF : 0 ] :
178 $self->{alternate} eq 'shift-trimmed' ?
179 $var ? [0,0,0,$c[0] ] : [ @c[0..2], 0 ] :
180 \@c;
181}
182
183##
d16e9e3d 184## string hex_sortkey = splitCE(string arg)
45394607 185##
186sub viewSortKey
187{
188 my $self = shift;
189 my $key = $self->getSortKey(@_);
190 my $view = join " ", map sprintf("%04X", $_), unpack 'n*', $key;
191 $view =~ s/ ?0000 ?/|/g;
192 "[$view]";
193}
194
d16e9e3d 195
45394607 196##
d16e9e3d 197## list[strings] elements = splitCE(string arg)
45394607 198##
d16e9e3d 199sub splitCE
45394607 200{
201 my $self = shift;
202 my $code = $self->{preprocess};
5398038e 203 my $norm = $self->{UNF};
45394607 204 my $ent = $self->{entries};
45394607 205 my $max = $self->{maxlength};
45394607 206 my $rear = $self->{rearrangeHash};
207
208 my $str = ref $code ? &$code(shift) : shift;
209 $str = &$norm($str) if ref $norm;
210
211 my(@src, @buf);
212 @src = unpack('U*', $str);
213
214 # rearrangement
215 for(my $i = 0; $i < @src; $i++)
216 {
217 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i])
218 if $rear->{ $src[$i] };
219 $i++;
220 }
221
222 for(my $i = 0; $i < @src; $i++)
223 {
224 my $ch;
225 my $u = $src[$i];
226
227 # non-characters
905aa9f0 228 next unless defined $u;
45394607 229 next if $u < 0 || 0x10FFFF < $u # out of range
230 || 0xD800 < $u && $u < 0xDFFF; # unpaired surrogates
231 my $four = $u & 0xFFFF;
232 next if $four == 0xFFFE || $four == 0xFFFF;
233
234 if($max->{$u}) # contract
235 {
236 for(my $j = $max->{$u}; $j >= 1; $j--)
237 {
238 next unless $i+$j-1 < @src;
239 $ch = pack 'U*', @src[$i .. $i+$j-1];
240 $i += $j-1, last if $ent->{$ch};
241 }
242 }
243 else { $ch = pack('U', $u) }
244
5398038e 245 # with Combining Char (UTS#10, 4.2.1), here requires Unicode::Normalize.
246 if($getCombinClass && defined $ch)
45394607 247 {
905aa9f0 248 for(my $j = $i+1; $j < @src; $j++)
45394607 249 {
905aa9f0 250 next unless defined $src[$j];
251 last unless $getCombinClass->( $src[$j] );
45394607 252 my $comb = pack 'U', $src[$j];
253 next if ! $ent->{ $ch.$comb };
254 $ch .= $comb;
905aa9f0 255 $src[$j] = undef;
45394607 256 }
257 }
d16e9e3d 258 push @buf, $ch;
259 }
260 wantarray ? @buf : \@buf;
261}
45394607 262
d16e9e3d 263
264##
265## list[arrayrefs] weight = getWt(string element)
266##
267sub getWt
268{
269 my $self = shift;
270 my $ch = shift;
271 my $ent = $self->{entries};
272 my $ign = $self->{ignored};
273 my $cjk = $self->{overrideCJK};
274 my $hang = $self->{overrideHangul};
275 return if !defined $ch || $ign->{$ch}; # ignored
276 return @{ $ent->{$ch} } if $ent->{$ch};
277 my $u = unpack('U', $ch);
278 return
279 _isHangul($u)
280 ? $hang
281 ? &$hang($u)
5398038e 282 : map(@{ $ent->{pack('U', $_)} }, _decompHangul($u))
d16e9e3d 283 : _isCJK($u)
284 ? $cjk ? &$cjk($u) : map($self->altCE(0,@$_), _CJK($u))
285 : map($self->altCE(0,@$_), _derivCE($u));
286}
287
288##
289## int = index(string, substring)
290##
291sub index
292{
293 my $self = shift;
294 my $lev = $self->{level};
295 my $str = $self->splitCE(shift);
296 my $sub = $self->splitCE(shift);
297
298 return wantarray ? (0,0) : 0 if ! @$sub;
299 return wantarray ? () : -1 if ! @$str;
300
301 my @subWt = grep _ignorableAtLevel($_,$lev),
302 map $self->getWt($_), @$sub;
303
304 my(@strWt,@strPt);
305 my $count = 0;
306 for my $e (@$str){
307 my @tmp = grep _ignorableAtLevel($_,$lev), $self->getWt($e);
308 push @strWt, @tmp;
309 push @strPt, ($count) x @tmp;
310 $count += length $e;
311 while(@strWt >= @subWt){
312 if(_eqArray(\@strWt, \@subWt, $lev)){
313 my $pos = $strPt[0];
314 return wantarray ? ($pos, $count-$pos) : $pos;
315 }
316 shift @strWt;
317 shift @strPt;
318 }
319 }
320 return wantarray ? () : -1;
321}
322
323##
324## bool _eqArray(arrayref, arrayref, level)
325##
326sub _eqArray($$$)
327{
328 my $a = shift; # length $a >= length $b;
329 my $b = shift;
330 my $lev = shift;
331 for my $v (0..$lev-1){
332 for my $c (0..@$b-1){
333 return if $a->[$c][$v] != $b->[$c][$v];
334 }
45394607 335 }
d16e9e3d 336 return 1;
337}
338
339
340##
341## bool _ignorableAtLevel(CE, level)
342##
343sub _ignorableAtLevel($$)
344{
345 my $ce = shift;
346 return if ! defined $ce;
347 my $lv = shift;
348 ! grep { ! $ce->[$_] } 0..$lv-1;
349}
350
351
352##
353## string sortkey = getSortKey(string arg)
354##
355sub getSortKey
356{
357 my $self = shift;
358 my $lev = $self->{level};
359 my $rCE = $self->splitCE(shift); # get an arrayref
360
361 # weight arrays
362 my @buf = grep defined(), map $self->getWt($_), @$rCE;
45394607 363
364 # make sort key
365 my @ret = ([],[],[],[]);
366 foreach my $v (0..$lev-1){
367 foreach my $b (@buf){
368 push @{ $ret[$v] }, $b->[$v] if $b->[$v];
369 }
370 }
371 foreach (@{ $self->{backwards} }){
372 my $v = $_ - 1;
373 @{ $ret[$v] } = reverse @{ $ret[$v] };
374 }
375
376 # modification of tertiary weights
377 if($self->{upper_before_lower}){
378 foreach (@{ $ret[2] }){
379 if (0x8 <= $_ && $_ <= 0xC){ $_ -= 6 } # lower
380 elsif(0x2 <= $_ && $_ <= 0x6){ $_ += 6 } # upper
381 elsif($_ == 0x1C) { $_ += 1 } # square upper
382 elsif($_ == 0x1D) { $_ -= 1 } # square lower
383 }
384 }
385 if($self->{katakana_before_hiragana}){
386 foreach (@{ $ret[2] }){
387 if (0x0F <= $_ && $_ <= 0x13){ $_ -= 2 } # katakana
388 elsif(0x0D <= $_ && $_ <= 0x0E){ $_ += 5 } # hiragana
389 }
390 }
391 join "\0\0", map pack('n*', @$_), @ret;
392}
393
394
395##
d16e9e3d 396## int compare = cmp(string a, string b)
45394607 397##
5398038e 398sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
399sub eq { $_[0]->getSortKey($_[1]) eq $_[0]->getSortKey($_[2]) }
400sub ne { $_[0]->getSortKey($_[1]) ne $_[0]->getSortKey($_[2]) }
401sub lt { $_[0]->getSortKey($_[1]) lt $_[0]->getSortKey($_[2]) }
402sub le { $_[0]->getSortKey($_[1]) le $_[0]->getSortKey($_[2]) }
403sub gt { $_[0]->getSortKey($_[1]) gt $_[0]->getSortKey($_[2]) }
404sub ge { $_[0]->getSortKey($_[1]) ge $_[0]->getSortKey($_[2]) }
45394607 405
406##
d16e9e3d 407## list[strings] sorted = sort(list[strings] arg)
45394607 408##
409sub sort
410{
411 my $obj = shift;
412
413 map { $_->[1] }
414 sort{ $a->[0] cmp $b->[0] }
415 map [ $obj->getSortKey($_), $_ ], @_;
416}
417
418##
d16e9e3d 419## list[arrayrefs] CE = _derivCE(int codepoint)
45394607 420##
421sub _derivCE
422{
423 my $code = shift;
424 my $a = UNDEFINED + ($code >> 15); # ok
425 my $b = ($code & 0x7FFF) | 0x8000; # ok
426# my $a = 0xFFC2 + ($code >> 15); # ng
427# my $b = $code & 0x7FFF | 0x1000; # ng
428 $b ? ([$a,2,1,$code],[$b,0,0,$code]) : [$a,2,1,$code];
429}
430
431##
432## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
433##
5398038e 434sub _getHexArray { map hex(), $_[0] =~ /([0-9a-fA-F]+)/g }
45394607 435
436##
d16e9e3d 437## bool is_a_CJK_Unified_Ideograph = _isCJK(int codepoint)
45394607 438##
439sub _isCJK
440{
441 my $u = shift;
442 return 0x3400 <= $u && $u <= 0x4DB5
443 || 0x4E00 <= $u && $u <= 0x9FA5
444# || 0x20000 <= $u && $u <= 0x2A6D6;
445}
446
447##
d16e9e3d 448## list[arrayref] CE = _CJK(int codepoint_of_CJK)
45394607 449##
5398038e 450sub _CJK { $_[0] > 0xFFFF ? _derivCE($_[0]) : [$_[0], 0x20, 0x02, $_[0]] }
45394607 451
452##
d16e9e3d 453## bool is_a_Hangul_Syllable = _isHangul(int codepoint)
45394607 454##
5398038e 455sub _isHangul { return 0xAC00 <= $_[0] && $_[0] <= 0xD7A3 }
456
457sub _decompHangul {
458 my $code = shift;
459 # $code must be in Hangul syllable. check it before you enter here.
460 my $SIndex = $code - 0xAC00;
461 my $LIndex = int( $SIndex / 588);
462 my $VIndex = int(($SIndex % 588) / 28);
463 my $TIndex = $SIndex % 28;
464 return (
465 0x1100 + $LIndex,
466 0x1161 + $VIndex,
467 $TIndex ? (0x11A7 + $TIndex) : (),
468 );
45394607 469}
470
4711;
472__END__
473
474=head1 NAME
475
476Unicode::Collate - use UCA (Unicode Collation Algorithm)
477
478=head1 SYNOPSIS
479
480 use Unicode::Collate;
481
482 #construct
5398038e 483 $Collator = Unicode::Collate->new(%tailoring);
45394607 484
485 #sort
5398038e 486 @sorted = $Collator->sort(@not_sorted);
45394607 487
488 #compare
5398038e 489 $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
45394607 490
491=head1 DESCRIPTION
492
493=head2 Constructor and Tailoring
494
d16e9e3d 495The C<new> method returns a collator object.
496
5398038e 497 $Collator = Unicode::Collate->new(
45394607 498 alternate => $alternate,
499 backwards => $levelNumber, # or \@levelNumbers
500 entry => $element,
501 normalization => $normalization_form,
502 ignoreName => qr/$ignoreName/,
503 ignoreChar => qr/$ignoreChar/,
504 katakana_before_hiragana => $bool,
505 level => $collationLevel,
506 overrideCJK => \&overrideCJK,
507 overrideHangul => \&overrideHangul,
508 preprocess => \&preprocess,
509 rearrange => \@charList,
510 table => $filename,
511 undefName => qr/$undefName/,
512 undefChar => qr/$undefChar/,
513 upper_before_lower => $bool,
514 );
515 # if %tailoring is false (empty),
5398038e 516 # $Collator should do the default collation.
45394607 517
518=over 4
519
520=item alternate
521
522-- see 3.2.2 Alternate Weighting, UTR #10.
523
524 alternate => 'shifted', 'blanked', 'non-ignorable', or 'shift-trimmed'.
525
526By default (if specification is omitted), 'shifted' is adopted.
527
528=item backwards
529
530-- see 3.1.2 French Accents, UTR #10.
531
532 backwards => $levelNumber or \@levelNumbers
533
534Weights in reverse order; ex. level 2 (diacritic ordering) in French.
535If omitted, forwards at all the levels.
536
537=item entry
538
539-- see 3.1 Linguistic Features; 3.2.1 File Format, UTR #10.
540
905aa9f0 541Overrides a default order or adds a new collation element
45394607 542
543 entry => <<'ENTRIES', # use the UCA file format
54400E6 ; [.0861.0020.0002.00E6] [.08B1.0020.0002.00E6] # ligature <ae> as <a e>
5450063 0068 ; [.0893.0020.0002.0063] # "ch" in traditional Spanish
5460043 0068 ; [.0893.0020.0008.0043] # "Ch" in traditional Spanish
547ENTRIES
548
549=item ignoreName
550
551=item ignoreChar
552
553-- see Completely Ignorable, 3.2.2 Alternate Weighting, UTR #10.
554
555Ignores the entry in the table.
556If an ignored collation element appears in the string to be collated,
557it is ignored as if the element had been deleted from there.
558
559E.g. when 'a' and 'e' are ignored,
560'element' is equal to 'lament' (or 'lmnt').
561
562=item level
563
564-- see 4.3 Form a sort key for each string, UTR #10.
565
566Set the maximum level.
567Any higher levels than the specified one are ignored.
568
569 Level 1: alphabetic ordering
570 Level 2: diacritic ordering
571 Level 3: case ordering
572 Level 4: tie-breaking (e.g. in the case when alternate is 'shifted')
573
574 ex.level => 2,
575
576=item normalization
577
578-- see 4.1 Normalize each input string, UTR #10.
579
905aa9f0 580If specified, strings are normalized before preparation of sort keys
45394607 581(the normalization is executed after preprocess).
582
583As a form name, one of the following names must be used.
584
585 'C' or 'NFC' for Normalization Form C
586 'D' or 'NFD' for Normalization Form D
587 'KC' or 'NFKC' for Normalization Form KC
588 'KD' or 'NFKD' for Normalization Form KD
589
590If omitted, the string is put into Normalization Form D.
591
592If undefined explicitly (as C<normalization =E<gt> undef>),
593any normalization is not carried out (this may make tailoring easier
594if any normalization is not desired).
595
596see B<CAVEAT>.
597
598=item overrideCJK
599
600=item overrideHangul
601
602-- see 7.1 Derived Collation Elements, UTR #10.
603
604By default, mapping of CJK Unified Ideographs
605uses the Unicode codepoint order
606and Hangul Syllables are decomposed into Hangul Jamo.
607
608The mapping of CJK Unified Ideographs
609or Hangul Syllables may be overrided.
610
611ex. CJK Unified Ideographs in the JIS codepoint order.
612
613 overrideCJK => sub {
614 my $u = shift; # get unicode codepoint
615 my $b = pack('n', $u); # to UTF-16BE
616 my $s = your_unicode_to_sjis_converter($b); # convert
617 my $n = unpack('n', $s); # convert sjis to short
618 [ $n, 1, 1 ]; # return collation element
619 },
620
621If you want to override the mapping of Hangul Syllables,
622the Normalization Forms D and KD are not appropriate
623(they will be decomposed before overriding).
624
625=item preprocess
626
627-- see 5.1 Preprocessing, UTR #10.
628
629If specified, the coderef is used to preprocess
630before the formation of sort keys.
631
632ex. dropping English articles, such as "a" or "the".
633Then, "the pen" is before "a pencil".
634
635 preprocess => sub {
636 my $str = shift;
637 $str =~ s/\b(?:an?|the)\s+//g;
638 $str;
639 },
640
641=item rearrange
642
643-- see 3.1.3 Rearrangement, UTR #10.
644
645Characters that are not coded in logical order and to be rearranged.
646By default,
647
648 rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
649
650=item table
651
652-- see 3.2 Default Unicode Collation Element Table, UTR #10.
653
654You can use another element table if desired.
655The table file must be in your C<lib/Unicode/Collate> directory.
656
657By default, the file C<lib/Unicode/Collate/allkeys.txt> is used.
658
905aa9f0 659If undefined explicitly (as C<table =E<gt> undef>),
660no file is read (you'd define collation elements using L<entry>).
661
45394607 662=item undefName
663
664=item undefChar
665
666-- see 6.3.4 Reducing the Repertoire, UTR #10.
667
668Undefines the collation element as if it were unassigned in the table.
669This reduces the size of the table.
670If an unassigned character appears in the string to be collated,
671the sort key is made from its codepoint
672as a single-character collation element,
673as it is greater than any other assigned collation elements
674(in the codepoint order among the unassigned characters).
675But, it'd be better to ignore characters
676unfamiliar to you and maybe never used.
677
678=item katakana_before_hiragana
679
680=item upper_before_lower
681
682-- see 6.6 Case Comparisons; 7.3.1 Tertiary Weight Table, UTR #10.
683
684By default, lowercase is before uppercase
685and hiragana is before katakana.
686
687If the parameter is true, this is reversed.
688
689=back
690
691=head2 Other methods
692
693=over 4
694
5398038e 695=item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
45394607 696
697Sorts a list of strings.
698
5398038e 699=item C<$result = $Collator-E<gt>cmp($a, $b)>
45394607 700
701Returns 1 (when C<$a> is greater than C<$b>)
702or 0 (when C<$a> is equal to C<$b>)
703or -1 (when C<$a> is lesser than C<$b>).
704
5398038e 705=item C<$result = $Collator-E<gt>eq($a, $b)>
706
707=item C<$result = $Collator-E<gt>ne($a, $b)>
708
709=item C<$result = $Collator-E<gt>lt($a, $b)>
710
711=item C<$result = $Collator-E<gt>le($a, $b)>
712
713=item C<$result = $Collator-E<gt>gt($a, $b)>
714
715=item C<$result = $Collator-E<gt>ge($a, $b)>
716
717They works like the same name operators as theirs.
718
719 eq : whether $a is equal to $b.
720 ne : whether $a is not equal to $b.
721 lt : whether $a is lesser than $b.
722 le : whether $a is lesser than $b or equal to $b.
723 gt : whether $a is greater than $b.
724 ge : whether $a is greater than $b or equal to $b.
725
726=item C<$sortKey = $Collator-E<gt>getSortKey($string)>
45394607 727
728-- see 4.3 Form a sort key for each string, UTR #10.
729
730Returns a sort key.
731
732You compare the sort keys using a binary comparison
733and get the result of the comparison of the strings using UCA.
734
5398038e 735 $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
45394607 736
737 is equivalent to
738
5398038e 739 $Collator->cmp($a, $b)
45394607 740
5398038e 741=item C<$position = $Collator-E<gt>index($string, $substring)>
d16e9e3d 742
5398038e 743=item C<($position, $length) = $Collator-E<gt>index($string, $substring)>
d16e9e3d 744
745-- see 6.8 Searching, UTR #10.
746
747If C<$substring> matches a part of C<$string>, returns
748the position of the first occurrence of the matching part in scalar context;
749in list context, returns a two-element list of
750the position and the length of the matching part.
751
752B<Notice> that the length of the matching part may differ from
753the length of C<$substring>.
754
755B<Note> that the position and the length are counted on the string
756after the process of preprocess, normalization, and rearrangement.
757Therefore, in case the specified string is not binary equal to
758the preprocessed/normalized/rearranged string, the position and the length
759may differ form those on the specified string. But it is guaranteed
760that, if matched, it returns a non-negative value as C<$position>.
761
762If C<$substring> does not match any part of C<$string>,
763returns C<-1> in scalar context and
764an empty list in list context.
765
766e.g. you say
767
5398038e 768 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
d16e9e3d 769 my $str = "Ich mu\x{00DF} studieren.";
770 my $sub = "m\x{00FC}ss";
771 my $match;
5398038e 772 if(my($pos,$len) = $Collator->index($str, $sub)){
773 $match = substr($str, $pos, $len);
d16e9e3d 774 }
775
776and get C<"mu\x{00DF}"> in C<$match> since C<"mu>E<223>C<">
777is primary equal to C<"m>E<252>C<ss">.
778
45394607 779=back
780
781=head2 EXPORT
782
783None by default.
784
4a2e806c 785=head2 TODO
786
787Unicode::Collate has not been ported to EBCDIC. The code mostly would
788work just fine but a decision needs to be made: how the module should
789work in EBCDIC? Should the low 256 characters be understood as
790Unicode or as EBCDIC code points? Should one be chosen or should
791there be a way to do either? Or should such translation be left
792outside the module for the user to do, for example by using
793Encode::from_to()?
794
45394607 795=head2 CAVEAT
796
797Use of the C<normalization> parameter requires
798the B<Unicode::Normalize> module.
799
5398038e 800If you need not it (say, in the case when you need not
45394607 801handle any combining characters),
802assign C<normalization =E<gt> undef> explicitly.
803
5398038e 804-- see 6.5 Avoiding Normalization, UTR #10.
805
45394607 806=head1 AUTHOR
807
808SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
809
810 http://homepage1.nifty.com/nomenclator/perl/
811
812 Copyright(C) 2001, SADAHIRO Tomoyuki. Japan. All rights reserved.
813
814 This program is free software; you can redistribute it and/or
815 modify it under the same terms as Perl itself.
816
817=head1 SEE ALSO
818
819=over 4
820
5398038e 821=item Unicode Collation Algorithm - Unicode TR #10
45394607 822
5398038e 823http://www.unicode.org/unicode/reports/tr10/
45394607 824
825=item L<Unicode::Normalize>
826
827normalized forms of Unicode text
828
45394607 829=back
830
831=cut