lib/AnyDBM_File.t incorrectly passes even without any dbm
[p5sagit/p5-mst-13.2.git] / lib / Unicode / Collate.pm
CommitLineData
45394607 1package Unicode::Collate;
2
4a2e806c 3BEGIN {
4 if (ord("A") == 193) {
5 die "Unicode::Collate not ported to EBCDIC\n";
6 }
7}
8
45394607 9use 5.006;
10use strict;
11use warnings;
12use Carp;
5398038e 13
45394607 14require Exporter;
15
905aa9f0 16our $VERSION = '0.10';
45394607 17our $PACKAGE = __PACKAGE__;
18
19our @ISA = qw(Exporter);
20
21our %EXPORT_TAGS = ();
22our @EXPORT_OK = ();
23our @EXPORT = ();
24
25(our $Path = $INC{'Unicode/Collate.pm'}) =~ s/\.pm$//;
26our $KeyFile = "allkeys.txt";
27
5398038e 28our $getCombinClass; # coderef for combining class from Unicode::Normalize
45394607 29
30use constant Min2 => 0x20; # minimum weight at level 2
31use constant Min3 => 0x02; # minimum weight at level 3
32use constant UNDEFINED => 0xFF80; # special value for undefined CE
33
34##
35## constructor
36##
37sub new
38{
39 my $class = shift;
40 my $self = bless { @_ }, $class;
41
42 # alternate
43 $self->{alternate} =
44 ! exists $self->{alternate} ? 'shifted' :
45 ! defined $self->{alternate} ? '' : $self->{alternate};
46
47 # collation level
5398038e 48 $self->{level} ||= ($self->{alternate} =~ /^shift/ ? 4 : 3);
45394607 49
50 # normalization form
51 $self->{normalization} = 'D' if ! exists $self->{normalization};
52
5398038e 53 if(defined $self->{normalization}){
54 eval "use Unicode::Normalize;";
55 croak "you'd install Unicode::Normalize for normalization forms: $@"
56 if $@;
57 $getCombinClass = \&Unicode::Normalize::getCombinClass
58 if ! $getCombinClass;
59 }
45394607 60
5398038e 61 $self->{UNF} =
45394607 62 ! defined $self->{normalization} ? undef :
63 $self->{normalization} =~ /^(?:NF)?C$/ ? \&NFC :
64 $self->{normalization} =~ /^(?:NF)?D$/ ? \&NFD :
65 $self->{normalization} =~ /^(?:NF)?KC$/ ? \&NFKC :
66 $self->{normalization} =~ /^(?:NF)?KD$/ ? \&NFKD :
67 croak "$PACKAGE unknown normalization form name: $self->{normalization}";
68
45394607 69 # backwards
70 $self->{backwards} ||= [];
71 $self->{backwards} = [ $self->{backwards} ] if ! ref $self->{backwards};
72
73 # rearrange
74 $self->{rearrange} ||= []; # maybe not U+0000 (an ASCII)
75 $self->{rearrange} = [ $self->{rearrange} ] if ! ref $self->{rearrange};
76
905aa9f0 77 # open a table file.
78 # if undef is passed explicitly, no file is read.
79 $self->{table} = $KeyFile unless exists $self->{table};
80 $self->read_table if defined $self->{table};
81
82 if($self->{entry}){
83 $self->parseEntry($_) foreach split /\n/, $self->{entry};
84 }
85
86 # keys of $self->{rearrangeHash} are $self->{rearrange}.
87 $self->{rearrangeHash} = {};
88 @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
89
90 return $self;
91}
92
93
94sub read_table {
95 my $self = shift;
96 my $file = $self->{table} ne '' ? $self->{table} : $KeyFile;
45394607 97 open my $fk, "<$Path/$file" or croak "File does not exist at $Path/$file";
98
99 while(<$fk>){
100 next if /^\s*#/;
101 if(/^\s*\@/){
102 if(/^\@version\s*(\S*)/){
103 $self->{version} ||= $1;
104 }
105 elsif(/^\@alternate\s+(.*)/){
106 $self->{alternate} ||= $1;
107 }
108 elsif(/^\@backwards\s+(.*)/){
109 push @{ $self->{backwards} }, $1;
110 }
111 elsif(/^\@rearrange\s+(.*)/){
112 push @{ $self->{rearrange} }, _getHexArray($1);
113 }
114 next;
115 }
116 $self->parseEntry($_);
117 }
118 close $fk;
45394607 119}
120
905aa9f0 121
45394607 122##
123## get $line, parse it, and write an entry in $self
124##
125sub parseEntry
126{
127 my $self = shift;
128 my $line = shift;
129 my($name, $ele, @key);
130
131 return if $line !~ /^\s*[0-9A-Fa-f]/;
132
133 # get name
134 $name = $1 if $line =~ s/#\s*(.*)//;
135 return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
136
137 # get element
138 my($e, $k) = split /;/, $line;
139 my @e = _getHexArray($e);
872c91ae 140 { no warnings 'utf8'; $ele = pack('U*', @e); }
45394607 141 return if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
142
143 # get sort key
872c91ae 144 { no warnings 'utf8';
145 if(
45394607 146 defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/ ||
147 defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/
872c91ae 148 )
149 {
150 $self->{entries}{$ele} = $self->{ignored}{$ele} = 1;
151 }
152 else
153 {
154 foreach my $arr ($k =~ /\[(\S+)\]/g) {
155 my $var = $arr =~ /\*/;
156 push @key, $self->altCE( $var, _getHexArray($arr) );
157 }
158 $self->{entries}{$ele} = \@key;
45394607 159 }
45394607 160 }
161 $self->{maxlength}{ord $ele} = scalar @e if @e > 1;
162}
163
164
165##
d16e9e3d 166## arrayref CE = altCE(bool variable?, list[num] weights)
45394607 167##
d16e9e3d 168sub altCE
45394607 169{
170 my $self = shift;
171 my $var = shift;
172 my @c = @_;
173
174 $self->{alternate} eq 'blanked' ?
d16e9e3d 175 $var ? [0,0,0] : [ @c[0..2] ] :
176 $self->{alternate} eq 'non-ignorable' ?
177 [ @c[0..2] ] :
45394607 178 $self->{alternate} eq 'shifted' ?
179 $var ? [0,0,0,$c[0] ] : [ @c[0..2], $c[0]+$c[1]+$c[2] ? 0xFFFF : 0 ] :
180 $self->{alternate} eq 'shift-trimmed' ?
181 $var ? [0,0,0,$c[0] ] : [ @c[0..2], 0 ] :
182 \@c;
183}
184
185##
d16e9e3d 186## string hex_sortkey = splitCE(string arg)
45394607 187##
188sub viewSortKey
189{
190 my $self = shift;
191 my $key = $self->getSortKey(@_);
192 my $view = join " ", map sprintf("%04X", $_), unpack 'n*', $key;
193 $view =~ s/ ?0000 ?/|/g;
194 "[$view]";
195}
196
d16e9e3d 197
45394607 198##
d16e9e3d 199## list[strings] elements = splitCE(string arg)
45394607 200##
d16e9e3d 201sub splitCE
45394607 202{
203 my $self = shift;
204 my $code = $self->{preprocess};
5398038e 205 my $norm = $self->{UNF};
45394607 206 my $ent = $self->{entries};
45394607 207 my $max = $self->{maxlength};
45394607 208 my $rear = $self->{rearrangeHash};
209
210 my $str = ref $code ? &$code(shift) : shift;
211 $str = &$norm($str) if ref $norm;
212
213 my(@src, @buf);
214 @src = unpack('U*', $str);
215
216 # rearrangement
217 for(my $i = 0; $i < @src; $i++)
218 {
219 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i])
220 if $rear->{ $src[$i] };
221 $i++;
222 }
223
224 for(my $i = 0; $i < @src; $i++)
225 {
226 my $ch;
227 my $u = $src[$i];
228
229 # non-characters
905aa9f0 230 next unless defined $u;
45394607 231 next if $u < 0 || 0x10FFFF < $u # out of range
232 || 0xD800 < $u && $u < 0xDFFF; # unpaired surrogates
233 my $four = $u & 0xFFFF;
234 next if $four == 0xFFFE || $four == 0xFFFF;
235
236 if($max->{$u}) # contract
237 {
238 for(my $j = $max->{$u}; $j >= 1; $j--)
239 {
240 next unless $i+$j-1 < @src;
241 $ch = pack 'U*', @src[$i .. $i+$j-1];
242 $i += $j-1, last if $ent->{$ch};
243 }
244 }
245 else { $ch = pack('U', $u) }
246
5398038e 247 # with Combining Char (UTS#10, 4.2.1), here requires Unicode::Normalize.
248 if($getCombinClass && defined $ch)
45394607 249 {
905aa9f0 250 for(my $j = $i+1; $j < @src; $j++)
45394607 251 {
905aa9f0 252 next unless defined $src[$j];
253 last unless $getCombinClass->( $src[$j] );
45394607 254 my $comb = pack 'U', $src[$j];
255 next if ! $ent->{ $ch.$comb };
256 $ch .= $comb;
905aa9f0 257 $src[$j] = undef;
45394607 258 }
259 }
d16e9e3d 260 push @buf, $ch;
261 }
262 wantarray ? @buf : \@buf;
263}
45394607 264
d16e9e3d 265
266##
267## list[arrayrefs] weight = getWt(string element)
268##
269sub getWt
270{
271 my $self = shift;
272 my $ch = shift;
273 my $ent = $self->{entries};
274 my $ign = $self->{ignored};
275 my $cjk = $self->{overrideCJK};
276 my $hang = $self->{overrideHangul};
277 return if !defined $ch || $ign->{$ch}; # ignored
278 return @{ $ent->{$ch} } if $ent->{$ch};
279 my $u = unpack('U', $ch);
280 return
281 _isHangul($u)
282 ? $hang
283 ? &$hang($u)
5398038e 284 : map(@{ $ent->{pack('U', $_)} }, _decompHangul($u))
d16e9e3d 285 : _isCJK($u)
286 ? $cjk ? &$cjk($u) : map($self->altCE(0,@$_), _CJK($u))
287 : map($self->altCE(0,@$_), _derivCE($u));
288}
289
290##
291## int = index(string, substring)
292##
293sub index
294{
295 my $self = shift;
296 my $lev = $self->{level};
297 my $str = $self->splitCE(shift);
298 my $sub = $self->splitCE(shift);
299
300 return wantarray ? (0,0) : 0 if ! @$sub;
301 return wantarray ? () : -1 if ! @$str;
302
303 my @subWt = grep _ignorableAtLevel($_,$lev),
304 map $self->getWt($_), @$sub;
305
306 my(@strWt,@strPt);
307 my $count = 0;
308 for my $e (@$str){
309 my @tmp = grep _ignorableAtLevel($_,$lev), $self->getWt($e);
310 push @strWt, @tmp;
311 push @strPt, ($count) x @tmp;
312 $count += length $e;
313 while(@strWt >= @subWt){
314 if(_eqArray(\@strWt, \@subWt, $lev)){
315 my $pos = $strPt[0];
316 return wantarray ? ($pos, $count-$pos) : $pos;
317 }
318 shift @strWt;
319 shift @strPt;
320 }
321 }
322 return wantarray ? () : -1;
323}
324
325##
326## bool _eqArray(arrayref, arrayref, level)
327##
328sub _eqArray($$$)
329{
330 my $a = shift; # length $a >= length $b;
331 my $b = shift;
332 my $lev = shift;
333 for my $v (0..$lev-1){
334 for my $c (0..@$b-1){
335 return if $a->[$c][$v] != $b->[$c][$v];
336 }
45394607 337 }
d16e9e3d 338 return 1;
339}
340
341
342##
343## bool _ignorableAtLevel(CE, level)
344##
345sub _ignorableAtLevel($$)
346{
347 my $ce = shift;
348 return if ! defined $ce;
349 my $lv = shift;
350 ! grep { ! $ce->[$_] } 0..$lv-1;
351}
352
353
354##
355## string sortkey = getSortKey(string arg)
356##
357sub getSortKey
358{
359 my $self = shift;
360 my $lev = $self->{level};
361 my $rCE = $self->splitCE(shift); # get an arrayref
362
363 # weight arrays
364 my @buf = grep defined(), map $self->getWt($_), @$rCE;
45394607 365
366 # make sort key
367 my @ret = ([],[],[],[]);
368 foreach my $v (0..$lev-1){
369 foreach my $b (@buf){
370 push @{ $ret[$v] }, $b->[$v] if $b->[$v];
371 }
372 }
373 foreach (@{ $self->{backwards} }){
374 my $v = $_ - 1;
375 @{ $ret[$v] } = reverse @{ $ret[$v] };
376 }
377
378 # modification of tertiary weights
379 if($self->{upper_before_lower}){
380 foreach (@{ $ret[2] }){
381 if (0x8 <= $_ && $_ <= 0xC){ $_ -= 6 } # lower
382 elsif(0x2 <= $_ && $_ <= 0x6){ $_ += 6 } # upper
383 elsif($_ == 0x1C) { $_ += 1 } # square upper
384 elsif($_ == 0x1D) { $_ -= 1 } # square lower
385 }
386 }
387 if($self->{katakana_before_hiragana}){
388 foreach (@{ $ret[2] }){
389 if (0x0F <= $_ && $_ <= 0x13){ $_ -= 2 } # katakana
390 elsif(0x0D <= $_ && $_ <= 0x0E){ $_ += 5 } # hiragana
391 }
392 }
393 join "\0\0", map pack('n*', @$_), @ret;
394}
395
396
397##
d16e9e3d 398## int compare = cmp(string a, string b)
45394607 399##
5398038e 400sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
401sub eq { $_[0]->getSortKey($_[1]) eq $_[0]->getSortKey($_[2]) }
402sub ne { $_[0]->getSortKey($_[1]) ne $_[0]->getSortKey($_[2]) }
403sub lt { $_[0]->getSortKey($_[1]) lt $_[0]->getSortKey($_[2]) }
404sub le { $_[0]->getSortKey($_[1]) le $_[0]->getSortKey($_[2]) }
405sub gt { $_[0]->getSortKey($_[1]) gt $_[0]->getSortKey($_[2]) }
406sub ge { $_[0]->getSortKey($_[1]) ge $_[0]->getSortKey($_[2]) }
45394607 407
408##
d16e9e3d 409## list[strings] sorted = sort(list[strings] arg)
45394607 410##
411sub sort
412{
413 my $obj = shift;
414
415 map { $_->[1] }
416 sort{ $a->[0] cmp $b->[0] }
417 map [ $obj->getSortKey($_), $_ ], @_;
418}
419
420##
d16e9e3d 421## list[arrayrefs] CE = _derivCE(int codepoint)
45394607 422##
423sub _derivCE
424{
425 my $code = shift;
426 my $a = UNDEFINED + ($code >> 15); # ok
427 my $b = ($code & 0x7FFF) | 0x8000; # ok
428# my $a = 0xFFC2 + ($code >> 15); # ng
429# my $b = $code & 0x7FFF | 0x1000; # ng
430 $b ? ([$a,2,1,$code],[$b,0,0,$code]) : [$a,2,1,$code];
431}
432
433##
434## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
435##
5398038e 436sub _getHexArray { map hex(), $_[0] =~ /([0-9a-fA-F]+)/g }
45394607 437
438##
d16e9e3d 439## bool is_a_CJK_Unified_Ideograph = _isCJK(int codepoint)
45394607 440##
441sub _isCJK
442{
443 my $u = shift;
444 return 0x3400 <= $u && $u <= 0x4DB5
445 || 0x4E00 <= $u && $u <= 0x9FA5
446# || 0x20000 <= $u && $u <= 0x2A6D6;
447}
448
449##
d16e9e3d 450## list[arrayref] CE = _CJK(int codepoint_of_CJK)
45394607 451##
5398038e 452sub _CJK { $_[0] > 0xFFFF ? _derivCE($_[0]) : [$_[0], 0x20, 0x02, $_[0]] }
45394607 453
454##
d16e9e3d 455## bool is_a_Hangul_Syllable = _isHangul(int codepoint)
45394607 456##
5398038e 457sub _isHangul { return 0xAC00 <= $_[0] && $_[0] <= 0xD7A3 }
458
459sub _decompHangul {
460 my $code = shift;
461 # $code must be in Hangul syllable. check it before you enter here.
462 my $SIndex = $code - 0xAC00;
463 my $LIndex = int( $SIndex / 588);
464 my $VIndex = int(($SIndex % 588) / 28);
465 my $TIndex = $SIndex % 28;
466 return (
467 0x1100 + $LIndex,
468 0x1161 + $VIndex,
469 $TIndex ? (0x11A7 + $TIndex) : (),
470 );
45394607 471}
472
4731;
474__END__
475
476=head1 NAME
477
478Unicode::Collate - use UCA (Unicode Collation Algorithm)
479
480=head1 SYNOPSIS
481
482 use Unicode::Collate;
483
484 #construct
5398038e 485 $Collator = Unicode::Collate->new(%tailoring);
45394607 486
487 #sort
5398038e 488 @sorted = $Collator->sort(@not_sorted);
45394607 489
490 #compare
5398038e 491 $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
45394607 492
493=head1 DESCRIPTION
494
495=head2 Constructor and Tailoring
496
d16e9e3d 497The C<new> method returns a collator object.
498
5398038e 499 $Collator = Unicode::Collate->new(
45394607 500 alternate => $alternate,
501 backwards => $levelNumber, # or \@levelNumbers
502 entry => $element,
503 normalization => $normalization_form,
504 ignoreName => qr/$ignoreName/,
505 ignoreChar => qr/$ignoreChar/,
506 katakana_before_hiragana => $bool,
507 level => $collationLevel,
508 overrideCJK => \&overrideCJK,
509 overrideHangul => \&overrideHangul,
510 preprocess => \&preprocess,
511 rearrange => \@charList,
512 table => $filename,
513 undefName => qr/$undefName/,
514 undefChar => qr/$undefChar/,
515 upper_before_lower => $bool,
516 );
517 # if %tailoring is false (empty),
5398038e 518 # $Collator should do the default collation.
45394607 519
520=over 4
521
522=item alternate
523
524-- see 3.2.2 Alternate Weighting, UTR #10.
525
526 alternate => 'shifted', 'blanked', 'non-ignorable', or 'shift-trimmed'.
527
528By default (if specification is omitted), 'shifted' is adopted.
529
530=item backwards
531
532-- see 3.1.2 French Accents, UTR #10.
533
534 backwards => $levelNumber or \@levelNumbers
535
536Weights in reverse order; ex. level 2 (diacritic ordering) in French.
537If omitted, forwards at all the levels.
538
539=item entry
540
541-- see 3.1 Linguistic Features; 3.2.1 File Format, UTR #10.
542
905aa9f0 543Overrides a default order or adds a new collation element
45394607 544
545 entry => <<'ENTRIES', # use the UCA file format
54600E6 ; [.0861.0020.0002.00E6] [.08B1.0020.0002.00E6] # ligature <ae> as <a e>
5470063 0068 ; [.0893.0020.0002.0063] # "ch" in traditional Spanish
5480043 0068 ; [.0893.0020.0008.0043] # "Ch" in traditional Spanish
549ENTRIES
550
551=item ignoreName
552
553=item ignoreChar
554
555-- see Completely Ignorable, 3.2.2 Alternate Weighting, UTR #10.
556
557Ignores the entry in the table.
558If an ignored collation element appears in the string to be collated,
559it is ignored as if the element had been deleted from there.
560
561E.g. when 'a' and 'e' are ignored,
562'element' is equal to 'lament' (or 'lmnt').
563
564=item level
565
566-- see 4.3 Form a sort key for each string, UTR #10.
567
568Set the maximum level.
569Any higher levels than the specified one are ignored.
570
571 Level 1: alphabetic ordering
572 Level 2: diacritic ordering
573 Level 3: case ordering
574 Level 4: tie-breaking (e.g. in the case when alternate is 'shifted')
575
576 ex.level => 2,
577
578=item normalization
579
580-- see 4.1 Normalize each input string, UTR #10.
581
905aa9f0 582If specified, strings are normalized before preparation of sort keys
45394607 583(the normalization is executed after preprocess).
584
585As a form name, one of the following names must be used.
586
587 'C' or 'NFC' for Normalization Form C
588 'D' or 'NFD' for Normalization Form D
589 'KC' or 'NFKC' for Normalization Form KC
590 'KD' or 'NFKD' for Normalization Form KD
591
592If omitted, the string is put into Normalization Form D.
593
594If undefined explicitly (as C<normalization =E<gt> undef>),
595any normalization is not carried out (this may make tailoring easier
596if any normalization is not desired).
597
598see B<CAVEAT>.
599
600=item overrideCJK
601
602=item overrideHangul
603
604-- see 7.1 Derived Collation Elements, UTR #10.
605
606By default, mapping of CJK Unified Ideographs
607uses the Unicode codepoint order
608and Hangul Syllables are decomposed into Hangul Jamo.
609
610The mapping of CJK Unified Ideographs
611or Hangul Syllables may be overrided.
612
613ex. CJK Unified Ideographs in the JIS codepoint order.
614
615 overrideCJK => sub {
616 my $u = shift; # get unicode codepoint
617 my $b = pack('n', $u); # to UTF-16BE
618 my $s = your_unicode_to_sjis_converter($b); # convert
619 my $n = unpack('n', $s); # convert sjis to short
620 [ $n, 1, 1 ]; # return collation element
621 },
622
623If you want to override the mapping of Hangul Syllables,
624the Normalization Forms D and KD are not appropriate
625(they will be decomposed before overriding).
626
627=item preprocess
628
629-- see 5.1 Preprocessing, UTR #10.
630
631If specified, the coderef is used to preprocess
632before the formation of sort keys.
633
634ex. dropping English articles, such as "a" or "the".
635Then, "the pen" is before "a pencil".
636
637 preprocess => sub {
638 my $str = shift;
639 $str =~ s/\b(?:an?|the)\s+//g;
640 $str;
641 },
642
643=item rearrange
644
645-- see 3.1.3 Rearrangement, UTR #10.
646
647Characters that are not coded in logical order and to be rearranged.
648By default,
649
650 rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
651
652=item table
653
654-- see 3.2 Default Unicode Collation Element Table, UTR #10.
655
656You can use another element table if desired.
657The table file must be in your C<lib/Unicode/Collate> directory.
658
659By default, the file C<lib/Unicode/Collate/allkeys.txt> is used.
660
905aa9f0 661If undefined explicitly (as C<table =E<gt> undef>),
662no file is read (you'd define collation elements using L<entry>).
663
45394607 664=item undefName
665
666=item undefChar
667
668-- see 6.3.4 Reducing the Repertoire, UTR #10.
669
670Undefines the collation element as if it were unassigned in the table.
671This reduces the size of the table.
672If an unassigned character appears in the string to be collated,
673the sort key is made from its codepoint
674as a single-character collation element,
675as it is greater than any other assigned collation elements
676(in the codepoint order among the unassigned characters).
677But, it'd be better to ignore characters
678unfamiliar to you and maybe never used.
679
680=item katakana_before_hiragana
681
682=item upper_before_lower
683
684-- see 6.6 Case Comparisons; 7.3.1 Tertiary Weight Table, UTR #10.
685
686By default, lowercase is before uppercase
687and hiragana is before katakana.
688
689If the parameter is true, this is reversed.
690
691=back
692
693=head2 Other methods
694
695=over 4
696
5398038e 697=item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
45394607 698
699Sorts a list of strings.
700
5398038e 701=item C<$result = $Collator-E<gt>cmp($a, $b)>
45394607 702
703Returns 1 (when C<$a> is greater than C<$b>)
704or 0 (when C<$a> is equal to C<$b>)
705or -1 (when C<$a> is lesser than C<$b>).
706
5398038e 707=item C<$result = $Collator-E<gt>eq($a, $b)>
708
709=item C<$result = $Collator-E<gt>ne($a, $b)>
710
711=item C<$result = $Collator-E<gt>lt($a, $b)>
712
713=item C<$result = $Collator-E<gt>le($a, $b)>
714
715=item C<$result = $Collator-E<gt>gt($a, $b)>
716
717=item C<$result = $Collator-E<gt>ge($a, $b)>
718
719They works like the same name operators as theirs.
720
721 eq : whether $a is equal to $b.
722 ne : whether $a is not equal to $b.
723 lt : whether $a is lesser than $b.
724 le : whether $a is lesser than $b or equal to $b.
725 gt : whether $a is greater than $b.
726 ge : whether $a is greater than $b or equal to $b.
727
728=item C<$sortKey = $Collator-E<gt>getSortKey($string)>
45394607 729
730-- see 4.3 Form a sort key for each string, UTR #10.
731
732Returns a sort key.
733
734You compare the sort keys using a binary comparison
735and get the result of the comparison of the strings using UCA.
736
5398038e 737 $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
45394607 738
739 is equivalent to
740
5398038e 741 $Collator->cmp($a, $b)
45394607 742
5398038e 743=item C<$position = $Collator-E<gt>index($string, $substring)>
d16e9e3d 744
5398038e 745=item C<($position, $length) = $Collator-E<gt>index($string, $substring)>
d16e9e3d 746
747-- see 6.8 Searching, UTR #10.
748
749If C<$substring> matches a part of C<$string>, returns
750the position of the first occurrence of the matching part in scalar context;
751in list context, returns a two-element list of
752the position and the length of the matching part.
753
754B<Notice> that the length of the matching part may differ from
755the length of C<$substring>.
756
757B<Note> that the position and the length are counted on the string
758after the process of preprocess, normalization, and rearrangement.
759Therefore, in case the specified string is not binary equal to
760the preprocessed/normalized/rearranged string, the position and the length
761may differ form those on the specified string. But it is guaranteed
762that, if matched, it returns a non-negative value as C<$position>.
763
764If C<$substring> does not match any part of C<$string>,
765returns C<-1> in scalar context and
766an empty list in list context.
767
768e.g. you say
769
5398038e 770 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
d16e9e3d 771 my $str = "Ich mu\x{00DF} studieren.";
772 my $sub = "m\x{00FC}ss";
773 my $match;
5398038e 774 if(my($pos,$len) = $Collator->index($str, $sub)){
775 $match = substr($str, $pos, $len);
d16e9e3d 776 }
777
778and get C<"mu\x{00DF}"> in C<$match> since C<"mu>E<223>C<">
779is primary equal to C<"m>E<252>C<ss">.
780
45394607 781=back
782
783=head2 EXPORT
784
785None by default.
786
4a2e806c 787=head2 TODO
788
789Unicode::Collate has not been ported to EBCDIC. The code mostly would
790work just fine but a decision needs to be made: how the module should
791work in EBCDIC? Should the low 256 characters be understood as
792Unicode or as EBCDIC code points? Should one be chosen or should
793there be a way to do either? Or should such translation be left
794outside the module for the user to do, for example by using
795Encode::from_to()?
796
45394607 797=head2 CAVEAT
798
799Use of the C<normalization> parameter requires
800the B<Unicode::Normalize> module.
801
5398038e 802If you need not it (say, in the case when you need not
45394607 803handle any combining characters),
804assign C<normalization =E<gt> undef> explicitly.
805
5398038e 806-- see 6.5 Avoiding Normalization, UTR #10.
807
45394607 808=head1 AUTHOR
809
810SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
811
812 http://homepage1.nifty.com/nomenclator/perl/
813
814 Copyright(C) 2001, SADAHIRO Tomoyuki. Japan. All rights reserved.
815
816 This program is free software; you can redistribute it and/or
817 modify it under the same terms as Perl itself.
818
819=head1 SEE ALSO
820
821=over 4
822
5398038e 823=item Unicode Collation Algorithm - Unicode TR #10
45394607 824
5398038e 825http://www.unicode.org/unicode/reports/tr10/
45394607 826
827=item L<Unicode::Normalize>
828
829normalized forms of Unicode text
830
45394607 831=back
832
833=cut