Allow several arguments to display().
[p5sagit/p5-mst-13.2.git] / lib / Unicode / Collate.pm
CommitLineData
45394607 1package Unicode::Collate;
2
3use 5.006;
4use strict;
5use warnings;
6use Carp;
5398038e 7
45394607 8require Exporter;
9
5398038e 10our $VERSION = '0.09';
45394607 11our $PACKAGE = __PACKAGE__;
12
13our @ISA = qw(Exporter);
14
15our %EXPORT_TAGS = ();
16our @EXPORT_OK = ();
17our @EXPORT = ();
18
19(our $Path = $INC{'Unicode/Collate.pm'}) =~ s/\.pm$//;
20our $KeyFile = "allkeys.txt";
21
5398038e 22our $getCombinClass; # coderef for combining class from Unicode::Normalize
45394607 23
24use constant Min2 => 0x20; # minimum weight at level 2
25use constant Min3 => 0x02; # minimum weight at level 3
26use constant UNDEFINED => 0xFF80; # special value for undefined CE
27
28##
29## constructor
30##
31sub new
32{
33 my $class = shift;
34 my $self = bless { @_ }, $class;
35
36 # alternate
37 $self->{alternate} =
38 ! exists $self->{alternate} ? 'shifted' :
39 ! defined $self->{alternate} ? '' : $self->{alternate};
40
41 # collation level
5398038e 42 $self->{level} ||= ($self->{alternate} =~ /^shift/ ? 4 : 3);
45394607 43
44 # normalization form
45 $self->{normalization} = 'D' if ! exists $self->{normalization};
46
5398038e 47 if(defined $self->{normalization}){
48 eval "use Unicode::Normalize;";
49 croak "you'd install Unicode::Normalize for normalization forms: $@"
50 if $@;
51 $getCombinClass = \&Unicode::Normalize::getCombinClass
52 if ! $getCombinClass;
53 }
45394607 54
5398038e 55 $self->{UNF} =
45394607 56 ! defined $self->{normalization} ? undef :
57 $self->{normalization} =~ /^(?:NF)?C$/ ? \&NFC :
58 $self->{normalization} =~ /^(?:NF)?D$/ ? \&NFD :
59 $self->{normalization} =~ /^(?:NF)?KC$/ ? \&NFKC :
60 $self->{normalization} =~ /^(?:NF)?KD$/ ? \&NFKD :
61 croak "$PACKAGE unknown normalization form name: $self->{normalization}";
62
45394607 63 # backwards
64 $self->{backwards} ||= [];
65 $self->{backwards} = [ $self->{backwards} ] if ! ref $self->{backwards};
66
67 # rearrange
68 $self->{rearrange} ||= []; # maybe not U+0000 (an ASCII)
69 $self->{rearrange} = [ $self->{rearrange} ] if ! ref $self->{rearrange};
70
71 # open the table file
72 my $file = defined $self->{table} ? $self->{table} : $KeyFile;
73 open my $fk, "<$Path/$file" or croak "File does not exist at $Path/$file";
74
75 while(<$fk>){
76 next if /^\s*#/;
77 if(/^\s*\@/){
78 if(/^\@version\s*(\S*)/){
79 $self->{version} ||= $1;
80 }
81 elsif(/^\@alternate\s+(.*)/){
82 $self->{alternate} ||= $1;
83 }
84 elsif(/^\@backwards\s+(.*)/){
85 push @{ $self->{backwards} }, $1;
86 }
87 elsif(/^\@rearrange\s+(.*)/){
88 push @{ $self->{rearrange} }, _getHexArray($1);
89 }
90 next;
91 }
92 $self->parseEntry($_);
93 }
94 close $fk;
95 if($self->{entry}){
96 $self->parseEntry($_) foreach split /\n/, $self->{entry};
97 }
98
99 # keys of $self->{rearrangeHash} are $self->{rearrange}.
100 $self->{rearrangeHash} = {};
101 @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
102
103 return $self;
104}
105
106##
107## get $line, parse it, and write an entry in $self
108##
109sub parseEntry
110{
111 my $self = shift;
112 my $line = shift;
113 my($name, $ele, @key);
114
115 return if $line !~ /^\s*[0-9A-Fa-f]/;
116
117 # get name
118 $name = $1 if $line =~ s/#\s*(.*)//;
119 return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
120
121 # get element
122 my($e, $k) = split /;/, $line;
123 my @e = _getHexArray($e);
124 $ele = pack('U*', @e);
125 return if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
126
127 # get sort key
128 if(
129 defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/ ||
130 defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/
131 )
132 {
d16e9e3d 133 $self->{entries}{$ele} = $self->{ignored}{$ele} = 1;
45394607 134 }
135 else
136 {
137 foreach my $arr ($k =~ /\[(\S+)\]/g) {
138 my $var = $arr =~ /\*/;
d16e9e3d 139 push @key, $self->altCE( $var, _getHexArray($arr) );
45394607 140 }
141 $self->{entries}{$ele} = \@key;
142 }
143 $self->{maxlength}{ord $ele} = scalar @e if @e > 1;
144}
145
146
147##
d16e9e3d 148## arrayref CE = altCE(bool variable?, list[num] weights)
45394607 149##
d16e9e3d 150sub altCE
45394607 151{
152 my $self = shift;
153 my $var = shift;
154 my @c = @_;
155
156 $self->{alternate} eq 'blanked' ?
d16e9e3d 157 $var ? [0,0,0] : [ @c[0..2] ] :
158 $self->{alternate} eq 'non-ignorable' ?
159 [ @c[0..2] ] :
45394607 160 $self->{alternate} eq 'shifted' ?
161 $var ? [0,0,0,$c[0] ] : [ @c[0..2], $c[0]+$c[1]+$c[2] ? 0xFFFF : 0 ] :
162 $self->{alternate} eq 'shift-trimmed' ?
163 $var ? [0,0,0,$c[0] ] : [ @c[0..2], 0 ] :
164 \@c;
165}
166
167##
d16e9e3d 168## string hex_sortkey = splitCE(string arg)
45394607 169##
170sub viewSortKey
171{
172 my $self = shift;
173 my $key = $self->getSortKey(@_);
174 my $view = join " ", map sprintf("%04X", $_), unpack 'n*', $key;
175 $view =~ s/ ?0000 ?/|/g;
176 "[$view]";
177}
178
d16e9e3d 179
45394607 180##
d16e9e3d 181## list[strings] elements = splitCE(string arg)
45394607 182##
d16e9e3d 183sub splitCE
45394607 184{
185 my $self = shift;
186 my $code = $self->{preprocess};
5398038e 187 my $norm = $self->{UNF};
45394607 188 my $ent = $self->{entries};
45394607 189 my $max = $self->{maxlength};
45394607 190 my $rear = $self->{rearrangeHash};
191
192 my $str = ref $code ? &$code(shift) : shift;
193 $str = &$norm($str) if ref $norm;
194
195 my(@src, @buf);
196 @src = unpack('U*', $str);
197
198 # rearrangement
199 for(my $i = 0; $i < @src; $i++)
200 {
201 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i])
202 if $rear->{ $src[$i] };
203 $i++;
204 }
205
206 for(my $i = 0; $i < @src; $i++)
207 {
208 my $ch;
209 my $u = $src[$i];
210
211 # non-characters
212 next if $u < 0 || 0x10FFFF < $u # out of range
213 || 0xD800 < $u && $u < 0xDFFF; # unpaired surrogates
214 my $four = $u & 0xFFFF;
215 next if $four == 0xFFFE || $four == 0xFFFF;
216
217 if($max->{$u}) # contract
218 {
219 for(my $j = $max->{$u}; $j >= 1; $j--)
220 {
221 next unless $i+$j-1 < @src;
222 $ch = pack 'U*', @src[$i .. $i+$j-1];
223 $i += $j-1, last if $ent->{$ch};
224 }
225 }
226 else { $ch = pack('U', $u) }
227
5398038e 228 # with Combining Char (UTS#10, 4.2.1), here requires Unicode::Normalize.
229 if($getCombinClass && defined $ch)
45394607 230 {
5398038e 231 for(my $j = $i+1; $j < @src && $getCombinClass->( $src[$j] ); $j++)
45394607 232 {
233 my $comb = pack 'U', $src[$j];
234 next if ! $ent->{ $ch.$comb };
235 $ch .= $comb;
236 splice(@src, $j, 1);
45394607 237 }
238 }
d16e9e3d 239 push @buf, $ch;
240 }
241 wantarray ? @buf : \@buf;
242}
45394607 243
d16e9e3d 244
245##
246## list[arrayrefs] weight = getWt(string element)
247##
248sub getWt
249{
250 my $self = shift;
251 my $ch = shift;
252 my $ent = $self->{entries};
253 my $ign = $self->{ignored};
254 my $cjk = $self->{overrideCJK};
255 my $hang = $self->{overrideHangul};
256 return if !defined $ch || $ign->{$ch}; # ignored
257 return @{ $ent->{$ch} } if $ent->{$ch};
258 my $u = unpack('U', $ch);
259 return
260 _isHangul($u)
261 ? $hang
262 ? &$hang($u)
5398038e 263 : map(@{ $ent->{pack('U', $_)} }, _decompHangul($u))
d16e9e3d 264 : _isCJK($u)
265 ? $cjk ? &$cjk($u) : map($self->altCE(0,@$_), _CJK($u))
266 : map($self->altCE(0,@$_), _derivCE($u));
267}
268
269##
270## int = index(string, substring)
271##
272sub index
273{
274 my $self = shift;
275 my $lev = $self->{level};
276 my $str = $self->splitCE(shift);
277 my $sub = $self->splitCE(shift);
278
279 return wantarray ? (0,0) : 0 if ! @$sub;
280 return wantarray ? () : -1 if ! @$str;
281
282 my @subWt = grep _ignorableAtLevel($_,$lev),
283 map $self->getWt($_), @$sub;
284
285 my(@strWt,@strPt);
286 my $count = 0;
287 for my $e (@$str){
288 my @tmp = grep _ignorableAtLevel($_,$lev), $self->getWt($e);
289 push @strWt, @tmp;
290 push @strPt, ($count) x @tmp;
291 $count += length $e;
292 while(@strWt >= @subWt){
293 if(_eqArray(\@strWt, \@subWt, $lev)){
294 my $pos = $strPt[0];
295 return wantarray ? ($pos, $count-$pos) : $pos;
296 }
297 shift @strWt;
298 shift @strPt;
299 }
300 }
301 return wantarray ? () : -1;
302}
303
304##
305## bool _eqArray(arrayref, arrayref, level)
306##
307sub _eqArray($$$)
308{
309 my $a = shift; # length $a >= length $b;
310 my $b = shift;
311 my $lev = shift;
312 for my $v (0..$lev-1){
313 for my $c (0..@$b-1){
314 return if $a->[$c][$v] != $b->[$c][$v];
315 }
45394607 316 }
d16e9e3d 317 return 1;
318}
319
320
321##
322## bool _ignorableAtLevel(CE, level)
323##
324sub _ignorableAtLevel($$)
325{
326 my $ce = shift;
327 return if ! defined $ce;
328 my $lv = shift;
329 ! grep { ! $ce->[$_] } 0..$lv-1;
330}
331
332
333##
334## string sortkey = getSortKey(string arg)
335##
336sub getSortKey
337{
338 my $self = shift;
339 my $lev = $self->{level};
340 my $rCE = $self->splitCE(shift); # get an arrayref
341
342 # weight arrays
343 my @buf = grep defined(), map $self->getWt($_), @$rCE;
45394607 344
345 # make sort key
346 my @ret = ([],[],[],[]);
347 foreach my $v (0..$lev-1){
348 foreach my $b (@buf){
349 push @{ $ret[$v] }, $b->[$v] if $b->[$v];
350 }
351 }
352 foreach (@{ $self->{backwards} }){
353 my $v = $_ - 1;
354 @{ $ret[$v] } = reverse @{ $ret[$v] };
355 }
356
357 # modification of tertiary weights
358 if($self->{upper_before_lower}){
359 foreach (@{ $ret[2] }){
360 if (0x8 <= $_ && $_ <= 0xC){ $_ -= 6 } # lower
361 elsif(0x2 <= $_ && $_ <= 0x6){ $_ += 6 } # upper
362 elsif($_ == 0x1C) { $_ += 1 } # square upper
363 elsif($_ == 0x1D) { $_ -= 1 } # square lower
364 }
365 }
366 if($self->{katakana_before_hiragana}){
367 foreach (@{ $ret[2] }){
368 if (0x0F <= $_ && $_ <= 0x13){ $_ -= 2 } # katakana
369 elsif(0x0D <= $_ && $_ <= 0x0E){ $_ += 5 } # hiragana
370 }
371 }
372 join "\0\0", map pack('n*', @$_), @ret;
373}
374
375
376##
d16e9e3d 377## int compare = cmp(string a, string b)
45394607 378##
5398038e 379sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
380sub eq { $_[0]->getSortKey($_[1]) eq $_[0]->getSortKey($_[2]) }
381sub ne { $_[0]->getSortKey($_[1]) ne $_[0]->getSortKey($_[2]) }
382sub lt { $_[0]->getSortKey($_[1]) lt $_[0]->getSortKey($_[2]) }
383sub le { $_[0]->getSortKey($_[1]) le $_[0]->getSortKey($_[2]) }
384sub gt { $_[0]->getSortKey($_[1]) gt $_[0]->getSortKey($_[2]) }
385sub ge { $_[0]->getSortKey($_[1]) ge $_[0]->getSortKey($_[2]) }
45394607 386
387##
d16e9e3d 388## list[strings] sorted = sort(list[strings] arg)
45394607 389##
390sub sort
391{
392 my $obj = shift;
393
394 map { $_->[1] }
395 sort{ $a->[0] cmp $b->[0] }
396 map [ $obj->getSortKey($_), $_ ], @_;
397}
398
399##
d16e9e3d 400## list[arrayrefs] CE = _derivCE(int codepoint)
45394607 401##
402sub _derivCE
403{
404 my $code = shift;
405 my $a = UNDEFINED + ($code >> 15); # ok
406 my $b = ($code & 0x7FFF) | 0x8000; # ok
407# my $a = 0xFFC2 + ($code >> 15); # ng
408# my $b = $code & 0x7FFF | 0x1000; # ng
409 $b ? ([$a,2,1,$code],[$b,0,0,$code]) : [$a,2,1,$code];
410}
411
412##
413## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
414##
5398038e 415sub _getHexArray { map hex(), $_[0] =~ /([0-9a-fA-F]+)/g }
45394607 416
417##
d16e9e3d 418## bool is_a_CJK_Unified_Ideograph = _isCJK(int codepoint)
45394607 419##
420sub _isCJK
421{
422 my $u = shift;
423 return 0x3400 <= $u && $u <= 0x4DB5
424 || 0x4E00 <= $u && $u <= 0x9FA5
425# || 0x20000 <= $u && $u <= 0x2A6D6;
426}
427
428##
d16e9e3d 429## list[arrayref] CE = _CJK(int codepoint_of_CJK)
45394607 430##
5398038e 431sub _CJK { $_[0] > 0xFFFF ? _derivCE($_[0]) : [$_[0], 0x20, 0x02, $_[0]] }
45394607 432
433##
d16e9e3d 434## bool is_a_Hangul_Syllable = _isHangul(int codepoint)
45394607 435##
5398038e 436sub _isHangul { return 0xAC00 <= $_[0] && $_[0] <= 0xD7A3 }
437
438sub _decompHangul {
439 my $code = shift;
440 # $code must be in Hangul syllable. check it before you enter here.
441 my $SIndex = $code - 0xAC00;
442 my $LIndex = int( $SIndex / 588);
443 my $VIndex = int(($SIndex % 588) / 28);
444 my $TIndex = $SIndex % 28;
445 return (
446 0x1100 + $LIndex,
447 0x1161 + $VIndex,
448 $TIndex ? (0x11A7 + $TIndex) : (),
449 );
45394607 450}
451
4521;
453__END__
454
455=head1 NAME
456
457Unicode::Collate - use UCA (Unicode Collation Algorithm)
458
459=head1 SYNOPSIS
460
461 use Unicode::Collate;
462
463 #construct
5398038e 464 $Collator = Unicode::Collate->new(%tailoring);
45394607 465
466 #sort
5398038e 467 @sorted = $Collator->sort(@not_sorted);
45394607 468
469 #compare
5398038e 470 $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
45394607 471
472=head1 DESCRIPTION
473
474=head2 Constructor and Tailoring
475
d16e9e3d 476The C<new> method returns a collator object.
477
5398038e 478 $Collator = Unicode::Collate->new(
45394607 479 alternate => $alternate,
480 backwards => $levelNumber, # or \@levelNumbers
481 entry => $element,
482 normalization => $normalization_form,
483 ignoreName => qr/$ignoreName/,
484 ignoreChar => qr/$ignoreChar/,
485 katakana_before_hiragana => $bool,
486 level => $collationLevel,
487 overrideCJK => \&overrideCJK,
488 overrideHangul => \&overrideHangul,
489 preprocess => \&preprocess,
490 rearrange => \@charList,
491 table => $filename,
492 undefName => qr/$undefName/,
493 undefChar => qr/$undefChar/,
494 upper_before_lower => $bool,
495 );
496 # if %tailoring is false (empty),
5398038e 497 # $Collator should do the default collation.
45394607 498
499=over 4
500
501=item alternate
502
503-- see 3.2.2 Alternate Weighting, UTR #10.
504
505 alternate => 'shifted', 'blanked', 'non-ignorable', or 'shift-trimmed'.
506
507By default (if specification is omitted), 'shifted' is adopted.
508
509=item backwards
510
511-- see 3.1.2 French Accents, UTR #10.
512
513 backwards => $levelNumber or \@levelNumbers
514
515Weights in reverse order; ex. level 2 (diacritic ordering) in French.
516If omitted, forwards at all the levels.
517
518=item entry
519
520-- see 3.1 Linguistic Features; 3.2.1 File Format, UTR #10.
521
522Overrides a default order or adds a new element
523
524 entry => <<'ENTRIES', # use the UCA file format
52500E6 ; [.0861.0020.0002.00E6] [.08B1.0020.0002.00E6] # ligature <ae> as <a e>
5260063 0068 ; [.0893.0020.0002.0063] # "ch" in traditional Spanish
5270043 0068 ; [.0893.0020.0008.0043] # "Ch" in traditional Spanish
528ENTRIES
529
530=item ignoreName
531
532=item ignoreChar
533
534-- see Completely Ignorable, 3.2.2 Alternate Weighting, UTR #10.
535
536Ignores the entry in the table.
537If an ignored collation element appears in the string to be collated,
538it is ignored as if the element had been deleted from there.
539
540E.g. when 'a' and 'e' are ignored,
541'element' is equal to 'lament' (or 'lmnt').
542
543=item level
544
545-- see 4.3 Form a sort key for each string, UTR #10.
546
547Set the maximum level.
548Any higher levels than the specified one are ignored.
549
550 Level 1: alphabetic ordering
551 Level 2: diacritic ordering
552 Level 3: case ordering
553 Level 4: tie-breaking (e.g. in the case when alternate is 'shifted')
554
555 ex.level => 2,
556
557=item normalization
558
559-- see 4.1 Normalize each input string, UTR #10.
560
561If specified, strings are normalized before preparation sort keys
562(the normalization is executed after preprocess).
563
564As a form name, one of the following names must be used.
565
566 'C' or 'NFC' for Normalization Form C
567 'D' or 'NFD' for Normalization Form D
568 'KC' or 'NFKC' for Normalization Form KC
569 'KD' or 'NFKD' for Normalization Form KD
570
571If omitted, the string is put into Normalization Form D.
572
573If undefined explicitly (as C<normalization =E<gt> undef>),
574any normalization is not carried out (this may make tailoring easier
575if any normalization is not desired).
576
577see B<CAVEAT>.
578
579=item overrideCJK
580
581=item overrideHangul
582
583-- see 7.1 Derived Collation Elements, UTR #10.
584
585By default, mapping of CJK Unified Ideographs
586uses the Unicode codepoint order
587and Hangul Syllables are decomposed into Hangul Jamo.
588
589The mapping of CJK Unified Ideographs
590or Hangul Syllables may be overrided.
591
592ex. CJK Unified Ideographs in the JIS codepoint order.
593
594 overrideCJK => sub {
595 my $u = shift; # get unicode codepoint
596 my $b = pack('n', $u); # to UTF-16BE
597 my $s = your_unicode_to_sjis_converter($b); # convert
598 my $n = unpack('n', $s); # convert sjis to short
599 [ $n, 1, 1 ]; # return collation element
600 },
601
602If you want to override the mapping of Hangul Syllables,
603the Normalization Forms D and KD are not appropriate
604(they will be decomposed before overriding).
605
606=item preprocess
607
608-- see 5.1 Preprocessing, UTR #10.
609
610If specified, the coderef is used to preprocess
611before the formation of sort keys.
612
613ex. dropping English articles, such as "a" or "the".
614Then, "the pen" is before "a pencil".
615
616 preprocess => sub {
617 my $str = shift;
618 $str =~ s/\b(?:an?|the)\s+//g;
619 $str;
620 },
621
622=item rearrange
623
624-- see 3.1.3 Rearrangement, UTR #10.
625
626Characters that are not coded in logical order and to be rearranged.
627By default,
628
629 rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
630
631=item table
632
633-- see 3.2 Default Unicode Collation Element Table, UTR #10.
634
635You can use another element table if desired.
636The table file must be in your C<lib/Unicode/Collate> directory.
637
638By default, the file C<lib/Unicode/Collate/allkeys.txt> is used.
639
640=item undefName
641
642=item undefChar
643
644-- see 6.3.4 Reducing the Repertoire, UTR #10.
645
646Undefines the collation element as if it were unassigned in the table.
647This reduces the size of the table.
648If an unassigned character appears in the string to be collated,
649the sort key is made from its codepoint
650as a single-character collation element,
651as it is greater than any other assigned collation elements
652(in the codepoint order among the unassigned characters).
653But, it'd be better to ignore characters
654unfamiliar to you and maybe never used.
655
656=item katakana_before_hiragana
657
658=item upper_before_lower
659
660-- see 6.6 Case Comparisons; 7.3.1 Tertiary Weight Table, UTR #10.
661
662By default, lowercase is before uppercase
663and hiragana is before katakana.
664
665If the parameter is true, this is reversed.
666
667=back
668
669=head2 Other methods
670
671=over 4
672
5398038e 673=item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
45394607 674
675Sorts a list of strings.
676
5398038e 677=item C<$result = $Collator-E<gt>cmp($a, $b)>
45394607 678
679Returns 1 (when C<$a> is greater than C<$b>)
680or 0 (when C<$a> is equal to C<$b>)
681or -1 (when C<$a> is lesser than C<$b>).
682
5398038e 683=item C<$result = $Collator-E<gt>eq($a, $b)>
684
685=item C<$result = $Collator-E<gt>ne($a, $b)>
686
687=item C<$result = $Collator-E<gt>lt($a, $b)>
688
689=item C<$result = $Collator-E<gt>le($a, $b)>
690
691=item C<$result = $Collator-E<gt>gt($a, $b)>
692
693=item C<$result = $Collator-E<gt>ge($a, $b)>
694
695They works like the same name operators as theirs.
696
697 eq : whether $a is equal to $b.
698 ne : whether $a is not equal to $b.
699 lt : whether $a is lesser than $b.
700 le : whether $a is lesser than $b or equal to $b.
701 gt : whether $a is greater than $b.
702 ge : whether $a is greater than $b or equal to $b.
703
704=item C<$sortKey = $Collator-E<gt>getSortKey($string)>
45394607 705
706-- see 4.3 Form a sort key for each string, UTR #10.
707
708Returns a sort key.
709
710You compare the sort keys using a binary comparison
711and get the result of the comparison of the strings using UCA.
712
5398038e 713 $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
45394607 714
715 is equivalent to
716
5398038e 717 $Collator->cmp($a, $b)
45394607 718
5398038e 719=item C<$position = $Collator-E<gt>index($string, $substring)>
d16e9e3d 720
5398038e 721=item C<($position, $length) = $Collator-E<gt>index($string, $substring)>
d16e9e3d 722
723-- see 6.8 Searching, UTR #10.
724
725If C<$substring> matches a part of C<$string>, returns
726the position of the first occurrence of the matching part in scalar context;
727in list context, returns a two-element list of
728the position and the length of the matching part.
729
730B<Notice> that the length of the matching part may differ from
731the length of C<$substring>.
732
733B<Note> that the position and the length are counted on the string
734after the process of preprocess, normalization, and rearrangement.
735Therefore, in case the specified string is not binary equal to
736the preprocessed/normalized/rearranged string, the position and the length
737may differ form those on the specified string. But it is guaranteed
738that, if matched, it returns a non-negative value as C<$position>.
739
740If C<$substring> does not match any part of C<$string>,
741returns C<-1> in scalar context and
742an empty list in list context.
743
744e.g. you say
745
5398038e 746 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
d16e9e3d 747 my $str = "Ich mu\x{00DF} studieren.";
748 my $sub = "m\x{00FC}ss";
749 my $match;
5398038e 750 if(my($pos,$len) = $Collator->index($str, $sub)){
751 $match = substr($str, $pos, $len);
d16e9e3d 752 }
753
754and get C<"mu\x{00DF}"> in C<$match> since C<"mu>E<223>C<">
755is primary equal to C<"m>E<252>C<ss">.
756
45394607 757=back
758
759=head2 EXPORT
760
761None by default.
762
763=head2 CAVEAT
764
765Use of the C<normalization> parameter requires
766the B<Unicode::Normalize> module.
767
5398038e 768If you need not it (say, in the case when you need not
45394607 769handle any combining characters),
770assign C<normalization =E<gt> undef> explicitly.
771
5398038e 772-- see 6.5 Avoiding Normalization, UTR #10.
773
45394607 774=head1 AUTHOR
775
776SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
777
778 http://homepage1.nifty.com/nomenclator/perl/
779
780 Copyright(C) 2001, SADAHIRO Tomoyuki. Japan. All rights reserved.
781
782 This program is free software; you can redistribute it and/or
783 modify it under the same terms as Perl itself.
784
785=head1 SEE ALSO
786
787=over 4
788
5398038e 789=item Unicode Collation Algorithm - Unicode TR #10
45394607 790
5398038e 791http://www.unicode.org/unicode/reports/tr10/
45394607 792
793=item L<Unicode::Normalize>
794
795normalized forms of Unicode text
796
45394607 797=back
798
799=cut