lib/Unicode/Collate.pm

   1 package Unicode::Collate;
   2
   3 use 5.006;
   4 use strict;
   5 use warnings;
   6 use Carp;
   7
   8 require Exporter;
   9
  10 our $VERSION = '0.10';
  11 our $PACKAGE = __PACKAGE__;
  12
  13 our @ISA = qw(Exporter);
  14
  15 our %EXPORT_TAGS = ();
  16 our @EXPORT_OK = ();
  17 our @EXPORT = ();
  18
  19 (our $Path = $INC{'Unicode/Collate.pm'}) =~ s/\.pm$//;
  20 our $KeyFile = "allkeys.txt";
  21
  22 our $getCombinClass; # coderef for combining class from Unicode::Normalize
  23
  24 use constant Min2      => 0x20;   # minimum weight at level 2
  25 use constant Min3      => 0x02;   # minimum weight at level 3
  26 use constant UNDEFINED => 0xFF80; # special value for undefined CE
  27
  28 ##
  29 ## constructor
  30 ##
  31 sub new
  32 {
  33   my $class = shift;
  34   my $self = bless { @_ }, $class;
  35
  36   # alternate
  37   $self->{alternate} =
  38      ! exists  $self->{alternate} ? 'shifted' :
  39      ! defined $self->{alternate} ? '' : $self->{alternate};
  40
  41   # collation level
  42   $self->{level} ||= ($self->{alternate} =~ /^shift/ ? 4 : 3);
  43
  44   # normalization form
  45   $self->{normalization} = 'D' if ! exists $self->{normalization};
  46
  47   if(defined $self->{normalization}){
  48     eval "use Unicode::Normalize;";
  49     croak "you'd install Unicode::Normalize for normalization forms: $@"
  50         if $@;
  51     $getCombinClass = \&Unicode::Normalize::getCombinClass
  52         if ! $getCombinClass;
  53   }
  54
  55   $self->{UNF} =
  56     ! defined $self->{normalization}        ? undef :
  57     $self->{normalization} =~ /^(?:NF)?C$/  ? \&NFC :
  58     $self->{normalization} =~ /^(?:NF)?D$/  ? \&NFD :
  59     $self->{normalization} =~ /^(?:NF)?KC$/ ? \&NFKC :
  60     $self->{normalization} =~ /^(?:NF)?KD$/ ? \&NFKD :
  61     croak "$PACKAGE unknown normalization form name: $self->{normalization}";
  62
  63   # backwards
  64   $self->{backwards} ||= [];
  65   $self->{backwards} = [ $self->{backwards} ] if ! ref $self->{backwards};
  66
  67   # rearrange
  68   $self->{rearrange} ||= []; # maybe not U+0000 (an ASCII)
  69   $self->{rearrange} = [ $self->{rearrange} ] if ! ref $self->{rearrange};
  70
  71   # open a table file.
  72   # if undef is passed explicitly, no file is read.
  73   $self->{table} = $KeyFile unless exists $self->{table};
  74   $self->read_table if defined $self->{table};
  75
  76   if($self->{entry}){
  77     $self->parseEntry($_) foreach split /\n/, $self->{entry};
  78   }
  79
  80   # keys of $self->{rearrangeHash} are $self->{rearrange}.
  81   $self->{rearrangeHash} = {};
  82   @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
  83
  84   return $self;
  85 }
  86
  87
  88 sub read_table {
  89   my $self = shift;
  90   my $file = $self->{table} ne '' ? $self->{table} : $KeyFile;
  91   open my $fk, "<$Path/$file" or croak "File does not exist at $Path/$file";
  92
  93   while(<$fk>){
  94     next if /^\s*#/;
  95     if(/^\s*\@/){
  96        if(/^\@version\s*(\S*)/){
  97          $self->{version} ||= $1;
  98        }
  99        elsif(/^\@alternate\s+(.*)/){
 100          $self->{alternate} ||= $1;
 101        }
 102        elsif(/^\@backwards\s+(.*)/){
 103          push @{ $self->{backwards} }, $1;
 104        }
 105        elsif(/^\@rearrange\s+(.*)/){
 106          push @{ $self->{rearrange} }, _getHexArray($1);
 107        }
 108        next;
 109     }
 110     $self->parseEntry($_);
 111   }
 112   close $fk;
 113 }
 114
 115
 116 ##
 117 ## get $line, parse it, and write an entry in $self
 118 ##
 119 sub parseEntry
 120 {
 121   my $self = shift;
 122   my $line = shift;
 123   my($name, $ele, @key);
 124
 125   return if $line !~ /^\s*[0-9A-Fa-f]/;
 126
 127   # get name
 128   $name = $1 if $line =~ s/#\s*(.*)//;
 129   return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
 130
 131   # get element
 132   my($e, $k) = split /;/, $line;
 133   my @e = _getHexArray($e);
 134   $ele = pack('U*', @e);
 135   return if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
 136
 137   # get sort key
 138   if(
 139      defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/ ||
 140      defined $self->{ignoreChar} && $ele  =~ /$self->{ignoreChar}/
 141   )
 142   {
 143      $self->{entries}{$ele} = $self->{ignored}{$ele} = 1;
 144   }
 145   else
 146   {
 147     foreach my $arr ($k =~ /\[(\S+)\]/g) {
 148       my $var = $arr =~ /\*/;
 149       push @key, $self->altCE( $var, _getHexArray($arr) );
 150     }
 151     $self->{entries}{$ele} = \@key;
 152   }
 153   $self->{maxlength}{ord $ele} = scalar @e if @e > 1;
 154 }
 155
 156
 157 ##
 158 ## arrayref CE = altCE(bool variable?, list[num] weights)
 159 ##
 160 sub altCE
 161 {
 162   my $self = shift;
 163   my $var  = shift;
 164   my @c    = @_;
 165
 166   $self->{alternate} eq 'blanked' ?
 167     $var ? [0,0,0] : [ @c[0..2] ] :
 168   $self->{alternate} eq 'non-ignorable' ?
 169     [ @c[0..2] ] :
 170   $self->{alternate} eq 'shifted' ?
 171     $var ? [0,0,0,$c[0] ] : [ @c[0..2], $c[0]+$c[1]+$c[2] ? 0xFFFF : 0 ] :
 172   $self->{alternate} eq 'shift-trimmed' ?
 173     $var ? [0,0,0,$c[0] ] : [ @c[0..2], 0 ] :
 174    \@c;
 175 }
 176
 177 ##
 178 ## string hex_sortkey = splitCE(string arg)
 179 ##
 180 sub viewSortKey
 181 {
 182   my $self = shift;
 183   my $key  = $self->getSortKey(@_);
 184   my $view = join " ", map sprintf("%04X", $_), unpack 'n*', $key;
 185   $view =~ s/ ?0000 ?/|/g;
 186   "[$view]";
 187 }
 188
 189
 190 ##
 191 ## list[strings] elements = splitCE(string arg)
 192 ##
 193 sub splitCE
 194 {
 195   my $self = shift;
 196   my $code = $self->{preprocess};
 197   my $norm = $self->{UNF};
 198   my $ent  = $self->{entries};
 199   my $max  = $self->{maxlength};
 200   my $rear = $self->{rearrangeHash};
 201
 202   my $str = ref $code ? &$code(shift) : shift;
 203   $str = &$norm($str) if ref $norm;
 204
 205   my(@src, @buf);
 206   @src = unpack('U*', $str);
 207
 208   # rearrangement
 209   for(my $i = 0; $i < @src; $i++)
 210   {
 211      ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i])
 212         if $rear->{ $src[$i] };
 213      $i++;
 214   }
 215
 216   for(my $i = 0; $i < @src; $i++)
 217   {
 218     my $ch;
 219     my $u  = $src[$i];
 220
 221   # non-characters
 222     next unless defined $u;
 223     next if $u < 0 || 0x10FFFF < $u     # out of range
 224          || 0xD800 < $u && $u < 0xDFFF; # unpaired surrogates
 225     my $four = $u & 0xFFFF;
 226     next if $four == 0xFFFE || $four == 0xFFFF;
 227
 228     if($max->{$u}) # contract
 229     {
 230       for(my $j = $max->{$u}; $j >= 1; $j--)
 231       {
 232         next unless $i+$j-1 < @src;
 233         $ch = pack 'U*', @src[$i .. $i+$j-1];
 234         $i += $j-1, last if $ent->{$ch};
 235       }
 236     }
 237     else {  $ch = pack('U', $u) }
 238
 239   # with Combining Char (UTS#10, 4.2.1), here requires Unicode::Normalize.
 240     if($getCombinClass && defined $ch)
 241     {
 242       for(my $j = $i+1; $j < @src; $j++)
 243       {
 244         next unless defined $src[$j];
 245         last unless $getCombinClass->( $src[$j] );
 246         my $comb = pack 'U', $src[$j];
 247         next if ! $ent->{ $ch.$comb };
 248         $ch .= $comb;
 249         $src[$j] = undef;
 250       }
 251     }
 252     push @buf, $ch;
 253   }
 254   wantarray ? @buf : \@buf;
 255 }
 256
 257
 258 ##
 259 ## list[arrayrefs] weight = getWt(string element)
 260 ##
 261 sub getWt
 262 {
 263   my $self = shift;
 264   my $ch   = shift;
 265   my $ent  = $self->{entries};
 266   my $ign  = $self->{ignored};
 267   my $cjk  = $self->{overrideCJK};
 268   my $hang = $self->{overrideHangul};
 269   return if !defined $ch || $ign->{$ch};   # ignored
 270   return @{ $ent->{$ch} } if $ent->{$ch};
 271   my $u = unpack('U', $ch);
 272   return
 273     _isHangul($u)
 274       ? $hang
 275         ? &$hang($u)
 276         : map(@{ $ent->{pack('U', $_)} }, _decompHangul($u))
 277       : _isCJK($u)
 278         ? $cjk ? &$cjk($u) : map($self->altCE(0,@$_), _CJK($u))
 279         : map($self->altCE(0,@$_), _derivCE($u));
 280 }
 281
 282 ##
 283 ## int = index(string, substring)
 284 ##
 285 sub index
 286 {
 287   my $self = shift;
 288   my $lev  = $self->{level};
 289   my $str  = $self->splitCE(shift);
 290   my $sub  = $self->splitCE(shift);
 291
 292   return wantarray ? (0,0) : 0 if ! @$sub;
 293   return wantarray ?  ()  : -1 if ! @$str;
 294
 295   my @subWt = grep _ignorableAtLevel($_,$lev),
 296               map $self->getWt($_), @$sub;
 297
 298   my(@strWt,@strPt);
 299   my $count = 0;
 300   for my $e (@$str){
 301     my @tmp = grep _ignorableAtLevel($_,$lev), $self->getWt($e);
 302     push @strWt, @tmp;
 303     push @strPt, ($count) x @tmp;
 304     $count += length $e;
 305     while(@strWt >= @subWt){
 306       if(_eqArray(\@strWt, \@subWt, $lev)){
 307         my $pos = $strPt[0];
 308         return wantarray ? ($pos, $count-$pos) : $pos;
 309       }
 310       shift @strWt;
 311       shift @strPt;
 312     }
 313   }
 314   return wantarray ? () : -1;
 315 }
 316
 317 ##
 318 ## bool _eqArray(arrayref, arrayref, level)
 319 ##
 320 sub _eqArray($$$)
 321 {
 322   my $a   = shift; # length $a >= length $b;
 323   my $b   = shift;
 324   my $lev = shift;
 325   for my $v (0..$lev-1){
 326     for my $c (0..@$b-1){
 327       return if $a->[$c][$v] != $b->[$c][$v];
 328     }
 329   }
 330   return 1;
 331 }
 332
 333
 334 ##
 335 ## bool _ignorableAtLevel(CE, level)
 336 ##
 337 sub _ignorableAtLevel($$)
 338 {
 339   my $ce = shift;
 340   return if ! defined $ce;
 341   my $lv = shift;
 342   ! grep { ! $ce->[$_] } 0..$lv-1;
 343 }
 344
 345
 346 ##
 347 ## string sortkey = getSortKey(string arg)
 348 ##
 349 sub getSortKey
 350 {
 351   my $self = shift;
 352   my $lev  = $self->{level};
 353   my $rCE  = $self->splitCE(shift); # get an arrayref
 354
 355   # weight arrays
 356   my @buf = grep defined(), map $self->getWt($_), @$rCE;
 357
 358   # make sort key
 359   my @ret = ([],[],[],[]);
 360   foreach my $v (0..$lev-1){
 361     foreach my $b (@buf){
 362       push @{ $ret[$v] }, $b->[$v] if $b->[$v];
 363     }
 364   }
 365   foreach (@{ $self->{backwards} }){
 366     my $v = $_ - 1;
 367     @{ $ret[$v] } = reverse @{ $ret[$v] };
 368   }
 369
 370   # modification of tertiary weights
 371   if($self->{upper_before_lower}){
 372     foreach (@{ $ret[2] }){
 373       if   (0x8 <= $_ && $_ <= 0xC){ $_ -= 6 } # lower
 374       elsif(0x2 <= $_ && $_ <= 0x6){ $_ += 6 } # upper
 375       elsif($_ == 0x1C)            { $_ += 1 } # square upper
 376       elsif($_ == 0x1D)            { $_ -= 1 } # square lower
 377     }
 378   }
 379   if($self->{katakana_before_hiragana}){
 380     foreach (@{ $ret[2] }){
 381       if   (0x0F <= $_ && $_ <= 0x13){ $_ -= 2 } # katakana
 382       elsif(0x0D <= $_ && $_ <= 0x0E){ $_ += 5 } # hiragana
 383     }
 384   }
 385   join "\0\0", map pack('n*', @$_), @ret;
 386 }
 387
 388
 389 ##
 390 ## int compare = cmp(string a, string b)
 391 ##
 392 sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
 393 sub eq  { $_[0]->getSortKey($_[1]) eq  $_[0]->getSortKey($_[2]) }
 394 sub ne  { $_[0]->getSortKey($_[1]) ne  $_[0]->getSortKey($_[2]) }
 395 sub lt  { $_[0]->getSortKey($_[1]) lt  $_[0]->getSortKey($_[2]) }
 396 sub le  { $_[0]->getSortKey($_[1]) le  $_[0]->getSortKey($_[2]) }
 397 sub gt  { $_[0]->getSortKey($_[1]) gt  $_[0]->getSortKey($_[2]) }
 398 sub ge  { $_[0]->getSortKey($_[1]) ge  $_[0]->getSortKey($_[2]) }
 399
 400 ##
 401 ## list[strings] sorted = sort(list[strings] arg)
 402 ##
 403 sub sort
 404 {
 405   my $obj = shift;
 406
 407   map { $_->[1] }
 408   sort{ $a->[0] cmp $b->[0] }
 409   map [ $obj->getSortKey($_), $_ ], @_;
 410 }
 411
 412 ##
 413 ## list[arrayrefs] CE = _derivCE(int codepoint)
 414 ##
 415 sub _derivCE
 416 {
 417   my $code = shift;
 418   my $a = UNDEFINED + ($code >> 15); # ok
 419   my $b = ($code & 0x7FFF) | 0x8000; # ok
 420 # my $a = 0xFFC2 + ($code >> 15);    # ng
 421 # my $b = $code & 0x7FFF | 0x1000;   # ng
 422   $b ? ([$a,2,1,$code],[$b,0,0,$code]) : [$a,2,1,$code];
 423 }
 424
 425 ##
 426 ## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
 427 ##
 428 sub _getHexArray { map hex(), $_[0] =~ /([0-9a-fA-F]+)/g }
 429
 430 ##
 431 ## bool is_a_CJK_Unified_Ideograph = _isCJK(int codepoint)
 432 ##
 433 sub _isCJK
 434 {
 435   my $u = shift;
 436   return 0x3400 <= $u && $u <= 0x4DB5
 437       || 0x4E00 <= $u && $u <= 0x9FA5
 438 #      || 0x20000 <= $u && $u <= 0x2A6D6;
 439 }
 440
 441 ##
 442 ## list[arrayref] CE = _CJK(int codepoint_of_CJK)
 443 ##
 444 sub _CJK { $_[0] > 0xFFFF ? _derivCE($_[0]) : [$_[0], 0x20, 0x02, $_[0]] }
 445
 446 ##
 447 ## bool is_a_Hangul_Syllable = _isHangul(int codepoint)
 448 ##
 449 sub _isHangul { return 0xAC00 <= $_[0] && $_[0] <= 0xD7A3 }
 450
 451 sub _decompHangul {
 452     my $code = shift;
 453   # $code must be in Hangul syllable. check it before you enter here.
 454     my $SIndex = $code - 0xAC00;
 455     my $LIndex = int( $SIndex / 588);
 456     my $VIndex = int(($SIndex % 588) / 28);
 457     my $TIndex =      $SIndex % 28;
 458     return (
 459        0x1100 + $LIndex,
 460        0x1161 + $VIndex,
 461       $TIndex ? (0x11A7 + $TIndex) : (),
 462     );
 463 }
 464
 465 1;
 466 __END__
 467
 468 =head1 NAME
 469
 470 Unicode::Collate - use UCA (Unicode Collation Algorithm)
 471
 472 =head1 SYNOPSIS
 473
 474   use Unicode::Collate;
 475
 476   #construct
 477   $Collator = Unicode::Collate->new(%tailoring);
 478
 479   #sort
 480   @sorted = $Collator->sort(@not_sorted);
 481
 482   #compare
 483   $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
 484
 485 =head1 DESCRIPTION
 486
 487 =head2 Constructor and Tailoring
 488
 489 The C<new> method returns a collator object.
 490
 491    $Collator = Unicode::Collate->new(
 492       alternate => $alternate,
 493       backwards => $levelNumber, # or \@levelNumbers
 494       entry => $element,
 495       normalization  => $normalization_form,
 496       ignoreName => qr/$ignoreName/,
 497       ignoreChar => qr/$ignoreChar/,
 498       katakana_before_hiragana => $bool,
 499       level => $collationLevel,
 500       overrideCJK => \&overrideCJK,
 501       overrideHangul => \&overrideHangul,
 502       preprocess => \&preprocess,
 503       rearrange => \@charList,
 504       table => $filename,
 505       undefName => qr/$undefName/,
 506       undefChar => qr/$undefChar/,
 507       upper_before_lower => $bool,
 508    );
 509    # if %tailoring is false (empty),
 510    # $Collator should do the default collation.
 511
 512 =over 4
 513
 514 =item alternate
 515
 516 -- see 3.2.2 Alternate Weighting, UTR #10.
 517
 518    alternate => 'shifted', 'blanked', 'non-ignorable', or 'shift-trimmed'.
 519
 520 By default (if specification is omitted), 'shifted' is adopted.
 521
 522 =item backwards
 523
 524 -- see 3.1.2 French Accents, UTR #10.
 525
 526      backwards => $levelNumber or \@levelNumbers
 527
 528 Weights in reverse order; ex. level 2 (diacritic ordering) in French.
 529 If omitted, forwards at all the levels.
 530
 531 =item entry
 532
 533 -- see 3.1 Linguistic Features; 3.2.1 File Format, UTR #10.
 534
 535 Overrides a default order or adds a new collation element
 536
 537   entry => <<'ENTRIES', # use the UCA file format
 538 00E6 ; [.0861.0020.0002.00E6] [.08B1.0020.0002.00E6] # ligature <ae> as <a e>
 539 0063 0068 ; [.0893.0020.0002.0063]      # "ch" in traditional Spanish
 540 0043 0068 ; [.0893.0020.0008.0043]      # "Ch" in traditional Spanish
 541 ENTRIES
 542
 543 =item ignoreName
 544
 545 =item ignoreChar
 546
 547 -- see Completely Ignorable, 3.2.2 Alternate Weighting, UTR #10.
 548
 549 Ignores the entry in the table.
 550 If an ignored collation element appears in the string to be collated,
 551 it is ignored as if the element had been deleted from there.
 552
 553 E.g. when 'a' and 'e' are ignored,
 554 'element' is equal to 'lament' (or 'lmnt').
 555
 556 =item level
 557
 558 -- see 4.3 Form a sort key for each string, UTR #10.
 559
 560 Set the maximum level.
 561 Any higher levels than the specified one are ignored.
 562
 563   Level 1: alphabetic ordering
 564   Level 2: diacritic ordering
 565   Level 3: case ordering
 566   Level 4: tie-breaking (e.g. in the case when alternate is 'shifted')
 567
 568   ex.level => 2,
 569
 570 =item normalization
 571
 572 -- see 4.1 Normalize each input string, UTR #10.
 573
 574 If specified, strings are normalized before preparation of sort keys
 575 (the normalization is executed after preprocess).
 576
 577 As a form name, one of the following names must be used.
 578
 579   'C'  or 'NFC'  for Normalization Form C
 580   'D'  or 'NFD'  for Normalization Form D
 581   'KC' or 'NFKC' for Normalization Form KC
 582   'KD' or 'NFKD' for Normalization Form KD
 583
 584 If omitted, the string is put into Normalization Form D.
 585
 586 If undefined explicitly (as C<normalization =E<gt> undef>),
 587 any normalization is not carried out (this may make tailoring easier
 588 if any normalization is not desired).
 589
 590 see B<CAVEAT>.
 591
 592 =item overrideCJK
 593
 594 =item overrideHangul
 595
 596 -- see 7.1 Derived Collation Elements, UTR #10.
 597
 598 By default, mapping of CJK Unified Ideographs
 599 uses the Unicode codepoint order
 600 and Hangul Syllables are decomposed into Hangul Jamo.
 601
 602 The mapping of CJK Unified Ideographs
 603 or Hangul Syllables may be overrided.
 604
 605 ex. CJK Unified Ideographs in the JIS codepoint order.
 606
 607   overrideCJK => sub {
 608     my $u = shift;               # get unicode codepoint
 609     my $b = pack('n', $u);       # to UTF-16BE
 610     my $s = your_unicode_to_sjis_converter($b); # convert
 611     my $n = unpack('n', $s);     # convert sjis to short
 612     [ $n, 1, 1 ];                # return collation element
 613   },
 614
 615 If you want to override the mapping of Hangul Syllables,
 616 the Normalization Forms D and KD are not appropriate
 617 (they will be decomposed before overriding).
 618
 619 =item preprocess
 620
 621 -- see 5.1 Preprocessing, UTR #10.
 622
 623 If specified, the coderef is used to preprocess
 624 before the formation of sort keys.
 625
 626 ex. dropping English articles, such as "a" or "the".
 627 Then, "the pen" is before "a pencil".
 628
 629      preprocess => sub {
 630            my $str = shift;
 631            $str =~ s/\b(?:an?|the)\s+//g;
 632            $str;
 633         },
 634
 635 =item rearrange
 636
 637 -- see 3.1.3 Rearrangement, UTR #10.
 638
 639 Characters that are not coded in logical order and to be rearranged.
 640 By default,
 641
 642     rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
 643
 644 =item table
 645
 646 -- see 3.2 Default Unicode Collation Element Table, UTR #10.
 647
 648 You can use another element table if desired.
 649 The table file must be in your C<lib/Unicode/Collate> directory.
 650
 651 By default, the file C<lib/Unicode/Collate/allkeys.txt> is used.
 652
 653 If undefined explicitly (as C<table =E<gt> undef>),
 654 no file is read (you'd define collation elements using L<entry>).
 655
 656 =item undefName
 657
 658 =item undefChar
 659
 660 -- see 6.3.4 Reducing the Repertoire, UTR #10.
 661
 662 Undefines the collation element as if it were unassigned in the table.
 663 This reduces the size of the table.
 664 If an unassigned character appears in the string to be collated,
 665 the sort key is made from its codepoint
 666 as a single-character collation element,
 667 as it is greater than any other assigned collation elements
 668 (in the codepoint order among the unassigned characters).
 669 But, it'd be better to ignore characters
 670 unfamiliar to you and maybe never used.
 671
 672 =item katakana_before_hiragana
 673
 674 =item upper_before_lower
 675
 676 -- see 6.6 Case Comparisons; 7.3.1 Tertiary Weight Table, UTR #10.
 677
 678 By default, lowercase is before uppercase
 679 and hiragana is before katakana.
 680
 681 If the parameter is true, this is reversed.
 682
 683 =back
 684
 685 =head2 Other methods
 686
 687 =over 4
 688
 689 =item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
 690
 691 Sorts a list of strings.
 692
 693 =item C<$result = $Collator-E<gt>cmp($a, $b)>
 694
 695 Returns 1 (when C<$a> is greater than C<$b>)
 696 or 0 (when C<$a> is equal to C<$b>)
 697 or -1 (when C<$a> is lesser than C<$b>).
 698
 699 =item C<$result = $Collator-E<gt>eq($a, $b)>
 700
 701 =item C<$result = $Collator-E<gt>ne($a, $b)>
 702
 703 =item C<$result = $Collator-E<gt>lt($a, $b)>
 704
 705 =item C<$result = $Collator-E<gt>le($a, $b)>
 706
 707 =item C<$result = $Collator-E<gt>gt($a, $b)>
 708
 709 =item C<$result = $Collator-E<gt>ge($a, $b)>
 710
 711 They works like the same name operators as theirs.
 712
 713    eq : whether $a is equal to $b.
 714    ne : whether $a is not equal to $b.
 715    lt : whether $a is lesser than $b.
 716    le : whether $a is lesser than $b or equal to $b.
 717    gt : whether $a is greater than $b.
 718    ge : whether $a is greater than $b or equal to $b.
 719
 720 =item C<$sortKey = $Collator-E<gt>getSortKey($string)>
 721
 722 -- see 4.3 Form a sort key for each string, UTR #10.
 723
 724 Returns a sort key.
 725
 726 You compare the sort keys using a binary comparison
 727 and get the result of the comparison of the strings using UCA.
 728
 729    $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
 730
 731       is equivalent to
 732
 733    $Collator->cmp($a, $b)
 734
 735 =item C<$position = $Collator-E<gt>index($string, $substring)>
 736
 737 =item C<($position, $length) = $Collator-E<gt>index($string, $substring)>
 738
 739 -- see 6.8 Searching, UTR #10.
 740
 741 If C<$substring> matches a part of C<$string>, returns
 742 the position of the first occurrence of the matching part in scalar context;
 743 in list context, returns a two-element list of
 744 the position and the length of the matching part.
 745
 746 B<Notice> that the length of the matching part may differ from
 747 the length of C<$substring>.
 748
 749 B<Note> that the position and the length are counted on the string
 750 after the process of preprocess, normalization, and rearrangement.
 751 Therefore, in case the specified string is not binary equal to
 752 the preprocessed/normalized/rearranged string, the position and the length
 753 may differ form those on the specified string. But it is guaranteed
 754 that, if matched, it returns a non-negative value as C<$position>.
 755
 756 If C<$substring> does not match any part of C<$string>,
 757 returns C<-1> in scalar context and
 758 an empty list in list context.
 759
 760 e.g. you say
 761
 762   my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
 763   my $str = "Ich mu\x{00DF} studieren.";
 764   my $sub = "m\x{00FC}ss";
 765   my $match;
 766   if(my($pos,$len) = $Collator->index($str, $sub)){
 767       $match = substr($str, $pos, $len);
 768   }
 769
 770 and get C<"mu\x{00DF}"> in C<$match> since C<"mu>E<223>C<">
 771 is primary equal to C<"m>E<252>C<ss">.
 772
 773 =back
 774
 775 =head2 EXPORT
 776
 777 None by default.
 778
 779 =head2 CAVEAT
 780
 781 Use of the C<normalization> parameter requires
 782 the B<Unicode::Normalize> module.
 783
 784 If you need not it (say, in the case when you need not
 785 handle any combining characters),
 786 assign C<normalization =E<gt> undef> explicitly.
 787
 788 -- see 6.5 Avoiding Normalization, UTR #10.
 789
 790 =head1 AUTHOR
 791
 792 SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
 793
 794   http://homepage1.nifty.com/nomenclator/perl/
 795
 796   Copyright(C) 2001, SADAHIRO Tomoyuki. Japan. All rights reserved.
 797
 798   This program is free software; you can redistribute it and/or
 799   modify it under the same terms as Perl itself.
 800
 801 =head1 SEE ALSO
 802
 803 =over 4
 804
 805 =item Unicode Collation Algorithm - Unicode TR #10
 806
 807 http://www.unicode.org/unicode/reports/tr10/
 808
 809 =item L<Unicode::Normalize>
 810
 811 normalized forms of Unicode text
 812
 813 =back
 814
 815 =cut