lib/Unicode/Collate.pm

   1 package Unicode::Collate;
   2
   3 use 5.006;
   4 use strict;
   5 use warnings;
   6 use Carp;
   7
   8 require Exporter;
   9
  10 our $VERSION = '0.09';
  11 our $PACKAGE = __PACKAGE__;
  12
  13 our @ISA = qw(Exporter);
  14
  15 our %EXPORT_TAGS = ();
  16 our @EXPORT_OK = ();
  17 our @EXPORT = ();
  18
  19 (our $Path = $INC{'Unicode/Collate.pm'}) =~ s/\.pm$//;
  20 our $KeyFile = "allkeys.txt";
  21
  22 our $getCombinClass; # coderef for combining class from Unicode::Normalize
  23
  24 use constant Min2      => 0x20;   # minimum weight at level 2
  25 use constant Min3      => 0x02;   # minimum weight at level 3
  26 use constant UNDEFINED => 0xFF80; # special value for undefined CE
  27
  28 ##
  29 ## constructor
  30 ##
  31 sub new
  32 {
  33   my $class = shift;
  34   my $self = bless { @_ }, $class;
  35
  36   # alternate
  37   $self->{alternate} =
  38      ! exists  $self->{alternate} ? 'shifted' :
  39      ! defined $self->{alternate} ? '' : $self->{alternate};
  40
  41   # collation level
  42   $self->{level} ||= ($self->{alternate} =~ /^shift/ ? 4 : 3);
  43
  44   # normalization form
  45   $self->{normalization} = 'D' if ! exists $self->{normalization};
  46
  47   if(defined $self->{normalization}){
  48     eval "use Unicode::Normalize;";
  49     croak "you'd install Unicode::Normalize for normalization forms: $@"
  50         if $@;
  51     $getCombinClass = \&Unicode::Normalize::getCombinClass
  52         if ! $getCombinClass;
  53   }
  54
  55   $self->{UNF} =
  56     ! defined $self->{normalization}        ? undef :
  57     $self->{normalization} =~ /^(?:NF)?C$/  ? \&NFC :
  58     $self->{normalization} =~ /^(?:NF)?D$/  ? \&NFD :
  59     $self->{normalization} =~ /^(?:NF)?KC$/ ? \&NFKC :
  60     $self->{normalization} =~ /^(?:NF)?KD$/ ? \&NFKD :
  61     croak "$PACKAGE unknown normalization form name: $self->{normalization}";
  62
  63   # backwards
  64   $self->{backwards} ||= [];
  65   $self->{backwards} = [ $self->{backwards} ] if ! ref $self->{backwards};
  66
  67   # rearrange
  68   $self->{rearrange} ||= []; # maybe not U+0000 (an ASCII)
  69   $self->{rearrange} = [ $self->{rearrange} ] if ! ref $self->{rearrange};
  70
  71   # open the table file
  72   my $file = defined $self->{table} ? $self->{table} : $KeyFile;
  73   open my $fk, "<$Path/$file" or croak "File does not exist at $Path/$file";
  74
  75   while(<$fk>){
  76     next if /^\s*#/;
  77     if(/^\s*\@/){
  78        if(/^\@version\s*(\S*)/){
  79          $self->{version} ||= $1;
  80        }
  81        elsif(/^\@alternate\s+(.*)/){
  82          $self->{alternate} ||= $1;
  83        }
  84        elsif(/^\@backwards\s+(.*)/){
  85          push @{ $self->{backwards} }, $1;
  86        }
  87        elsif(/^\@rearrange\s+(.*)/){
  88          push @{ $self->{rearrange} }, _getHexArray($1);
  89        }
  90        next;
  91     }
  92     $self->parseEntry($_);
  93   }
  94   close $fk;
  95   if($self->{entry}){
  96     $self->parseEntry($_) foreach split /\n/, $self->{entry};
  97   }
  98
  99   # keys of $self->{rearrangeHash} are $self->{rearrange}.
 100   $self->{rearrangeHash} = {};
 101   @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
 102
 103   return $self;
 104 }
 105
 106 ##
 107 ## get $line, parse it, and write an entry in $self
 108 ##
 109 sub parseEntry
 110 {
 111   my $self = shift;
 112   my $line = shift;
 113   my($name, $ele, @key);
 114
 115   return if $line !~ /^\s*[0-9A-Fa-f]/;
 116
 117   # get name
 118   $name = $1 if $line =~ s/#\s*(.*)//;
 119   return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
 120
 121   # get element
 122   my($e, $k) = split /;/, $line;
 123   my @e = _getHexArray($e);
 124   $ele = pack('U*', @e);
 125   return if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
 126
 127   # get sort key
 128   if(
 129      defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/ ||
 130      defined $self->{ignoreChar} && $ele  =~ /$self->{ignoreChar}/
 131   )
 132   {
 133      $self->{entries}{$ele} = $self->{ignored}{$ele} = 1;
 134   }
 135   else
 136   {
 137     foreach my $arr ($k =~ /\[(\S+)\]/g) {
 138       my $var = $arr =~ /\*/;
 139       push @key, $self->altCE( $var, _getHexArray($arr) );
 140     }
 141     $self->{entries}{$ele} = \@key;
 142   }
 143   $self->{maxlength}{ord $ele} = scalar @e if @e > 1;
 144 }
 145
 146
 147 ##
 148 ## arrayref CE = altCE(bool variable?, list[num] weights)
 149 ##
 150 sub altCE
 151 {
 152   my $self = shift;
 153   my $var  = shift;
 154   my @c    = @_;
 155
 156   $self->{alternate} eq 'blanked' ?
 157     $var ? [0,0,0] : [ @c[0..2] ] :
 158   $self->{alternate} eq 'non-ignorable' ?
 159     [ @c[0..2] ] :
 160   $self->{alternate} eq 'shifted' ?
 161     $var ? [0,0,0,$c[0] ] : [ @c[0..2], $c[0]+$c[1]+$c[2] ? 0xFFFF : 0 ] :
 162   $self->{alternate} eq 'shift-trimmed' ?
 163     $var ? [0,0,0,$c[0] ] : [ @c[0..2], 0 ] :
 164    \@c;
 165 }
 166
 167 ##
 168 ## string hex_sortkey = splitCE(string arg)
 169 ##
 170 sub viewSortKey
 171 {
 172   my $self = shift;
 173   my $key  = $self->getSortKey(@_);
 174   my $view = join " ", map sprintf("%04X", $_), unpack 'n*', $key;
 175   $view =~ s/ ?0000 ?/|/g;
 176   "[$view]";
 177 }
 178
 179
 180 ##
 181 ## list[strings] elements = splitCE(string arg)
 182 ##
 183 sub splitCE
 184 {
 185   my $self = shift;
 186   my $code = $self->{preprocess};
 187   my $norm = $self->{UNF};
 188   my $ent  = $self->{entries};
 189   my $max  = $self->{maxlength};
 190   my $rear = $self->{rearrangeHash};
 191
 192   my $str = ref $code ? &$code(shift) : shift;
 193   $str = &$norm($str) if ref $norm;
 194
 195   my(@src, @buf);
 196   @src = unpack('U*', $str);
 197
 198   # rearrangement
 199   for(my $i = 0; $i < @src; $i++)
 200   {
 201      ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i])
 202         if $rear->{ $src[$i] };
 203      $i++;
 204   }
 205
 206   for(my $i = 0; $i < @src; $i++)
 207   {
 208     my $ch;
 209     my $u  = $src[$i];
 210
 211   # non-characters
 212     next if $u < 0 || 0x10FFFF < $u     # out of range
 213          || 0xD800 < $u && $u < 0xDFFF; # unpaired surrogates
 214     my $four = $u & 0xFFFF;
 215     next if $four == 0xFFFE || $four == 0xFFFF;
 216
 217     if($max->{$u}) # contract
 218     {
 219       for(my $j = $max->{$u}; $j >= 1; $j--)
 220       {
 221         next unless $i+$j-1 < @src;
 222         $ch = pack 'U*', @src[$i .. $i+$j-1];
 223         $i += $j-1, last if $ent->{$ch};
 224       }
 225     }
 226     else {  $ch = pack('U', $u) }
 227
 228   # with Combining Char (UTS#10, 4.2.1), here requires Unicode::Normalize.
 229     if($getCombinClass && defined $ch)
 230     {
 231       for(my $j = $i+1; $j < @src && $getCombinClass->( $src[$j] ); $j++)
 232       {
 233         my $comb = pack 'U', $src[$j];
 234         next if ! $ent->{ $ch.$comb };
 235         $ch .= $comb;
 236         splice(@src, $j, 1);
 237       }
 238     }
 239     push @buf, $ch;
 240   }
 241   wantarray ? @buf : \@buf;
 242 }
 243
 244
 245 ##
 246 ## list[arrayrefs] weight = getWt(string element)
 247 ##
 248 sub getWt
 249 {
 250   my $self = shift;
 251   my $ch   = shift;
 252   my $ent  = $self->{entries};
 253   my $ign  = $self->{ignored};
 254   my $cjk  = $self->{overrideCJK};
 255   my $hang = $self->{overrideHangul};
 256   return if !defined $ch || $ign->{$ch};   # ignored
 257   return @{ $ent->{$ch} } if $ent->{$ch};
 258   my $u = unpack('U', $ch);
 259   return
 260     _isHangul($u)
 261       ? $hang
 262         ? &$hang($u)
 263         : map(@{ $ent->{pack('U', $_)} }, _decompHangul($u))
 264       : _isCJK($u)
 265         ? $cjk ? &$cjk($u) : map($self->altCE(0,@$_), _CJK($u))
 266         : map($self->altCE(0,@$_), _derivCE($u));
 267 }
 268
 269 ##
 270 ## int = index(string, substring)
 271 ##
 272 sub index
 273 {
 274   my $self = shift;
 275   my $lev  = $self->{level};
 276   my $str  = $self->splitCE(shift);
 277   my $sub  = $self->splitCE(shift);
 278
 279   return wantarray ? (0,0) : 0 if ! @$sub;
 280   return wantarray ?  ()  : -1 if ! @$str;
 281
 282   my @subWt = grep _ignorableAtLevel($_,$lev),
 283               map $self->getWt($_), @$sub;
 284
 285   my(@strWt,@strPt);
 286   my $count = 0;
 287   for my $e (@$str){
 288     my @tmp = grep _ignorableAtLevel($_,$lev), $self->getWt($e);
 289     push @strWt, @tmp;
 290     push @strPt, ($count) x @tmp;
 291     $count += length $e;
 292     while(@strWt >= @subWt){
 293       if(_eqArray(\@strWt, \@subWt, $lev)){
 294         my $pos = $strPt[0];
 295         return wantarray ? ($pos, $count-$pos) : $pos;
 296       }
 297       shift @strWt;
 298       shift @strPt;
 299     }
 300   }
 301   return wantarray ? () : -1;
 302 }
 303
 304 ##
 305 ## bool _eqArray(arrayref, arrayref, level)
 306 ##
 307 sub _eqArray($$$)
 308 {
 309   my $a   = shift; # length $a >= length $b;
 310   my $b   = shift;
 311   my $lev = shift;
 312   for my $v (0..$lev-1){
 313     for my $c (0..@$b-1){
 314       return if $a->[$c][$v] != $b->[$c][$v];
 315     }
 316   }
 317   return 1;
 318 }
 319
 320
 321 ##
 322 ## bool _ignorableAtLevel(CE, level)
 323 ##
 324 sub _ignorableAtLevel($$)
 325 {
 326   my $ce = shift;
 327   return if ! defined $ce;
 328   my $lv = shift;
 329   ! grep { ! $ce->[$_] } 0..$lv-1;
 330 }
 331
 332
 333 ##
 334 ## string sortkey = getSortKey(string arg)
 335 ##
 336 sub getSortKey
 337 {
 338   my $self = shift;
 339   my $lev  = $self->{level};
 340   my $rCE  = $self->splitCE(shift); # get an arrayref
 341
 342   # weight arrays
 343   my @buf = grep defined(), map $self->getWt($_), @$rCE;
 344
 345   # make sort key
 346   my @ret = ([],[],[],[]);
 347   foreach my $v (0..$lev-1){
 348     foreach my $b (@buf){
 349       push @{ $ret[$v] }, $b->[$v] if $b->[$v];
 350     }
 351   }
 352   foreach (@{ $self->{backwards} }){
 353     my $v = $_ - 1;
 354     @{ $ret[$v] } = reverse @{ $ret[$v] };
 355   }
 356
 357   # modification of tertiary weights
 358   if($self->{upper_before_lower}){
 359     foreach (@{ $ret[2] }){
 360       if   (0x8 <= $_ && $_ <= 0xC){ $_ -= 6 } # lower
 361       elsif(0x2 <= $_ && $_ <= 0x6){ $_ += 6 } # upper
 362       elsif($_ == 0x1C)            { $_ += 1 } # square upper
 363       elsif($_ == 0x1D)            { $_ -= 1 } # square lower
 364     }
 365   }
 366   if($self->{katakana_before_hiragana}){
 367     foreach (@{ $ret[2] }){
 368       if   (0x0F <= $_ && $_ <= 0x13){ $_ -= 2 } # katakana
 369       elsif(0x0D <= $_ && $_ <= 0x0E){ $_ += 5 } # hiragana
 370     }
 371   }
 372   join "\0\0", map pack('n*', @$_), @ret;
 373 }
 374
 375
 376 ##
 377 ## int compare = cmp(string a, string b)
 378 ##
 379 sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
 380 sub eq  { $_[0]->getSortKey($_[1]) eq  $_[0]->getSortKey($_[2]) }
 381 sub ne  { $_[0]->getSortKey($_[1]) ne  $_[0]->getSortKey($_[2]) }
 382 sub lt  { $_[0]->getSortKey($_[1]) lt  $_[0]->getSortKey($_[2]) }
 383 sub le  { $_[0]->getSortKey($_[1]) le  $_[0]->getSortKey($_[2]) }
 384 sub gt  { $_[0]->getSortKey($_[1]) gt  $_[0]->getSortKey($_[2]) }
 385 sub ge  { $_[0]->getSortKey($_[1]) ge  $_[0]->getSortKey($_[2]) }
 386
 387 ##
 388 ## list[strings] sorted = sort(list[strings] arg)
 389 ##
 390 sub sort
 391 {
 392   my $obj = shift;
 393
 394   map { $_->[1] }
 395   sort{ $a->[0] cmp $b->[0] }
 396   map [ $obj->getSortKey($_), $_ ], @_;
 397 }
 398
 399 ##
 400 ## list[arrayrefs] CE = _derivCE(int codepoint)
 401 ##
 402 sub _derivCE
 403 {
 404   my $code = shift;
 405   my $a = UNDEFINED + ($code >> 15); # ok
 406   my $b = ($code & 0x7FFF) | 0x8000; # ok
 407 # my $a = 0xFFC2 + ($code >> 15);    # ng
 408 # my $b = $code & 0x7FFF | 0x1000;   # ng
 409   $b ? ([$a,2,1,$code],[$b,0,0,$code]) : [$a,2,1,$code];
 410 }
 411
 412 ##
 413 ## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
 414 ##
 415 sub _getHexArray { map hex(), $_[0] =~ /([0-9a-fA-F]+)/g }
 416
 417 ##
 418 ## bool is_a_CJK_Unified_Ideograph = _isCJK(int codepoint)
 419 ##
 420 sub _isCJK
 421 {
 422   my $u = shift;
 423   return 0x3400 <= $u && $u <= 0x4DB5
 424       || 0x4E00 <= $u && $u <= 0x9FA5
 425 #      || 0x20000 <= $u && $u <= 0x2A6D6;
 426 }
 427
 428 ##
 429 ## list[arrayref] CE = _CJK(int codepoint_of_CJK)
 430 ##
 431 sub _CJK { $_[0] > 0xFFFF ? _derivCE($_[0]) : [$_[0], 0x20, 0x02, $_[0]] }
 432
 433 ##
 434 ## bool is_a_Hangul_Syllable = _isHangul(int codepoint)
 435 ##
 436 sub _isHangul { return 0xAC00 <= $_[0] && $_[0] <= 0xD7A3 }
 437
 438 sub _decompHangul {
 439     my $code = shift;
 440   # $code must be in Hangul syllable. check it before you enter here.
 441     my $SIndex = $code - 0xAC00;
 442     my $LIndex = int( $SIndex / 588);
 443     my $VIndex = int(($SIndex % 588) / 28);
 444     my $TIndex =      $SIndex % 28;
 445     return (
 446        0x1100 + $LIndex,
 447        0x1161 + $VIndex,
 448       $TIndex ? (0x11A7 + $TIndex) : (),
 449     );
 450 }
 451
 452 1;
 453 __END__
 454
 455 =head1 NAME
 456
 457 Unicode::Collate - use UCA (Unicode Collation Algorithm)
 458
 459 =head1 SYNOPSIS
 460
 461   use Unicode::Collate;
 462
 463   #construct
 464   $Collator = Unicode::Collate->new(%tailoring);
 465
 466   #sort
 467   @sorted = $Collator->sort(@not_sorted);
 468
 469   #compare
 470   $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
 471
 472 =head1 DESCRIPTION
 473
 474 =head2 Constructor and Tailoring
 475
 476 The C<new> method returns a collator object.
 477
 478    $Collator = Unicode::Collate->new(
 479       alternate => $alternate,
 480       backwards => $levelNumber, # or \@levelNumbers
 481       entry => $element,
 482       normalization  => $normalization_form,
 483       ignoreName => qr/$ignoreName/,
 484       ignoreChar => qr/$ignoreChar/,
 485       katakana_before_hiragana => $bool,
 486       level => $collationLevel,
 487       overrideCJK => \&overrideCJK,
 488       overrideHangul => \&overrideHangul,
 489       preprocess => \&preprocess,
 490       rearrange => \@charList,
 491       table => $filename,
 492       undefName => qr/$undefName/,
 493       undefChar => qr/$undefChar/,
 494       upper_before_lower => $bool,
 495    );
 496    # if %tailoring is false (empty),
 497    # $Collator should do the default collation.
 498
 499 =over 4
 500
 501 =item alternate
 502
 503 -- see 3.2.2 Alternate Weighting, UTR #10.
 504
 505    alternate => 'shifted', 'blanked', 'non-ignorable', or 'shift-trimmed'.
 506
 507 By default (if specification is omitted), 'shifted' is adopted.
 508
 509 =item backwards
 510
 511 -- see 3.1.2 French Accents, UTR #10.
 512
 513      backwards => $levelNumber or \@levelNumbers
 514
 515 Weights in reverse order; ex. level 2 (diacritic ordering) in French.
 516 If omitted, forwards at all the levels.
 517
 518 =item entry
 519
 520 -- see 3.1 Linguistic Features; 3.2.1 File Format, UTR #10.
 521
 522 Overrides a default order or adds a new element
 523
 524   entry => <<'ENTRIES', # use the UCA file format
 525 00E6 ; [.0861.0020.0002.00E6] [.08B1.0020.0002.00E6] # ligature <ae> as <a e>
 526 0063 0068 ; [.0893.0020.0002.0063]      # "ch" in traditional Spanish
 527 0043 0068 ; [.0893.0020.0008.0043]      # "Ch" in traditional Spanish
 528 ENTRIES
 529
 530 =item ignoreName
 531
 532 =item ignoreChar
 533
 534 -- see Completely Ignorable, 3.2.2 Alternate Weighting, UTR #10.
 535
 536 Ignores the entry in the table.
 537 If an ignored collation element appears in the string to be collated,
 538 it is ignored as if the element had been deleted from there.
 539
 540 E.g. when 'a' and 'e' are ignored,
 541 'element' is equal to 'lament' (or 'lmnt').
 542
 543 =item level
 544
 545 -- see 4.3 Form a sort key for each string, UTR #10.
 546
 547 Set the maximum level.
 548 Any higher levels than the specified one are ignored.
 549
 550   Level 1: alphabetic ordering
 551   Level 2: diacritic ordering
 552   Level 3: case ordering
 553   Level 4: tie-breaking (e.g. in the case when alternate is 'shifted')
 554
 555   ex.level => 2,
 556
 557 =item normalization
 558
 559 -- see 4.1 Normalize each input string, UTR #10.
 560
 561 If specified, strings are normalized before preparation sort keys
 562 (the normalization is executed after preprocess).
 563
 564 As a form name, one of the following names must be used.
 565
 566   'C'  or 'NFC'  for Normalization Form C
 567   'D'  or 'NFD'  for Normalization Form D
 568   'KC' or 'NFKC' for Normalization Form KC
 569   'KD' or 'NFKD' for Normalization Form KD
 570
 571 If omitted, the string is put into Normalization Form D.
 572
 573 If undefined explicitly (as C<normalization =E<gt> undef>),
 574 any normalization is not carried out (this may make tailoring easier
 575 if any normalization is not desired).
 576
 577 see B<CAVEAT>.
 578
 579 =item overrideCJK
 580
 581 =item overrideHangul
 582
 583 -- see 7.1 Derived Collation Elements, UTR #10.
 584
 585 By default, mapping of CJK Unified Ideographs
 586 uses the Unicode codepoint order
 587 and Hangul Syllables are decomposed into Hangul Jamo.
 588
 589 The mapping of CJK Unified Ideographs
 590 or Hangul Syllables may be overrided.
 591
 592 ex. CJK Unified Ideographs in the JIS codepoint order.
 593
 594   overrideCJK => sub {
 595     my $u = shift;               # get unicode codepoint
 596     my $b = pack('n', $u);       # to UTF-16BE
 597     my $s = your_unicode_to_sjis_converter($b); # convert
 598     my $n = unpack('n', $s);     # convert sjis to short
 599     [ $n, 1, 1 ];                # return collation element
 600   },
 601
 602 If you want to override the mapping of Hangul Syllables,
 603 the Normalization Forms D and KD are not appropriate
 604 (they will be decomposed before overriding).
 605
 606 =item preprocess
 607
 608 -- see 5.1 Preprocessing, UTR #10.
 609
 610 If specified, the coderef is used to preprocess
 611 before the formation of sort keys.
 612
 613 ex. dropping English articles, such as "a" or "the".
 614 Then, "the pen" is before "a pencil".
 615
 616      preprocess => sub {
 617            my $str = shift;
 618            $str =~ s/\b(?:an?|the)\s+//g;
 619            $str;
 620         },
 621
 622 =item rearrange
 623
 624 -- see 3.1.3 Rearrangement, UTR #10.
 625
 626 Characters that are not coded in logical order and to be rearranged.
 627 By default,
 628
 629     rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
 630
 631 =item table
 632
 633 -- see 3.2 Default Unicode Collation Element Table, UTR #10.
 634
 635 You can use another element table if desired.
 636 The table file must be in your C<lib/Unicode/Collate> directory.
 637
 638 By default, the file C<lib/Unicode/Collate/allkeys.txt> is used.
 639
 640 =item undefName
 641
 642 =item undefChar
 643
 644 -- see 6.3.4 Reducing the Repertoire, UTR #10.
 645
 646 Undefines the collation element as if it were unassigned in the table.
 647 This reduces the size of the table.
 648 If an unassigned character appears in the string to be collated,
 649 the sort key is made from its codepoint
 650 as a single-character collation element,
 651 as it is greater than any other assigned collation elements
 652 (in the codepoint order among the unassigned characters).
 653 But, it'd be better to ignore characters
 654 unfamiliar to you and maybe never used.
 655
 656 =item katakana_before_hiragana
 657
 658 =item upper_before_lower
 659
 660 -- see 6.6 Case Comparisons; 7.3.1 Tertiary Weight Table, UTR #10.
 661
 662 By default, lowercase is before uppercase
 663 and hiragana is before katakana.
 664
 665 If the parameter is true, this is reversed.
 666
 667 =back
 668
 669 =head2 Other methods
 670
 671 =over 4
 672
 673 =item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
 674
 675 Sorts a list of strings.
 676
 677 =item C<$result = $Collator-E<gt>cmp($a, $b)>
 678
 679 Returns 1 (when C<$a> is greater than C<$b>)
 680 or 0 (when C<$a> is equal to C<$b>)
 681 or -1 (when C<$a> is lesser than C<$b>).
 682
 683 =item C<$result = $Collator-E<gt>eq($a, $b)>
 684
 685 =item C<$result = $Collator-E<gt>ne($a, $b)>
 686
 687 =item C<$result = $Collator-E<gt>lt($a, $b)>
 688
 689 =item C<$result = $Collator-E<gt>le($a, $b)>
 690
 691 =item C<$result = $Collator-E<gt>gt($a, $b)>
 692
 693 =item C<$result = $Collator-E<gt>ge($a, $b)>
 694
 695 They works like the same name operators as theirs.
 696
 697    eq : whether $a is equal to $b.
 698    ne : whether $a is not equal to $b.
 699    lt : whether $a is lesser than $b.
 700    le : whether $a is lesser than $b or equal to $b.
 701    gt : whether $a is greater than $b.
 702    ge : whether $a is greater than $b or equal to $b.
 703
 704 =item C<$sortKey = $Collator-E<gt>getSortKey($string)>
 705
 706 -- see 4.3 Form a sort key for each string, UTR #10.
 707
 708 Returns a sort key.
 709
 710 You compare the sort keys using a binary comparison
 711 and get the result of the comparison of the strings using UCA.
 712
 713    $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
 714
 715       is equivalent to
 716
 717    $Collator->cmp($a, $b)
 718
 719 =item C<$position = $Collator-E<gt>index($string, $substring)>
 720
 721 =item C<($position, $length) = $Collator-E<gt>index($string, $substring)>
 722
 723 -- see 6.8 Searching, UTR #10.
 724
 725 If C<$substring> matches a part of C<$string>, returns
 726 the position of the first occurrence of the matching part in scalar context;
 727 in list context, returns a two-element list of
 728 the position and the length of the matching part.
 729
 730 B<Notice> that the length of the matching part may differ from
 731 the length of C<$substring>.
 732
 733 B<Note> that the position and the length are counted on the string
 734 after the process of preprocess, normalization, and rearrangement.
 735 Therefore, in case the specified string is not binary equal to
 736 the preprocessed/normalized/rearranged string, the position and the length
 737 may differ form those on the specified string. But it is guaranteed
 738 that, if matched, it returns a non-negative value as C<$position>.
 739
 740 If C<$substring> does not match any part of C<$string>,
 741 returns C<-1> in scalar context and
 742 an empty list in list context.
 743
 744 e.g. you say
 745
 746   my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
 747   my $str = "Ich mu\x{00DF} studieren.";
 748   my $sub = "m\x{00FC}ss";
 749   my $match;
 750   if(my($pos,$len) = $Collator->index($str, $sub)){
 751       $match = substr($str, $pos, $len);
 752   }
 753
 754 and get C<"mu\x{00DF}"> in C<$match> since C<"mu>E<223>C<">
 755 is primary equal to C<"m>E<252>C<ss">.
 756
 757 =back
 758
 759 =head2 EXPORT
 760
 761 None by default.
 762
 763 =head2 CAVEAT
 764
 765 Use of the C<normalization> parameter requires
 766 the B<Unicode::Normalize> module.
 767
 768 If you need not it (say, in the case when you need not
 769 handle any combining characters),
 770 assign C<normalization =E<gt> undef> explicitly.
 771
 772 -- see 6.5 Avoiding Normalization, UTR #10.
 773
 774 =head1 AUTHOR
 775
 776 SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
 777
 778   http://homepage1.nifty.com/nomenclator/perl/
 779
 780   Copyright(C) 2001, SADAHIRO Tomoyuki. Japan. All rights reserved.
 781
 782   This program is free software; you can redistribute it and/or
 783   modify it under the same terms as Perl itself.
 784
 785 =head1 SEE ALSO
 786
 787 =over 4
 788
 789 =item Unicode Collation Algorithm - Unicode TR #10
 790
 791 http://www.unicode.org/unicode/reports/tr10/
 792
 793 =item L<Unicode::Normalize>
 794
 795 normalized forms of Unicode text
 796
 797 =back
 798
 799 =cut