lib/Unicode/Collate.pm

   1 package Unicode::Collate;
   2
   3 BEGIN {
   4     if (ord("A") == 193) {
   5         die "Unicode::Collate not ported to EBCDIC\n";
   6     }
   7 }
   8
   9 use 5.006;
  10 use strict;
  11 use warnings;
  12 use Carp;
  13
  14 require Exporter;
  15
  16 our $VERSION = '0.10';
  17 our $PACKAGE = __PACKAGE__;
  18
  19 our @ISA = qw(Exporter);
  20
  21 our %EXPORT_TAGS = ();
  22 our @EXPORT_OK = ();
  23 our @EXPORT = ();
  24
  25 (our $Path = $INC{'Unicode/Collate.pm'}) =~ s/\.pm$//;
  26 our $KeyFile = "allkeys.txt";
  27
  28 our $getCombinClass; # coderef for combining class from Unicode::Normalize
  29
  30 use constant Min2      => 0x20;   # minimum weight at level 2
  31 use constant Min3      => 0x02;   # minimum weight at level 3
  32 use constant UNDEFINED => 0xFF80; # special value for undefined CE
  33
  34 ##
  35 ## constructor
  36 ##
  37 sub new
  38 {
  39   my $class = shift;
  40   my $self = bless { @_ }, $class;
  41
  42   # alternate
  43   $self->{alternate} =
  44      ! exists  $self->{alternate} ? 'shifted' :
  45      ! defined $self->{alternate} ? '' : $self->{alternate};
  46
  47   # collation level
  48   $self->{level} ||= ($self->{alternate} =~ /^shift/ ? 4 : 3);
  49
  50   # normalization form
  51   $self->{normalization} = 'D' if ! exists $self->{normalization};
  52
  53   if(defined $self->{normalization}){
  54     eval "use Unicode::Normalize;";
  55     croak "you'd install Unicode::Normalize for normalization forms: $@"
  56         if $@;
  57     $getCombinClass = \&Unicode::Normalize::getCombinClass
  58         if ! $getCombinClass;
  59   }
  60
  61   $self->{UNF} =
  62     ! defined $self->{normalization}        ? undef :
  63     $self->{normalization} =~ /^(?:NF)?C$/  ? \&NFC :
  64     $self->{normalization} =~ /^(?:NF)?D$/  ? \&NFD :
  65     $self->{normalization} =~ /^(?:NF)?KC$/ ? \&NFKC :
  66     $self->{normalization} =~ /^(?:NF)?KD$/ ? \&NFKD :
  67     croak "$PACKAGE unknown normalization form name: $self->{normalization}";
  68
  69   # backwards
  70   $self->{backwards} ||= [];
  71   $self->{backwards} = [ $self->{backwards} ] if ! ref $self->{backwards};
  72
  73   # rearrange
  74   $self->{rearrange} ||= []; # maybe not U+0000 (an ASCII)
  75   $self->{rearrange} = [ $self->{rearrange} ] if ! ref $self->{rearrange};
  76
  77   # open a table file.
  78   # if undef is passed explicitly, no file is read.
  79   $self->{table} = $KeyFile unless exists $self->{table};
  80   $self->read_table if defined $self->{table};
  81
  82   if($self->{entry}){
  83     $self->parseEntry($_) foreach split /\n/, $self->{entry};
  84   }
  85
  86   # keys of $self->{rearrangeHash} are $self->{rearrange}.
  87   $self->{rearrangeHash} = {};
  88   @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
  89
  90   return $self;
  91 }
  92
  93
  94 sub read_table {
  95   my $self = shift;
  96   my $file = $self->{table} ne '' ? $self->{table} : $KeyFile;
  97   open my $fk, "<$Path/$file" or croak "File does not exist at $Path/$file";
  98
  99   while(<$fk>){
 100     next if /^\s*#/;
 101     if(/^\s*\@/){
 102        if(/^\@version\s*(\S*)/){
 103          $self->{version} ||= $1;
 104        }
 105        elsif(/^\@alternate\s+(.*)/){
 106          $self->{alternate} ||= $1;
 107        }
 108        elsif(/^\@backwards\s+(.*)/){
 109          push @{ $self->{backwards} }, $1;
 110        }
 111        elsif(/^\@rearrange\s+(.*)/){
 112          push @{ $self->{rearrange} }, _getHexArray($1);
 113        }
 114        next;
 115     }
 116     $self->parseEntry($_);
 117   }
 118   close $fk;
 119 }
 120
 121
 122 ##
 123 ## get $line, parse it, and write an entry in $self
 124 ##
 125 sub parseEntry
 126 {
 127   my $self = shift;
 128   my $line = shift;
 129   my($name, $ele, @key);
 130
 131   return if $line !~ /^\s*[0-9A-Fa-f]/;
 132
 133   # get name
 134   $name = $1 if $line =~ s/#\s*(.*)//;
 135   return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
 136
 137   # get element
 138   my($e, $k) = split /;/, $line;
 139   my @e = _getHexArray($e);
 140   $ele = pack('U*', @e);
 141   return if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
 142
 143   # get sort key
 144   if(
 145      defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/ ||
 146      defined $self->{ignoreChar} && $ele  =~ /$self->{ignoreChar}/
 147   )
 148   {
 149      $self->{entries}{$ele} = $self->{ignored}{$ele} = 1;
 150   }
 151   else
 152   {
 153     foreach my $arr ($k =~ /\[(\S+)\]/g) {
 154       my $var = $arr =~ /\*/;
 155       push @key, $self->altCE( $var, _getHexArray($arr) );
 156     }
 157     $self->{entries}{$ele} = \@key;
 158   }
 159   $self->{maxlength}{ord $ele} = scalar @e if @e > 1;
 160 }
 161
 162
 163 ##
 164 ## arrayref CE = altCE(bool variable?, list[num] weights)
 165 ##
 166 sub altCE
 167 {
 168   my $self = shift;
 169   my $var  = shift;
 170   my @c    = @_;
 171
 172   $self->{alternate} eq 'blanked' ?
 173     $var ? [0,0,0] : [ @c[0..2] ] :
 174   $self->{alternate} eq 'non-ignorable' ?
 175     [ @c[0..2] ] :
 176   $self->{alternate} eq 'shifted' ?
 177     $var ? [0,0,0,$c[0] ] : [ @c[0..2], $c[0]+$c[1]+$c[2] ? 0xFFFF : 0 ] :
 178   $self->{alternate} eq 'shift-trimmed' ?
 179     $var ? [0,0,0,$c[0] ] : [ @c[0..2], 0 ] :
 180    \@c;
 181 }
 182
 183 ##
 184 ## string hex_sortkey = splitCE(string arg)
 185 ##
 186 sub viewSortKey
 187 {
 188   my $self = shift;
 189   my $key  = $self->getSortKey(@_);
 190   my $view = join " ", map sprintf("%04X", $_), unpack 'n*', $key;
 191   $view =~ s/ ?0000 ?/|/g;
 192   "[$view]";
 193 }
 194
 195
 196 ##
 197 ## list[strings] elements = splitCE(string arg)
 198 ##
 199 sub splitCE
 200 {
 201   my $self = shift;
 202   my $code = $self->{preprocess};
 203   my $norm = $self->{UNF};
 204   my $ent  = $self->{entries};
 205   my $max  = $self->{maxlength};
 206   my $rear = $self->{rearrangeHash};
 207
 208   my $str = ref $code ? &$code(shift) : shift;
 209   $str = &$norm($str) if ref $norm;
 210
 211   my(@src, @buf);
 212   @src = unpack('U*', $str);
 213
 214   # rearrangement
 215   for(my $i = 0; $i < @src; $i++)
 216   {
 217      ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i])
 218         if $rear->{ $src[$i] };
 219      $i++;
 220   }
 221
 222   for(my $i = 0; $i < @src; $i++)
 223   {
 224     my $ch;
 225     my $u  = $src[$i];
 226
 227   # non-characters
 228     next unless defined $u;
 229     next if $u < 0 || 0x10FFFF < $u     # out of range
 230          || 0xD800 < $u && $u < 0xDFFF; # unpaired surrogates
 231     my $four = $u & 0xFFFF;
 232     next if $four == 0xFFFE || $four == 0xFFFF;
 233
 234     if($max->{$u}) # contract
 235     {
 236       for(my $j = $max->{$u}; $j >= 1; $j--)
 237       {
 238         next unless $i+$j-1 < @src;
 239         $ch = pack 'U*', @src[$i .. $i+$j-1];
 240         $i += $j-1, last if $ent->{$ch};
 241       }
 242     }
 243     else {  $ch = pack('U', $u) }
 244
 245   # with Combining Char (UTS#10, 4.2.1), here requires Unicode::Normalize.
 246     if($getCombinClass && defined $ch)
 247     {
 248       for(my $j = $i+1; $j < @src; $j++)
 249       {
 250         next unless defined $src[$j];
 251         last unless $getCombinClass->( $src[$j] );
 252         my $comb = pack 'U', $src[$j];
 253         next if ! $ent->{ $ch.$comb };
 254         $ch .= $comb;
 255         $src[$j] = undef;
 256       }
 257     }
 258     push @buf, $ch;
 259   }
 260   wantarray ? @buf : \@buf;
 261 }
 262
 263
 264 ##
 265 ## list[arrayrefs] weight = getWt(string element)
 266 ##
 267 sub getWt
 268 {
 269   my $self = shift;
 270   my $ch   = shift;
 271   my $ent  = $self->{entries};
 272   my $ign  = $self->{ignored};
 273   my $cjk  = $self->{overrideCJK};
 274   my $hang = $self->{overrideHangul};
 275   return if !defined $ch || $ign->{$ch};   # ignored
 276   return @{ $ent->{$ch} } if $ent->{$ch};
 277   my $u = unpack('U', $ch);
 278   return
 279     _isHangul($u)
 280       ? $hang
 281         ? &$hang($u)
 282         : map(@{ $ent->{pack('U', $_)} }, _decompHangul($u))
 283       : _isCJK($u)
 284         ? $cjk ? &$cjk($u) : map($self->altCE(0,@$_), _CJK($u))
 285         : map($self->altCE(0,@$_), _derivCE($u));
 286 }
 287
 288 ##
 289 ## int = index(string, substring)
 290 ##
 291 sub index
 292 {
 293   my $self = shift;
 294   my $lev  = $self->{level};
 295   my $str  = $self->splitCE(shift);
 296   my $sub  = $self->splitCE(shift);
 297
 298   return wantarray ? (0,0) : 0 if ! @$sub;
 299   return wantarray ?  ()  : -1 if ! @$str;
 300
 301   my @subWt = grep _ignorableAtLevel($_,$lev),
 302               map $self->getWt($_), @$sub;
 303
 304   my(@strWt,@strPt);
 305   my $count = 0;
 306   for my $e (@$str){
 307     my @tmp = grep _ignorableAtLevel($_,$lev), $self->getWt($e);
 308     push @strWt, @tmp;
 309     push @strPt, ($count) x @tmp;
 310     $count += length $e;
 311     while(@strWt >= @subWt){
 312       if(_eqArray(\@strWt, \@subWt, $lev)){
 313         my $pos = $strPt[0];
 314         return wantarray ? ($pos, $count-$pos) : $pos;
 315       }
 316       shift @strWt;
 317       shift @strPt;
 318     }
 319   }
 320   return wantarray ? () : -1;
 321 }
 322
 323 ##
 324 ## bool _eqArray(arrayref, arrayref, level)
 325 ##
 326 sub _eqArray($$$)
 327 {
 328   my $a   = shift; # length $a >= length $b;
 329   my $b   = shift;
 330   my $lev = shift;
 331   for my $v (0..$lev-1){
 332     for my $c (0..@$b-1){
 333       return if $a->[$c][$v] != $b->[$c][$v];
 334     }
 335   }
 336   return 1;
 337 }
 338
 339
 340 ##
 341 ## bool _ignorableAtLevel(CE, level)
 342 ##
 343 sub _ignorableAtLevel($$)
 344 {
 345   my $ce = shift;
 346   return if ! defined $ce;
 347   my $lv = shift;
 348   ! grep { ! $ce->[$_] } 0..$lv-1;
 349 }
 350
 351
 352 ##
 353 ## string sortkey = getSortKey(string arg)
 354 ##
 355 sub getSortKey
 356 {
 357   my $self = shift;
 358   my $lev  = $self->{level};
 359   my $rCE  = $self->splitCE(shift); # get an arrayref
 360
 361   # weight arrays
 362   my @buf = grep defined(), map $self->getWt($_), @$rCE;
 363
 364   # make sort key
 365   my @ret = ([],[],[],[]);
 366   foreach my $v (0..$lev-1){
 367     foreach my $b (@buf){
 368       push @{ $ret[$v] }, $b->[$v] if $b->[$v];
 369     }
 370   }
 371   foreach (@{ $self->{backwards} }){
 372     my $v = $_ - 1;
 373     @{ $ret[$v] } = reverse @{ $ret[$v] };
 374   }
 375
 376   # modification of tertiary weights
 377   if($self->{upper_before_lower}){
 378     foreach (@{ $ret[2] }){
 379       if   (0x8 <= $_ && $_ <= 0xC){ $_ -= 6 } # lower
 380       elsif(0x2 <= $_ && $_ <= 0x6){ $_ += 6 } # upper
 381       elsif($_ == 0x1C)            { $_ += 1 } # square upper
 382       elsif($_ == 0x1D)            { $_ -= 1 } # square lower
 383     }
 384   }
 385   if($self->{katakana_before_hiragana}){
 386     foreach (@{ $ret[2] }){
 387       if   (0x0F <= $_ && $_ <= 0x13){ $_ -= 2 } # katakana
 388       elsif(0x0D <= $_ && $_ <= 0x0E){ $_ += 5 } # hiragana
 389     }
 390   }
 391   join "\0\0", map pack('n*', @$_), @ret;
 392 }
 393
 394
 395 ##
 396 ## int compare = cmp(string a, string b)
 397 ##
 398 sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
 399 sub eq  { $_[0]->getSortKey($_[1]) eq  $_[0]->getSortKey($_[2]) }
 400 sub ne  { $_[0]->getSortKey($_[1]) ne  $_[0]->getSortKey($_[2]) }
 401 sub lt  { $_[0]->getSortKey($_[1]) lt  $_[0]->getSortKey($_[2]) }
 402 sub le  { $_[0]->getSortKey($_[1]) le  $_[0]->getSortKey($_[2]) }
 403 sub gt  { $_[0]->getSortKey($_[1]) gt  $_[0]->getSortKey($_[2]) }
 404 sub ge  { $_[0]->getSortKey($_[1]) ge  $_[0]->getSortKey($_[2]) }
 405
 406 ##
 407 ## list[strings] sorted = sort(list[strings] arg)
 408 ##
 409 sub sort
 410 {
 411   my $obj = shift;
 412
 413   map { $_->[1] }
 414   sort{ $a->[0] cmp $b->[0] }
 415   map [ $obj->getSortKey($_), $_ ], @_;
 416 }
 417
 418 ##
 419 ## list[arrayrefs] CE = _derivCE(int codepoint)
 420 ##
 421 sub _derivCE
 422 {
 423   my $code = shift;
 424   my $a = UNDEFINED + ($code >> 15); # ok
 425   my $b = ($code & 0x7FFF) | 0x8000; # ok
 426 # my $a = 0xFFC2 + ($code >> 15);    # ng
 427 # my $b = $code & 0x7FFF | 0x1000;   # ng
 428   $b ? ([$a,2,1,$code],[$b,0,0,$code]) : [$a,2,1,$code];
 429 }
 430
 431 ##
 432 ## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
 433 ##
 434 sub _getHexArray { map hex(), $_[0] =~ /([0-9a-fA-F]+)/g }
 435
 436 ##
 437 ## bool is_a_CJK_Unified_Ideograph = _isCJK(int codepoint)
 438 ##
 439 sub _isCJK
 440 {
 441   my $u = shift;
 442   return 0x3400 <= $u && $u <= 0x4DB5
 443       || 0x4E00 <= $u && $u <= 0x9FA5
 444 #      || 0x20000 <= $u && $u <= 0x2A6D6;
 445 }
 446
 447 ##
 448 ## list[arrayref] CE = _CJK(int codepoint_of_CJK)
 449 ##
 450 sub _CJK { $_[0] > 0xFFFF ? _derivCE($_[0]) : [$_[0], 0x20, 0x02, $_[0]] }
 451
 452 ##
 453 ## bool is_a_Hangul_Syllable = _isHangul(int codepoint)
 454 ##
 455 sub _isHangul { return 0xAC00 <= $_[0] && $_[0] <= 0xD7A3 }
 456
 457 sub _decompHangul {
 458     my $code = shift;
 459   # $code must be in Hangul syllable. check it before you enter here.
 460     my $SIndex = $code - 0xAC00;
 461     my $LIndex = int( $SIndex / 588);
 462     my $VIndex = int(($SIndex % 588) / 28);
 463     my $TIndex =      $SIndex % 28;
 464     return (
 465        0x1100 + $LIndex,
 466        0x1161 + $VIndex,
 467       $TIndex ? (0x11A7 + $TIndex) : (),
 468     );
 469 }
 470
 471 1;
 472 __END__
 473
 474 =head1 NAME
 475
 476 Unicode::Collate - use UCA (Unicode Collation Algorithm)
 477
 478 =head1 SYNOPSIS
 479
 480   use Unicode::Collate;
 481
 482   #construct
 483   $Collator = Unicode::Collate->new(%tailoring);
 484
 485   #sort
 486   @sorted = $Collator->sort(@not_sorted);
 487
 488   #compare
 489   $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
 490
 491 =head1 DESCRIPTION
 492
 493 =head2 Constructor and Tailoring
 494
 495 The C<new> method returns a collator object.
 496
 497    $Collator = Unicode::Collate->new(
 498       alternate => $alternate,
 499       backwards => $levelNumber, # or \@levelNumbers
 500       entry => $element,
 501       normalization  => $normalization_form,
 502       ignoreName => qr/$ignoreName/,
 503       ignoreChar => qr/$ignoreChar/,
 504       katakana_before_hiragana => $bool,
 505       level => $collationLevel,
 506       overrideCJK => \&overrideCJK,
 507       overrideHangul => \&overrideHangul,
 508       preprocess => \&preprocess,
 509       rearrange => \@charList,
 510       table => $filename,
 511       undefName => qr/$undefName/,
 512       undefChar => qr/$undefChar/,
 513       upper_before_lower => $bool,
 514    );
 515    # if %tailoring is false (empty),
 516    # $Collator should do the default collation.
 517
 518 =over 4
 519
 520 =item alternate
 521
 522 -- see 3.2.2 Alternate Weighting, UTR #10.
 523
 524    alternate => 'shifted', 'blanked', 'non-ignorable', or 'shift-trimmed'.
 525
 526 By default (if specification is omitted), 'shifted' is adopted.
 527
 528 =item backwards
 529
 530 -- see 3.1.2 French Accents, UTR #10.
 531
 532      backwards => $levelNumber or \@levelNumbers
 533
 534 Weights in reverse order; ex. level 2 (diacritic ordering) in French.
 535 If omitted, forwards at all the levels.
 536
 537 =item entry
 538
 539 -- see 3.1 Linguistic Features; 3.2.1 File Format, UTR #10.
 540
 541 Overrides a default order or adds a new collation element
 542
 543   entry => <<'ENTRIES', # use the UCA file format
 544 00E6 ; [.0861.0020.0002.00E6] [.08B1.0020.0002.00E6] # ligature <ae> as <a e>
 545 0063 0068 ; [.0893.0020.0002.0063]      # "ch" in traditional Spanish
 546 0043 0068 ; [.0893.0020.0008.0043]      # "Ch" in traditional Spanish
 547 ENTRIES
 548
 549 =item ignoreName
 550
 551 =item ignoreChar
 552
 553 -- see Completely Ignorable, 3.2.2 Alternate Weighting, UTR #10.
 554
 555 Ignores the entry in the table.
 556 If an ignored collation element appears in the string to be collated,
 557 it is ignored as if the element had been deleted from there.
 558
 559 E.g. when 'a' and 'e' are ignored,
 560 'element' is equal to 'lament' (or 'lmnt').
 561
 562 =item level
 563
 564 -- see 4.3 Form a sort key for each string, UTR #10.
 565
 566 Set the maximum level.
 567 Any higher levels than the specified one are ignored.
 568
 569   Level 1: alphabetic ordering
 570   Level 2: diacritic ordering
 571   Level 3: case ordering
 572   Level 4: tie-breaking (e.g. in the case when alternate is 'shifted')
 573
 574   ex.level => 2,
 575
 576 =item normalization
 577
 578 -- see 4.1 Normalize each input string, UTR #10.
 579
 580 If specified, strings are normalized before preparation of sort keys
 581 (the normalization is executed after preprocess).
 582
 583 As a form name, one of the following names must be used.
 584
 585   'C'  or 'NFC'  for Normalization Form C
 586   'D'  or 'NFD'  for Normalization Form D
 587   'KC' or 'NFKC' for Normalization Form KC
 588   'KD' or 'NFKD' for Normalization Form KD
 589
 590 If omitted, the string is put into Normalization Form D.
 591
 592 If undefined explicitly (as C<normalization =E<gt> undef>),
 593 any normalization is not carried out (this may make tailoring easier
 594 if any normalization is not desired).
 595
 596 see B<CAVEAT>.
 597
 598 =item overrideCJK
 599
 600 =item overrideHangul
 601
 602 -- see 7.1 Derived Collation Elements, UTR #10.
 603
 604 By default, mapping of CJK Unified Ideographs
 605 uses the Unicode codepoint order
 606 and Hangul Syllables are decomposed into Hangul Jamo.
 607
 608 The mapping of CJK Unified Ideographs
 609 or Hangul Syllables may be overrided.
 610
 611 ex. CJK Unified Ideographs in the JIS codepoint order.
 612
 613   overrideCJK => sub {
 614     my $u = shift;               # get unicode codepoint
 615     my $b = pack('n', $u);       # to UTF-16BE
 616     my $s = your_unicode_to_sjis_converter($b); # convert
 617     my $n = unpack('n', $s);     # convert sjis to short
 618     [ $n, 1, 1 ];                # return collation element
 619   },
 620
 621 If you want to override the mapping of Hangul Syllables,
 622 the Normalization Forms D and KD are not appropriate
 623 (they will be decomposed before overriding).
 624
 625 =item preprocess
 626
 627 -- see 5.1 Preprocessing, UTR #10.
 628
 629 If specified, the coderef is used to preprocess
 630 before the formation of sort keys.
 631
 632 ex. dropping English articles, such as "a" or "the".
 633 Then, "the pen" is before "a pencil".
 634
 635      preprocess => sub {
 636            my $str = shift;
 637            $str =~ s/\b(?:an?|the)\s+//g;
 638            $str;
 639         },
 640
 641 =item rearrange
 642
 643 -- see 3.1.3 Rearrangement, UTR #10.
 644
 645 Characters that are not coded in logical order and to be rearranged.
 646 By default,
 647
 648     rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
 649
 650 =item table
 651
 652 -- see 3.2 Default Unicode Collation Element Table, UTR #10.
 653
 654 You can use another element table if desired.
 655 The table file must be in your C<lib/Unicode/Collate> directory.
 656
 657 By default, the file C<lib/Unicode/Collate/allkeys.txt> is used.
 658
 659 If undefined explicitly (as C<table =E<gt> undef>),
 660 no file is read (you'd define collation elements using L<entry>).
 661
 662 =item undefName
 663
 664 =item undefChar
 665
 666 -- see 6.3.4 Reducing the Repertoire, UTR #10.
 667
 668 Undefines the collation element as if it were unassigned in the table.
 669 This reduces the size of the table.
 670 If an unassigned character appears in the string to be collated,
 671 the sort key is made from its codepoint
 672 as a single-character collation element,
 673 as it is greater than any other assigned collation elements
 674 (in the codepoint order among the unassigned characters).
 675 But, it'd be better to ignore characters
 676 unfamiliar to you and maybe never used.
 677
 678 =item katakana_before_hiragana
 679
 680 =item upper_before_lower
 681
 682 -- see 6.6 Case Comparisons; 7.3.1 Tertiary Weight Table, UTR #10.
 683
 684 By default, lowercase is before uppercase
 685 and hiragana is before katakana.
 686
 687 If the parameter is true, this is reversed.
 688
 689 =back
 690
 691 =head2 Other methods
 692
 693 =over 4
 694
 695 =item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
 696
 697 Sorts a list of strings.
 698
 699 =item C<$result = $Collator-E<gt>cmp($a, $b)>
 700
 701 Returns 1 (when C<$a> is greater than C<$b>)
 702 or 0 (when C<$a> is equal to C<$b>)
 703 or -1 (when C<$a> is lesser than C<$b>).
 704
 705 =item C<$result = $Collator-E<gt>eq($a, $b)>
 706
 707 =item C<$result = $Collator-E<gt>ne($a, $b)>
 708
 709 =item C<$result = $Collator-E<gt>lt($a, $b)>
 710
 711 =item C<$result = $Collator-E<gt>le($a, $b)>
 712
 713 =item C<$result = $Collator-E<gt>gt($a, $b)>
 714
 715 =item C<$result = $Collator-E<gt>ge($a, $b)>
 716
 717 They works like the same name operators as theirs.
 718
 719    eq : whether $a is equal to $b.
 720    ne : whether $a is not equal to $b.
 721    lt : whether $a is lesser than $b.
 722    le : whether $a is lesser than $b or equal to $b.
 723    gt : whether $a is greater than $b.
 724    ge : whether $a is greater than $b or equal to $b.
 725
 726 =item C<$sortKey = $Collator-E<gt>getSortKey($string)>
 727
 728 -- see 4.3 Form a sort key for each string, UTR #10.
 729
 730 Returns a sort key.
 731
 732 You compare the sort keys using a binary comparison
 733 and get the result of the comparison of the strings using UCA.
 734
 735    $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
 736
 737       is equivalent to
 738
 739    $Collator->cmp($a, $b)
 740
 741 =item C<$position = $Collator-E<gt>index($string, $substring)>
 742
 743 =item C<($position, $length) = $Collator-E<gt>index($string, $substring)>
 744
 745 -- see 6.8 Searching, UTR #10.
 746
 747 If C<$substring> matches a part of C<$string>, returns
 748 the position of the first occurrence of the matching part in scalar context;
 749 in list context, returns a two-element list of
 750 the position and the length of the matching part.
 751
 752 B<Notice> that the length of the matching part may differ from
 753 the length of C<$substring>.
 754
 755 B<Note> that the position and the length are counted on the string
 756 after the process of preprocess, normalization, and rearrangement.
 757 Therefore, in case the specified string is not binary equal to
 758 the preprocessed/normalized/rearranged string, the position and the length
 759 may differ form those on the specified string. But it is guaranteed
 760 that, if matched, it returns a non-negative value as C<$position>.
 761
 762 If C<$substring> does not match any part of C<$string>,
 763 returns C<-1> in scalar context and
 764 an empty list in list context.
 765
 766 e.g. you say
 767
 768   my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
 769   my $str = "Ich mu\x{00DF} studieren.";
 770   my $sub = "m\x{00FC}ss";
 771   my $match;
 772   if(my($pos,$len) = $Collator->index($str, $sub)){
 773       $match = substr($str, $pos, $len);
 774   }
 775
 776 and get C<"mu\x{00DF}"> in C<$match> since C<"mu>E<223>C<">
 777 is primary equal to C<"m>E<252>C<ss">.
 778
 779 =back
 780
 781 =head2 EXPORT
 782
 783 None by default.
 784
 785 =head2 TODO
 786
 787 Unicode::Collate has not been ported to EBCDIC.  The code mostly would
 788 work just fine but a decision needs to be made: how the module should
 789 work in EBCDIC?  Should the low 256 characters be understood as
 790 Unicode or as EBCDIC code points?  Should one be chosen or should
 791 there be a way to do either?  Or should such translation be left
 792 outside the module for the user to do, for example by using
 793 Encode::from_to()?
 794
 795 =head2 CAVEAT
 796
 797 Use of the C<normalization> parameter requires
 798 the B<Unicode::Normalize> module.
 799
 800 If you need not it (say, in the case when you need not
 801 handle any combining characters),
 802 assign C<normalization =E<gt> undef> explicitly.
 803
 804 -- see 6.5 Avoiding Normalization, UTR #10.
 805
 806 =head1 AUTHOR
 807
 808 SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
 809
 810   http://homepage1.nifty.com/nomenclator/perl/
 811
 812   Copyright(C) 2001, SADAHIRO Tomoyuki. Japan. All rights reserved.
 813
 814   This program is free software; you can redistribute it and/or
 815   modify it under the same terms as Perl itself.
 816
 817 =head1 SEE ALSO
 818
 819 =over 4
 820
 821 =item Unicode Collation Algorithm - Unicode TR #10
 822
 823 http://www.unicode.org/unicode/reports/tr10/
 824
 825 =item L<Unicode::Normalize>
 826
 827 normalized forms of Unicode text
 828
 829 =back
 830
 831 =cut