lib/Unicode/Collate.pm

   1 package Unicode::Collate;
   2
   3 BEGIN {
   4     if (ord("A") == 193) {
   5         die "Unicode::Collate not ported to EBCDIC\n";
   6     }
   7 }
   8
   9 use 5.006;
  10 use strict;
  11 use warnings;
  12 use Carp;
  13
  14 require Exporter;
  15
  16 our $VERSION = '0.10';
  17 our $PACKAGE = __PACKAGE__;
  18
  19 our @ISA = qw(Exporter);
  20
  21 our %EXPORT_TAGS = ();
  22 our @EXPORT_OK = ();
  23 our @EXPORT = ();
  24
  25 (our $Path = $INC{'Unicode/Collate.pm'}) =~ s/\.pm$//;
  26 our $KeyFile = "allkeys.txt";
  27
  28 our $getCombinClass; # coderef for combining class from Unicode::Normalize
  29
  30 use constant Min2      => 0x20;   # minimum weight at level 2
  31 use constant Min3      => 0x02;   # minimum weight at level 3
  32 use constant UNDEFINED => 0xFF80; # special value for undefined CE
  33
  34 ##
  35 ## constructor
  36 ##
  37 sub new
  38 {
  39   my $class = shift;
  40   my $self = bless { @_ }, $class;
  41
  42   # alternate
  43   $self->{alternate} =
  44      ! exists  $self->{alternate} ? 'shifted' :
  45      ! defined $self->{alternate} ? '' : $self->{alternate};
  46
  47   # collation level
  48   $self->{level} ||= ($self->{alternate} =~ /^shift/ ? 4 : 3);
  49
  50   # normalization form
  51   $self->{normalization} = 'D' if ! exists $self->{normalization};
  52
  53   if(defined $self->{normalization}){
  54     eval "use Unicode::Normalize;";
  55     croak "you'd install Unicode::Normalize for normalization forms: $@"
  56         if $@;
  57     $getCombinClass = \&Unicode::Normalize::getCombinClass
  58         if ! $getCombinClass;
  59   }
  60
  61   $self->{UNF} =
  62     ! defined $self->{normalization}        ? undef :
  63     $self->{normalization} =~ /^(?:NF)?C$/  ? \&NFC :
  64     $self->{normalization} =~ /^(?:NF)?D$/  ? \&NFD :
  65     $self->{normalization} =~ /^(?:NF)?KC$/ ? \&NFKC :
  66     $self->{normalization} =~ /^(?:NF)?KD$/ ? \&NFKD :
  67     croak "$PACKAGE unknown normalization form name: $self->{normalization}";
  68
  69   # backwards
  70   $self->{backwards} ||= [];
  71   $self->{backwards} = [ $self->{backwards} ] if ! ref $self->{backwards};
  72
  73   # rearrange
  74   $self->{rearrange} ||= []; # maybe not U+0000 (an ASCII)
  75   $self->{rearrange} = [ $self->{rearrange} ] if ! ref $self->{rearrange};
  76
  77   # open a table file.
  78   # if undef is passed explicitly, no file is read.
  79   $self->{table} = $KeyFile unless exists $self->{table};
  80   $self->read_table if defined $self->{table};
  81
  82   if($self->{entry}){
  83     $self->parseEntry($_) foreach split /\n/, $self->{entry};
  84   }
  85
  86   # keys of $self->{rearrangeHash} are $self->{rearrange}.
  87   $self->{rearrangeHash} = {};
  88   @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
  89
  90   return $self;
  91 }
  92
  93
  94 sub read_table {
  95   my $self = shift;
  96   my $file = $self->{table} ne '' ? $self->{table} : $KeyFile;
  97   open my $fk, "<$Path/$file" or croak "File does not exist at $Path/$file";
  98
  99   while(<$fk>){
 100     next if /^\s*#/;
 101     if(/^\s*\@/){
 102        if(/^\@version\s*(\S*)/){
 103          $self->{version} ||= $1;
 104        }
 105        elsif(/^\@alternate\s+(.*)/){
 106          $self->{alternate} ||= $1;
 107        }
 108        elsif(/^\@backwards\s+(.*)/){
 109          push @{ $self->{backwards} }, $1;
 110        }
 111        elsif(/^\@rearrange\s+(.*)/){
 112          push @{ $self->{rearrange} }, _getHexArray($1);
 113        }
 114        next;
 115     }
 116     $self->parseEntry($_);
 117   }
 118   close $fk;
 119 }
 120
 121
 122 ##
 123 ## get $line, parse it, and write an entry in $self
 124 ##
 125 sub parseEntry
 126 {
 127   my $self = shift;
 128   my $line = shift;
 129   my($name, $ele, @key);
 130
 131   return if $line !~ /^\s*[0-9A-Fa-f]/;
 132
 133   # get name
 134   $name = $1 if $line =~ s/#\s*(.*)//;
 135   return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
 136
 137   # get element
 138   my($e, $k) = split /;/, $line;
 139   my @e = _getHexArray($e);
 140   { no warnings 'utf8'; $ele = pack('U*', @e); }
 141   return if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
 142
 143   # get sort key
 144   { no warnings 'utf8';
 145     if(
 146      defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/ ||
 147      defined $self->{ignoreChar} && $ele  =~ /$self->{ignoreChar}/
 148        )
 149     {
 150         $self->{entries}{$ele} = $self->{ignored}{$ele} = 1;
 151     }
 152     else
 153     {
 154         foreach my $arr ($k =~ /\[(\S+)\]/g) {
 155             my $var = $arr =~ /\*/;
 156             push @key, $self->altCE( $var, _getHexArray($arr) );
 157         }
 158         $self->{entries}{$ele} = \@key;
 159     }
 160   }
 161   $self->{maxlength}{ord $ele} = scalar @e if @e > 1;
 162 }
 163
 164
 165 ##
 166 ## arrayref CE = altCE(bool variable?, list[num] weights)
 167 ##
 168 sub altCE
 169 {
 170   my $self = shift;
 171   my $var  = shift;
 172   my @c    = @_;
 173
 174   $self->{alternate} eq 'blanked' ?
 175     $var ? [0,0,0] : [ @c[0..2] ] :
 176   $self->{alternate} eq 'non-ignorable' ?
 177     [ @c[0..2] ] :
 178   $self->{alternate} eq 'shifted' ?
 179     $var ? [0,0,0,$c[0] ] : [ @c[0..2], $c[0]+$c[1]+$c[2] ? 0xFFFF : 0 ] :
 180   $self->{alternate} eq 'shift-trimmed' ?
 181     $var ? [0,0,0,$c[0] ] : [ @c[0..2], 0 ] :
 182    \@c;
 183 }
 184
 185 ##
 186 ## string hex_sortkey = splitCE(string arg)
 187 ##
 188 sub viewSortKey
 189 {
 190   my $self = shift;
 191   my $key  = $self->getSortKey(@_);
 192   my $view = join " ", map sprintf("%04X", $_), unpack 'n*', $key;
 193   $view =~ s/ ?0000 ?/|/g;
 194   "[$view]";
 195 }
 196
 197
 198 ##
 199 ## list[strings] elements = splitCE(string arg)
 200 ##
 201 sub splitCE
 202 {
 203   my $self = shift;
 204   my $code = $self->{preprocess};
 205   my $norm = $self->{UNF};
 206   my $ent  = $self->{entries};
 207   my $max  = $self->{maxlength};
 208   my $rear = $self->{rearrangeHash};
 209
 210   my $str = ref $code ? &$code(shift) : shift;
 211   $str = &$norm($str) if ref $norm;
 212
 213   my(@src, @buf);
 214   @src = unpack('U*', $str);
 215
 216   # rearrangement
 217   for(my $i = 0; $i < @src; $i++)
 218   {
 219      ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i])
 220         if $rear->{ $src[$i] };
 221      $i++;
 222   }
 223
 224   for(my $i = 0; $i < @src; $i++)
 225   {
 226     my $ch;
 227     my $u  = $src[$i];
 228
 229   # non-characters
 230     next unless defined $u;
 231     next if $u < 0 || 0x10FFFF < $u     # out of range
 232          || 0xD800 < $u && $u < 0xDFFF; # unpaired surrogates
 233     my $four = $u & 0xFFFF;
 234     next if $four == 0xFFFE || $four == 0xFFFF;
 235
 236     if($max->{$u}) # contract
 237     {
 238       for(my $j = $max->{$u}; $j >= 1; $j--)
 239       {
 240         next unless $i+$j-1 < @src;
 241         $ch = pack 'U*', @src[$i .. $i+$j-1];
 242         $i += $j-1, last if $ent->{$ch};
 243       }
 244     }
 245     else {  $ch = pack('U', $u) }
 246
 247   # with Combining Char (UTS#10, 4.2.1), here requires Unicode::Normalize.
 248     if($getCombinClass && defined $ch)
 249     {
 250       for(my $j = $i+1; $j < @src; $j++)
 251       {
 252         next unless defined $src[$j];
 253         last unless $getCombinClass->( $src[$j] );
 254         my $comb = pack 'U', $src[$j];
 255         next if ! $ent->{ $ch.$comb };
 256         $ch .= $comb;
 257         $src[$j] = undef;
 258       }
 259     }
 260     push @buf, $ch;
 261   }
 262   wantarray ? @buf : \@buf;
 263 }
 264
 265
 266 ##
 267 ## list[arrayrefs] weight = getWt(string element)
 268 ##
 269 sub getWt
 270 {
 271   my $self = shift;
 272   my $ch   = shift;
 273   my $ent  = $self->{entries};
 274   my $ign  = $self->{ignored};
 275   my $cjk  = $self->{overrideCJK};
 276   my $hang = $self->{overrideHangul};
 277   return if !defined $ch || $ign->{$ch};   # ignored
 278   return @{ $ent->{$ch} } if $ent->{$ch};
 279   my $u = unpack('U', $ch);
 280   return
 281     _isHangul($u)
 282       ? $hang
 283         ? &$hang($u)
 284         : map(@{ $ent->{pack('U', $_)} }, _decompHangul($u))
 285       : _isCJK($u)
 286         ? $cjk ? &$cjk($u) : map($self->altCE(0,@$_), _CJK($u))
 287         : map($self->altCE(0,@$_), _derivCE($u));
 288 }
 289
 290 ##
 291 ## int = index(string, substring)
 292 ##
 293 sub index
 294 {
 295   my $self = shift;
 296   my $lev  = $self->{level};
 297   my $str  = $self->splitCE(shift);
 298   my $sub  = $self->splitCE(shift);
 299
 300   return wantarray ? (0,0) : 0 if ! @$sub;
 301   return wantarray ?  ()  : -1 if ! @$str;
 302
 303   my @subWt = grep _ignorableAtLevel($_,$lev),
 304               map $self->getWt($_), @$sub;
 305
 306   my(@strWt,@strPt);
 307   my $count = 0;
 308   for my $e (@$str){
 309     my @tmp = grep _ignorableAtLevel($_,$lev), $self->getWt($e);
 310     push @strWt, @tmp;
 311     push @strPt, ($count) x @tmp;
 312     $count += length $e;
 313     while(@strWt >= @subWt){
 314       if(_eqArray(\@strWt, \@subWt, $lev)){
 315         my $pos = $strPt[0];
 316         return wantarray ? ($pos, $count-$pos) : $pos;
 317       }
 318       shift @strWt;
 319       shift @strPt;
 320     }
 321   }
 322   return wantarray ? () : -1;
 323 }
 324
 325 ##
 326 ## bool _eqArray(arrayref, arrayref, level)
 327 ##
 328 sub _eqArray($$$)
 329 {
 330   my $a   = shift; # length $a >= length $b;
 331   my $b   = shift;
 332   my $lev = shift;
 333   for my $v (0..$lev-1){
 334     for my $c (0..@$b-1){
 335       return if $a->[$c][$v] != $b->[$c][$v];
 336     }
 337   }
 338   return 1;
 339 }
 340
 341
 342 ##
 343 ## bool _ignorableAtLevel(CE, level)
 344 ##
 345 sub _ignorableAtLevel($$)
 346 {
 347   my $ce = shift;
 348   return if ! defined $ce;
 349   my $lv = shift;
 350   ! grep { ! $ce->[$_] } 0..$lv-1;
 351 }
 352
 353
 354 ##
 355 ## string sortkey = getSortKey(string arg)
 356 ##
 357 sub getSortKey
 358 {
 359   my $self = shift;
 360   my $lev  = $self->{level};
 361   my $rCE  = $self->splitCE(shift); # get an arrayref
 362
 363   # weight arrays
 364   my @buf = grep defined(), map $self->getWt($_), @$rCE;
 365
 366   # make sort key
 367   my @ret = ([],[],[],[]);
 368   foreach my $v (0..$lev-1){
 369     foreach my $b (@buf){
 370       push @{ $ret[$v] }, $b->[$v] if $b->[$v];
 371     }
 372   }
 373   foreach (@{ $self->{backwards} }){
 374     my $v = $_ - 1;
 375     @{ $ret[$v] } = reverse @{ $ret[$v] };
 376   }
 377
 378   # modification of tertiary weights
 379   if($self->{upper_before_lower}){
 380     foreach (@{ $ret[2] }){
 381       if   (0x8 <= $_ && $_ <= 0xC){ $_ -= 6 } # lower
 382       elsif(0x2 <= $_ && $_ <= 0x6){ $_ += 6 } # upper
 383       elsif($_ == 0x1C)            { $_ += 1 } # square upper
 384       elsif($_ == 0x1D)            { $_ -= 1 } # square lower
 385     }
 386   }
 387   if($self->{katakana_before_hiragana}){
 388     foreach (@{ $ret[2] }){
 389       if   (0x0F <= $_ && $_ <= 0x13){ $_ -= 2 } # katakana
 390       elsif(0x0D <= $_ && $_ <= 0x0E){ $_ += 5 } # hiragana
 391     }
 392   }
 393   join "\0\0", map pack('n*', @$_), @ret;
 394 }
 395
 396
 397 ##
 398 ## int compare = cmp(string a, string b)
 399 ##
 400 sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
 401 sub eq  { $_[0]->getSortKey($_[1]) eq  $_[0]->getSortKey($_[2]) }
 402 sub ne  { $_[0]->getSortKey($_[1]) ne  $_[0]->getSortKey($_[2]) }
 403 sub lt  { $_[0]->getSortKey($_[1]) lt  $_[0]->getSortKey($_[2]) }
 404 sub le  { $_[0]->getSortKey($_[1]) le  $_[0]->getSortKey($_[2]) }
 405 sub gt  { $_[0]->getSortKey($_[1]) gt  $_[0]->getSortKey($_[2]) }
 406 sub ge  { $_[0]->getSortKey($_[1]) ge  $_[0]->getSortKey($_[2]) }
 407
 408 ##
 409 ## list[strings] sorted = sort(list[strings] arg)
 410 ##
 411 sub sort
 412 {
 413   my $obj = shift;
 414
 415   map { $_->[1] }
 416   sort{ $a->[0] cmp $b->[0] }
 417   map [ $obj->getSortKey($_), $_ ], @_;
 418 }
 419
 420 ##
 421 ## list[arrayrefs] CE = _derivCE(int codepoint)
 422 ##
 423 sub _derivCE
 424 {
 425   my $code = shift;
 426   my $a = UNDEFINED + ($code >> 15); # ok
 427   my $b = ($code & 0x7FFF) | 0x8000; # ok
 428 # my $a = 0xFFC2 + ($code >> 15);    # ng
 429 # my $b = $code & 0x7FFF | 0x1000;   # ng
 430   $b ? ([$a,2,1,$code],[$b,0,0,$code]) : [$a,2,1,$code];
 431 }
 432
 433 ##
 434 ## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
 435 ##
 436 sub _getHexArray { map hex(), $_[0] =~ /([0-9a-fA-F]+)/g }
 437
 438 ##
 439 ## bool is_a_CJK_Unified_Ideograph = _isCJK(int codepoint)
 440 ##
 441 sub _isCJK
 442 {
 443   my $u = shift;
 444   return 0x3400 <= $u && $u <= 0x4DB5
 445       || 0x4E00 <= $u && $u <= 0x9FA5
 446 #      || 0x20000 <= $u && $u <= 0x2A6D6;
 447 }
 448
 449 ##
 450 ## list[arrayref] CE = _CJK(int codepoint_of_CJK)
 451 ##
 452 sub _CJK { $_[0] > 0xFFFF ? _derivCE($_[0]) : [$_[0], 0x20, 0x02, $_[0]] }
 453
 454 ##
 455 ## bool is_a_Hangul_Syllable = _isHangul(int codepoint)
 456 ##
 457 sub _isHangul { return 0xAC00 <= $_[0] && $_[0] <= 0xD7A3 }
 458
 459 sub _decompHangul {
 460     my $code = shift;
 461   # $code must be in Hangul syllable. check it before you enter here.
 462     my $SIndex = $code - 0xAC00;
 463     my $LIndex = int( $SIndex / 588);
 464     my $VIndex = int(($SIndex % 588) / 28);
 465     my $TIndex =      $SIndex % 28;
 466     return (
 467        0x1100 + $LIndex,
 468        0x1161 + $VIndex,
 469       $TIndex ? (0x11A7 + $TIndex) : (),
 470     );
 471 }
 472
 473 1;
 474 __END__
 475
 476 =head1 NAME
 477
 478 Unicode::Collate - use UCA (Unicode Collation Algorithm)
 479
 480 =head1 SYNOPSIS
 481
 482   use Unicode::Collate;
 483
 484   #construct
 485   $Collator = Unicode::Collate->new(%tailoring);
 486
 487   #sort
 488   @sorted = $Collator->sort(@not_sorted);
 489
 490   #compare
 491   $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
 492
 493 =head1 DESCRIPTION
 494
 495 =head2 Constructor and Tailoring
 496
 497 The C<new> method returns a collator object.
 498
 499    $Collator = Unicode::Collate->new(
 500       alternate => $alternate,
 501       backwards => $levelNumber, # or \@levelNumbers
 502       entry => $element,
 503       normalization  => $normalization_form,
 504       ignoreName => qr/$ignoreName/,
 505       ignoreChar => qr/$ignoreChar/,
 506       katakana_before_hiragana => $bool,
 507       level => $collationLevel,
 508       overrideCJK => \&overrideCJK,
 509       overrideHangul => \&overrideHangul,
 510       preprocess => \&preprocess,
 511       rearrange => \@charList,
 512       table => $filename,
 513       undefName => qr/$undefName/,
 514       undefChar => qr/$undefChar/,
 515       upper_before_lower => $bool,
 516    );
 517    # if %tailoring is false (empty),
 518    # $Collator should do the default collation.
 519
 520 =over 4
 521
 522 =item alternate
 523
 524 -- see 3.2.2 Alternate Weighting, UTR #10.
 525
 526    alternate => 'shifted', 'blanked', 'non-ignorable', or 'shift-trimmed'.
 527
 528 By default (if specification is omitted), 'shifted' is adopted.
 529
 530 =item backwards
 531
 532 -- see 3.1.2 French Accents, UTR #10.
 533
 534      backwards => $levelNumber or \@levelNumbers
 535
 536 Weights in reverse order; ex. level 2 (diacritic ordering) in French.
 537 If omitted, forwards at all the levels.
 538
 539 =item entry
 540
 541 -- see 3.1 Linguistic Features; 3.2.1 File Format, UTR #10.
 542
 543 Overrides a default order or adds a new collation element
 544
 545   entry => <<'ENTRIES', # use the UCA file format
 546 00E6 ; [.0861.0020.0002.00E6] [.08B1.0020.0002.00E6] # ligature <ae> as <a e>
 547 0063 0068 ; [.0893.0020.0002.0063]      # "ch" in traditional Spanish
 548 0043 0068 ; [.0893.0020.0008.0043]      # "Ch" in traditional Spanish
 549 ENTRIES
 550
 551 =item ignoreName
 552
 553 =item ignoreChar
 554
 555 -- see Completely Ignorable, 3.2.2 Alternate Weighting, UTR #10.
 556
 557 Ignores the entry in the table.
 558 If an ignored collation element appears in the string to be collated,
 559 it is ignored as if the element had been deleted from there.
 560
 561 E.g. when 'a' and 'e' are ignored,
 562 'element' is equal to 'lament' (or 'lmnt').
 563
 564 =item level
 565
 566 -- see 4.3 Form a sort key for each string, UTR #10.
 567
 568 Set the maximum level.
 569 Any higher levels than the specified one are ignored.
 570
 571   Level 1: alphabetic ordering
 572   Level 2: diacritic ordering
 573   Level 3: case ordering
 574   Level 4: tie-breaking (e.g. in the case when alternate is 'shifted')
 575
 576   ex.level => 2,
 577
 578 =item normalization
 579
 580 -- see 4.1 Normalize each input string, UTR #10.
 581
 582 If specified, strings are normalized before preparation of sort keys
 583 (the normalization is executed after preprocess).
 584
 585 As a form name, one of the following names must be used.
 586
 587   'C'  or 'NFC'  for Normalization Form C
 588   'D'  or 'NFD'  for Normalization Form D
 589   'KC' or 'NFKC' for Normalization Form KC
 590   'KD' or 'NFKD' for Normalization Form KD
 591
 592 If omitted, the string is put into Normalization Form D.
 593
 594 If undefined explicitly (as C<normalization =E<gt> undef>),
 595 any normalization is not carried out (this may make tailoring easier
 596 if any normalization is not desired).
 597
 598 see B<CAVEAT>.
 599
 600 =item overrideCJK
 601
 602 =item overrideHangul
 603
 604 -- see 7.1 Derived Collation Elements, UTR #10.
 605
 606 By default, mapping of CJK Unified Ideographs
 607 uses the Unicode codepoint order
 608 and Hangul Syllables are decomposed into Hangul Jamo.
 609
 610 The mapping of CJK Unified Ideographs
 611 or Hangul Syllables may be overrided.
 612
 613 ex. CJK Unified Ideographs in the JIS codepoint order.
 614
 615   overrideCJK => sub {
 616     my $u = shift;               # get unicode codepoint
 617     my $b = pack('n', $u);       # to UTF-16BE
 618     my $s = your_unicode_to_sjis_converter($b); # convert
 619     my $n = unpack('n', $s);     # convert sjis to short
 620     [ $n, 1, 1 ];                # return collation element
 621   },
 622
 623 If you want to override the mapping of Hangul Syllables,
 624 the Normalization Forms D and KD are not appropriate
 625 (they will be decomposed before overriding).
 626
 627 =item preprocess
 628
 629 -- see 5.1 Preprocessing, UTR #10.
 630
 631 If specified, the coderef is used to preprocess
 632 before the formation of sort keys.
 633
 634 ex. dropping English articles, such as "a" or "the".
 635 Then, "the pen" is before "a pencil".
 636
 637      preprocess => sub {
 638            my $str = shift;
 639            $str =~ s/\b(?:an?|the)\s+//g;
 640            $str;
 641         },
 642
 643 =item rearrange
 644
 645 -- see 3.1.3 Rearrangement, UTR #10.
 646
 647 Characters that are not coded in logical order and to be rearranged.
 648 By default,
 649
 650     rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
 651
 652 =item table
 653
 654 -- see 3.2 Default Unicode Collation Element Table, UTR #10.
 655
 656 You can use another element table if desired.
 657 The table file must be in your C<lib/Unicode/Collate> directory.
 658
 659 By default, the file C<lib/Unicode/Collate/allkeys.txt> is used.
 660
 661 If undefined explicitly (as C<table =E<gt> undef>),
 662 no file is read (you'd define collation elements using L<entry>).
 663
 664 =item undefName
 665
 666 =item undefChar
 667
 668 -- see 6.3.4 Reducing the Repertoire, UTR #10.
 669
 670 Undefines the collation element as if it were unassigned in the table.
 671 This reduces the size of the table.
 672 If an unassigned character appears in the string to be collated,
 673 the sort key is made from its codepoint
 674 as a single-character collation element,
 675 as it is greater than any other assigned collation elements
 676 (in the codepoint order among the unassigned characters).
 677 But, it'd be better to ignore characters
 678 unfamiliar to you and maybe never used.
 679
 680 =item katakana_before_hiragana
 681
 682 =item upper_before_lower
 683
 684 -- see 6.6 Case Comparisons; 7.3.1 Tertiary Weight Table, UTR #10.
 685
 686 By default, lowercase is before uppercase
 687 and hiragana is before katakana.
 688
 689 If the parameter is true, this is reversed.
 690
 691 =back
 692
 693 =head2 Other methods
 694
 695 =over 4
 696
 697 =item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
 698
 699 Sorts a list of strings.
 700
 701 =item C<$result = $Collator-E<gt>cmp($a, $b)>
 702
 703 Returns 1 (when C<$a> is greater than C<$b>)
 704 or 0 (when C<$a> is equal to C<$b>)
 705 or -1 (when C<$a> is lesser than C<$b>).
 706
 707 =item C<$result = $Collator-E<gt>eq($a, $b)>
 708
 709 =item C<$result = $Collator-E<gt>ne($a, $b)>
 710
 711 =item C<$result = $Collator-E<gt>lt($a, $b)>
 712
 713 =item C<$result = $Collator-E<gt>le($a, $b)>
 714
 715 =item C<$result = $Collator-E<gt>gt($a, $b)>
 716
 717 =item C<$result = $Collator-E<gt>ge($a, $b)>
 718
 719 They works like the same name operators as theirs.
 720
 721    eq : whether $a is equal to $b.
 722    ne : whether $a is not equal to $b.
 723    lt : whether $a is lesser than $b.
 724    le : whether $a is lesser than $b or equal to $b.
 725    gt : whether $a is greater than $b.
 726    ge : whether $a is greater than $b or equal to $b.
 727
 728 =item C<$sortKey = $Collator-E<gt>getSortKey($string)>
 729
 730 -- see 4.3 Form a sort key for each string, UTR #10.
 731
 732 Returns a sort key.
 733
 734 You compare the sort keys using a binary comparison
 735 and get the result of the comparison of the strings using UCA.
 736
 737    $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
 738
 739       is equivalent to
 740
 741    $Collator->cmp($a, $b)
 742
 743 =item C<$position = $Collator-E<gt>index($string, $substring)>
 744
 745 =item C<($position, $length) = $Collator-E<gt>index($string, $substring)>
 746
 747 -- see 6.8 Searching, UTR #10.
 748
 749 If C<$substring> matches a part of C<$string>, returns
 750 the position of the first occurrence of the matching part in scalar context;
 751 in list context, returns a two-element list of
 752 the position and the length of the matching part.
 753
 754 B<Notice> that the length of the matching part may differ from
 755 the length of C<$substring>.
 756
 757 B<Note> that the position and the length are counted on the string
 758 after the process of preprocess, normalization, and rearrangement.
 759 Therefore, in case the specified string is not binary equal to
 760 the preprocessed/normalized/rearranged string, the position and the length
 761 may differ form those on the specified string. But it is guaranteed
 762 that, if matched, it returns a non-negative value as C<$position>.
 763
 764 If C<$substring> does not match any part of C<$string>,
 765 returns C<-1> in scalar context and
 766 an empty list in list context.
 767
 768 e.g. you say
 769
 770   my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
 771   my $str = "Ich mu\x{00DF} studieren.";
 772   my $sub = "m\x{00FC}ss";
 773   my $match;
 774   if(my($pos,$len) = $Collator->index($str, $sub)){
 775       $match = substr($str, $pos, $len);
 776   }
 777
 778 and get C<"mu\x{00DF}"> in C<$match> since C<"mu>E<223>C<">
 779 is primary equal to C<"m>E<252>C<ss">.
 780
 781 =back
 782
 783 =head2 EXPORT
 784
 785 None by default.
 786
 787 =head2 TODO
 788
 789 Unicode::Collate has not been ported to EBCDIC.  The code mostly would
 790 work just fine but a decision needs to be made: how the module should
 791 work in EBCDIC?  Should the low 256 characters be understood as
 792 Unicode or as EBCDIC code points?  Should one be chosen or should
 793 there be a way to do either?  Or should such translation be left
 794 outside the module for the user to do, for example by using
 795 Encode::from_to()?
 796
 797 =head2 CAVEAT
 798
 799 Use of the C<normalization> parameter requires
 800 the B<Unicode::Normalize> module.
 801
 802 If you need not it (say, in the case when you need not
 803 handle any combining characters),
 804 assign C<normalization =E<gt> undef> explicitly.
 805
 806 -- see 6.5 Avoiding Normalization, UTR #10.
 807
 808 =head1 AUTHOR
 809
 810 SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
 811
 812   http://homepage1.nifty.com/nomenclator/perl/
 813
 814   Copyright(C) 2001, SADAHIRO Tomoyuki. Japan. All rights reserved.
 815
 816   This program is free software; you can redistribute it and/or
 817   modify it under the same terms as Perl itself.
 818
 819 =head1 SEE ALSO
 820
 821 =over 4
 822
 823 =item Unicode Collation Algorithm - Unicode TR #10
 824
 825 http://www.unicode.org/unicode/reports/tr10/
 826
 827 =item L<Unicode::Normalize>
 828
 829 normalized forms of Unicode text
 830
 831 =back
 832
 833 =cut