lib/Unicode/Collate.pm

   1 package Unicode::Collate;
   2
   3 BEGIN {
   4     if (ord("A") == 193) {
   5         die "Unicode::Collate not ported to EBCDIC\n";
   6     }
   7 }
   8
   9 use 5.006;
  10 use strict;
  11 use warnings;
  12 use Carp;
  13
  14 require Exporter;
  15
  16 our $VERSION = '0.12';
  17 our $PACKAGE = __PACKAGE__;
  18
  19 our @ISA = qw(Exporter);
  20
  21 our %EXPORT_TAGS = ();
  22 our @EXPORT_OK = ();
  23 our @EXPORT = ();
  24
  25 (our $Path = $INC{'Unicode/Collate.pm'}) =~ s/\.pm$//;
  26 our $KeyFile = "allkeys.txt";
  27
  28 our $UNICODE_VERSION;
  29
  30 eval { require Unicode::UCD };
  31
  32 unless ($@) {
  33     $UNICODE_VERSION = Unicode::UCD::UnicodeVersion();
  34 }
  35 else { # XXX, Perl 5.6.1
  36     my($f, $fh);
  37     foreach my $d (@INC) {
  38         use File::Spec;
  39         $f = File::Spec->catfile($d, "unicode", "Unicode.301");
  40         if (open($fh, $f)) {
  41             $UNICODE_VERSION = '3.0.1';
  42             close $fh;
  43             last;
  44         }
  45     }
  46 }
  47
  48 our $getCombinClass; # coderef for combining class from Unicode::Normalize
  49
  50 use constant Min2      => 0x20;   # minimum weight at level 2
  51 use constant Min3      => 0x02;   # minimum weight at level 3
  52 use constant UNDEFINED => 0xFF80; # special value for undefined CE's
  53
  54 our $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
  55
  56 sub UCA_Version { "8.0" }
  57
  58 sub Base_Unicode_Version { $UNICODE_VERSION || 'unknown' }
  59
  60 ##
  61 ## constructor
  62 ##
  63 sub new
  64 {
  65     my $class = shift;
  66     my $self = bless { @_ }, $class;
  67
  68     # alternate lowercased
  69     $self->{alternate} =
  70         ! exists $self->{alternate} ? 'shifted' : lc($self->{alternate});
  71
  72     croak "$PACKAGE unknown alternate tag name: $self->{alternate}"
  73         unless $self->{alternate} eq 'blanked'
  74             || $self->{alternate} eq 'non-ignorable'
  75             || $self->{alternate} eq 'shifted'
  76             || $self->{alternate} eq 'shift-trimmed';
  77
  78     # collation level
  79     $self->{level} ||= 4;
  80
  81     croak "Illegal level lower than 1 (passed $self->{level})."
  82         if $self->{level} < 1;
  83     croak "A level higher than 4 (passed $self->{level}) is not supported."
  84         if 4 < $self->{level};
  85
  86     # overrideHangul and -CJK
  87     # If true: CODEREF used; '': default; undef: derived elements
  88     $self->{overrideHangul} = ''
  89         if ! exists $self->{overrideHangul};
  90     $self->{overrideCJK} = ''
  91         if ! exists $self->{overrideCJK};
  92
  93     # normalization form
  94     $self->{normalization} = 'D'
  95         if ! exists $self->{normalization};
  96     $self->{UNF} = undef;
  97
  98     if (defined $self->{normalization}) {
  99         eval { require Unicode::Normalize };
 100         croak "Unicode/Normalize.pm is required to normalize strings: $@"
 101             if $@;
 102
 103         Unicode::Normalize->import();
 104         $getCombinClass = \&Unicode::Normalize::getCombinClass
 105             if ! $getCombinClass;
 106
 107         $self->{UNF} =
 108             $self->{normalization} =~ /^(?:NF)?C$/  ? \&NFC :
 109             $self->{normalization} =~ /^(?:NF)?D$/  ? \&NFD :
 110             $self->{normalization} =~ /^(?:NF)?KC$/ ? \&NFKC :
 111             $self->{normalization} =~ /^(?:NF)?KD$/ ? \&NFKD :
 112           croak "$PACKAGE unknown normalization form name: "
 113                 . $self->{normalization};
 114     }
 115
 116     # Open a table file.
 117     # If undef is passed explicitly, no file is read.
 118     $self->{table} = $KeyFile
 119         if ! exists $self->{table};
 120     $self->read_table
 121         if defined $self->{table};
 122
 123     if ($self->{entry}) {
 124         $self->parseEntry($_) foreach split /\n/, $self->{entry};
 125     }
 126
 127     # backwards
 128     $self->{backwards} ||= [ ];
 129     $self->{backwards} = [ $self->{backwards} ]
 130         if ! ref $self->{backwards};
 131
 132     # rearrange
 133     $self->{rearrange} = $DefaultRearrange
 134         if ! exists $self->{rearrange};
 135     $self->{rearrange} = []
 136         if ! defined $self->{rearrange};
 137     croak "$PACKAGE: A list for rearrangement must be store in an ARRAYREF"
 138         if ! ref $self->{rearrange};
 139
 140     # keys of $self->{rearrangeHash} are $self->{rearrange}.
 141     $self->{rearrangeHash} = undef;
 142
 143     if (@{ $self->{rearrange} }) {
 144         @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
 145     }
 146
 147     return $self;
 148 }
 149
 150 sub read_table {
 151     my $self = shift;
 152     my $file = $self->{table} ne '' ? $self->{table} : $KeyFile;
 153
 154     open my $fk, "<$Path/$file"
 155         or croak "File does not exist at $Path/$file";
 156
 157     while (<$fk>) {
 158         next if /^\s*#/;
 159         if (/^\s*\@/) {
 160             if (/^\@version\s*(\S*)/) {
 161                 $self->{version} ||= $1;
 162             }
 163             elsif (/^\@alternate\s+(.*)/) {
 164                 $self->{alternate} ||= $1;
 165             }
 166             elsif (/^\@backwards\s+(.*)/) {
 167                 push @{ $self->{backwards} }, $1;
 168             }
 169             elsif (/^\@rearrange\s+(.*)/) {
 170                 push @{ $self->{rearrange} }, _getHexArray($1);
 171             }
 172             next;
 173         }
 174         $self->parseEntry($_);
 175     }
 176     close $fk;
 177 }
 178
 179
 180 ##
 181 ## get $line, parse it, and write an entry in $self
 182 ##
 183 sub parseEntry
 184 {
 185     my $self = shift;
 186     my $line = shift;
 187     my($name, $ele, @key);
 188
 189     return if $line !~ /^\s*[0-9A-Fa-f]/;
 190
 191     # removes comment and gets name
 192     $name = $1
 193         if $line =~ s/[#%]\s*(.*)//;
 194     return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
 195
 196     # gets element
 197     my($e, $k) = split /;/, $line;
 198     croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
 199         if ! $k;
 200
 201     my @e = _getHexArray($e);
 202     $ele = pack('U*', @e);
 203     return if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
 204
 205     # get sort key
 206     if (defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/ ||
 207         defined $self->{ignoreChar} && $ele  =~ /$self->{ignoreChar}/)
 208     {
 209         $self->{entries}{$ele} = $self->{ignored}{$ele} = 1;
 210     }
 211     else {
 212         my $combining = 1; # primary = 0, secondary != 0;
 213
 214         foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
 215             my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
 216             push @key, $self->altCE($var, _getHexArray($arr));
 217             $combining = 0 unless $key[-1][0] == 0 && $key[-1][1] != 0;
 218         }
 219         $self->{entries}{$ele} = \@key;
 220         $self->{combining}{$ele} = 1 if $combining;
 221     }
 222     $self->{maxlength}{ord $ele} = scalar @e if @e > 1;
 223 }
 224
 225
 226 ##
 227 ## arrayref CE = altCE(bool variable?, list[num] weights)
 228 ##
 229 sub altCE
 230 {
 231     my $self = shift;
 232     my $var  = shift;
 233     my @c    = @_;
 234
 235     $self->{alternate} eq 'blanked' ?
 236         $var ? [0,0,0,$c[3]] : \@c :
 237     $self->{alternate} eq 'non-ignorable' ?
 238         \@c :
 239     $self->{alternate} eq 'shifted' ?
 240         $var ? [0,0,0,$c[0] ] : [ @c[0..2], $c[0]+$c[1]+$c[2] ? 0xFFFF : 0 ] :
 241     $self->{alternate} eq 'shift-trimmed' ?
 242         $var ? [0,0,0,$c[0] ] : [ @c[0..2], 0 ] :
 243         croak "$PACKAGE unknown alternate name: $self->{alternate}";
 244 }
 245
 246 ##
 247 ## string hex_sortkey = splitCE(string arg)
 248 ##
 249 sub viewSortKey
 250 {
 251     my $self = shift;
 252     my $key  = $self->getSortKey(@_);
 253     my $view = join " ", map sprintf("%04X", $_), unpack 'n*', $key;
 254     $view =~ s/ ?0000 ?/|/g;
 255     return "[$view]";
 256 }
 257
 258
 259 ##
 260 ## list[strings] elements = splitCE(string arg)
 261 ##
 262 sub splitCE
 263 {
 264     my $self = shift;
 265     my $code = $self->{preprocess};
 266     my $norm = $self->{UNF};
 267     my $ent  = $self->{entries};
 268     my $max  = $self->{maxlength};
 269     my $reH  = $self->{rearrangeHash};
 270
 271     my $str = ref $code ? &$code(shift) : shift;
 272     $str = &$norm($str) if ref $norm;
 273
 274     my @src = unpack('U*', $str);
 275     my @buf;
 276
 277     # rearrangement
 278     if ($reH) {
 279         for (my $i = 0; $i < @src; $i++) {
 280             if (exists $reH->{ $src[$i] } && $i + 1 < @src) {
 281                 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]);
 282                 $i++;
 283             }
 284         }
 285     }
 286
 287     for (my $i = 0; $i < @src; $i++) {
 288         my $ch;
 289         my $u = $src[$i];
 290
 291         # non-characters
 292         next unless defined $u;
 293         next if $u < 0 || 0x10FFFF < $u    # out of range
 294             || (0xD800 <= $u && $u <= 0xDFFF); # unpaired surrogates
 295         my $four = $u & 0xFFFF;
 296         next if $four == 0xFFFE || $four == 0xFFFF;
 297
 298         if ($max->{$u}) { # contract
 299             for (my $j = $max->{$u}; $j >= 1; $j--) {
 300                 next unless $i+$j-1 < @src;
 301                 $ch = pack 'U*', @src[$i .. $i+$j-1];
 302                 $i += $j-1, last if $ent->{$ch};
 303             }
 304         } else {
 305             $ch = pack('U', $u);
 306         }
 307
 308         # with Combining Char (UTS#10, 4.2.1), here requires Unicode::Normalize.
 309         if ($getCombinClass && defined $ch) {
 310             for (my $j = $i+1; $j < @src; $j++) {
 311                 next unless defined $src[$j];
 312                 last unless $getCombinClass->( $src[$j] );
 313                 my $comb = pack 'U', $src[$j];
 314                 next if ! $ent->{ $ch.$comb };
 315                 $ch .= $comb;
 316                 $src[$j] = undef;
 317             }
 318         }
 319         push @buf, $ch;
 320     }
 321     wantarray ? @buf : \@buf;
 322 }
 323
 324
 325 ##
 326 ## list[arrayrefs] weight = getWt(string element)
 327 ##
 328 sub getWt
 329 {
 330     my $self = shift;
 331     my $ch   = shift;
 332     my $ent  = $self->{entries};
 333     my $ign  = $self->{ignored};
 334     my $cjk  = $self->{overrideCJK};
 335     my $hang = $self->{overrideHangul};
 336
 337     return if !defined $ch || $ign->{$ch}; # ignored
 338     return @{ $ent->{$ch} } if $ent->{$ch};
 339     my $u = unpack('U', $ch);
 340
 341     if (0xAC00 <= $u && $u <= 0xD7A3) { # is_Hangul
 342         return $hang
 343             ? &$hang($u)
 344             : defined $hang
 345                 ? map({
 346                         my $v = $_;
 347                         my $ar = $ent->{pack('U', $v)};
 348                         $ar ? @$ar : map($self->altCE(0,@$_), _derivCE($v));
 349                     } _decompHangul($u))
 350                 : map($self->altCE(0,@$_), _derivCE($u));
 351     }
 352     elsif (0x3400 <= $u && $u <= 0x4DB5 ||
 353            0x4E00 <= $u && $u <= 0x9FA5 ||
 354            0x20000 <= $u && $u <= 0x2A6D6) { # is_CJK
 355         return $cjk
 356             ? &$cjk($u)
 357             : defined $cjk && $u <= 0xFFFF
 358                 ? $self->altCE(0, ($u, 0x20, 0x02, $u))
 359                 : map($self->altCE(0,@$_), _derivCE($u));
 360     }
 361     else {
 362         return map($self->altCE(0,@$_), _derivCE($u));
 363     }
 364 }
 365
 366 ##
 367 ## int = index(string, substring)
 368 ##
 369 sub index
 370 {
 371     my $self = shift;
 372     my $lev  = $self->{level};
 373     my $comb = $self->{combining};
 374     my $str  = $self->splitCE(shift);
 375     my $sub  = $self->splitCE(shift);
 376
 377     return wantarray ? (0,0) : 0 if ! @$sub;
 378     return wantarray ?  ()  : -1 if ! @$str;
 379
 380     my @subWt = grep _ignorableAtLevel($_,$lev),
 381                 map $self->getWt($_), @$sub;
 382
 383     my(@strWt,@strPt);
 384     my $count = 0;
 385     for (my $i = 0; $i < @$str; $i++) {
 386         my $go_ahead = 0;
 387
 388         my @tmp = grep _ignorableAtLevel($_,$lev), $self->getWt($str->[$i]);
 389         $go_ahead += length $str->[$i];
 390
 391         # /*XXX*/ still broken.
 392         # index("e\x{300}", "e") should be 'no match' at level 2 or higher
 393         # as "e\x{300}" is a *single* grapheme cluster and not equal to "e".
 394
 395         # go ahead as far as we find a combining character;
 396         while ($i + 1 < @$str &&
 397               (! defined $str->[$i+1] || $comb->{ $str->[$i+1] }) ) {
 398             $i++;
 399             $go_ahead += length $str->[$i];
 400             next if ! defined $str->[$i];
 401             push @tmp,
 402                 grep _ignorableAtLevel($_,$lev), $self->getWt($str->[$i]);
 403         }
 404
 405         push @strWt, @tmp;
 406         push @strPt, ($count) x @tmp;
 407         $count += $go_ahead;
 408
 409         while (@strWt >= @subWt) {
 410             if (_eqArray(\@strWt, \@subWt, $lev)) {
 411                 my $pos = $strPt[0];
 412                 return wantarray ? ($pos, $count-$pos) : $pos;
 413             }
 414             shift @strWt;
 415             shift @strPt;
 416         }
 417     }
 418     return wantarray ? () : -1;
 419 }
 420
 421 ##
 422 ## bool _eqArray(arrayref, arrayref, level)
 423 ##
 424 sub _eqArray($$$)
 425 {
 426     my $a   = shift; # length $a >= length $b;
 427     my $b   = shift;
 428     my $lev = shift;
 429     for my $v (0..$lev-1) {
 430         for my $c (0..@$b-1){
 431             return if $a->[$c][$v] != $b->[$c][$v];
 432         }
 433     }
 434     return 1;
 435 }
 436
 437
 438 ##
 439 ## bool _ignorableAtLevel(CE, level)
 440 ##
 441 sub _ignorableAtLevel($$)
 442 {
 443     my $ce = shift;
 444     return unless defined $ce;
 445     my $lv = shift;
 446     return ! grep { ! $ce->[$_] } 0..$lv-1;
 447 }
 448
 449
 450 ##
 451 ## string sortkey = getSortKey(string arg)
 452 ##
 453 sub getSortKey
 454 {
 455     my $self = shift;
 456     my $lev  = $self->{level};
 457     my $rCE  = $self->splitCE(shift); # get an arrayref
 458
 459     # weight arrays
 460     my @buf = grep defined(), map $self->getWt($_), @$rCE;
 461
 462     # make sort key
 463     my @ret = ([],[],[],[]);
 464     foreach my $v (0..$lev-1) {
 465         foreach my $b (@buf) {
 466             push @{ $ret[$v] }, $b->[$v] if $b->[$v];
 467         }
 468     }
 469     foreach (@{ $self->{backwards} }) {
 470         my $v = $_ - 1;
 471         @{ $ret[$v] } = reverse @{ $ret[$v] };
 472     }
 473
 474     # modification of tertiary weights
 475     if ($self->{upper_before_lower}) {
 476         foreach (@{ $ret[2] }) {
 477             if    (0x8 <= $_ && $_ <= 0xC) { $_ -= 6 } # lower
 478             elsif (0x2 <= $_ && $_ <= 0x6) { $_ += 6 } # upper
 479             elsif ($_ == 0x1C)             { $_ += 1 } # square upper
 480             elsif ($_ == 0x1D)             { $_ -= 1 } # square lower
 481         }
 482     }
 483     if ($self->{katakana_before_hiragana}) {
 484         foreach (@{ $ret[2] }) {
 485             if    (0x0F <= $_ && $_ <= 0x13) { $_ -= 2 } # katakana
 486             elsif (0x0D <= $_ && $_ <= 0x0E) { $_ += 5 } # hiragana
 487         }
 488     }
 489     join "\0\0", map pack('n*', @$_), @ret;
 490 }
 491
 492
 493 ##
 494 ## int compare = cmp(string a, string b)
 495 ##
 496 sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
 497 sub eq  { $_[0]->getSortKey($_[1]) eq  $_[0]->getSortKey($_[2]) }
 498 sub ne  { $_[0]->getSortKey($_[1]) ne  $_[0]->getSortKey($_[2]) }
 499 sub lt  { $_[0]->getSortKey($_[1]) lt  $_[0]->getSortKey($_[2]) }
 500 sub le  { $_[0]->getSortKey($_[1]) le  $_[0]->getSortKey($_[2]) }
 501 sub gt  { $_[0]->getSortKey($_[1]) gt  $_[0]->getSortKey($_[2]) }
 502 sub ge  { $_[0]->getSortKey($_[1]) ge  $_[0]->getSortKey($_[2]) }
 503
 504 ##
 505 ## list[strings] sorted = sort(list[strings] arg)
 506 ##
 507 sub sort {
 508     my $obj = shift;
 509     return
 510         map { $_->[1] }
 511             sort{ $a->[0] cmp $b->[0] }
 512                 map [ $obj->getSortKey($_), $_ ], @_;
 513 }
 514
 515 ##
 516 ## list[arrayrefs] CE = _derivCE(int codepoint)
 517 ##
 518 sub _derivCE {
 519     my $code = shift;
 520     my $a = UNDEFINED + ($code >> 15); # ok
 521     my $b = ($code & 0x7FFF) | 0x8000; # ok
 522 #   my $a = 0xFFC2 + ($code >> 15);    # ng
 523 #   my $b = $code & 0x7FFF | 0x1000;   # ng
 524     $b ? ([$a,2,1,$code],[$b,0,0,$code]) : [$a,2,1,$code];
 525 }
 526
 527 ##
 528 ## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
 529 ##
 530 sub _getHexArray { map hex, $_[0] =~ /([0-9a-fA-F]+)/g }
 531
 532 #
 533 # $code must be in Hangul syllable.
 534 # Check it before you enter here.
 535 #
 536 sub _decompHangul {
 537     my $code = shift;
 538     my $SIndex = $code - 0xAC00;
 539     my $LIndex = int( $SIndex / 588);
 540     my $VIndex = int(($SIndex % 588) / 28);
 541     my $TIndex =      $SIndex % 28;
 542     return (
 543         0x1100 + $LIndex,
 544         0x1161 + $VIndex,
 545         $TIndex ? (0x11A7 + $TIndex) : (),
 546     );
 547 }
 548
 549 1;
 550 __END__
 551
 552 =head1 NAME
 553
 554 Unicode::Collate - Unicode Collation Algorithm
 555
 556 =head1 SYNOPSIS
 557
 558   use Unicode::Collate;
 559
 560   #construct
 561   $Collator = Unicode::Collate->new(%tailoring);
 562
 563   #sort
 564   @sorted = $Collator->sort(@not_sorted);
 565
 566   #compare
 567   $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
 568
 569 =head1 DESCRIPTION
 570
 571 =head2 Constructor and Tailoring
 572
 573 The C<new> method returns a collator object.
 574
 575    $Collator = Unicode::Collate->new(
 576       alternate => $alternate,
 577       backwards => $levelNumber, # or \@levelNumbers
 578       entry => $element,
 579       normalization  => $normalization_form,
 580       ignoreName => qr/$ignoreName/,
 581       ignoreChar => qr/$ignoreChar/,
 582       katakana_before_hiragana => $bool,
 583       level => $collationLevel,
 584       overrideCJK => \&overrideCJK,
 585       overrideHangul => \&overrideHangul,
 586       preprocess => \&preprocess,
 587       rearrange => \@charList,
 588       table => $filename,
 589       undefName => qr/$undefName/,
 590       undefChar => qr/$undefChar/,
 591       upper_before_lower => $bool,
 592    );
 593    # if %tailoring is false (i.e. empty),
 594    # $Collator should do the default collation.
 595
 596 =over 4
 597
 598 =item alternate
 599
 600 -- see 3.2.2 Alternate Weighting, UTR #10.
 601
 602 This key allows to alternate weighting for variable collation elements,
 603 which are marked with an ASTERISK in the table
 604 (NOTE: Many punction marks and symbols are variable in F<allkeys.txt>).
 605
 606    alternate => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
 607
 608 These names are case-insensitive.
 609 By default (if specification is omitted), 'shifted' is adopted.
 610
 611    'Blanked'        Variable elements are ignorable at levels 1 through 3;
 612                     considered at the 4th level.
 613
 614    'Non-ignorable'  Variable elements are not reset to ignorable.
 615
 616    'Shifted'        Variable elements are ignorable at levels 1 through 3
 617                     their level 4 weight is replaced by the old level 1 weight.
 618                     Level 4 weight for Non-Variable elements is 0xFFFF.
 619
 620    'Shift-Trimmed'  Same as 'shifted', but all FFFF's at the 4th level
 621                     are trimmed.
 622
 623 =item backwards
 624
 625 -- see 3.1.2 French Accents, UTR #10.
 626
 627      backwards => $levelNumber or \@levelNumbers
 628
 629 Weights in reverse order; ex. level 2 (diacritic ordering) in French.
 630 If omitted, forwards at all the levels.
 631
 632 =item entry
 633
 634 -- see 3.1 Linguistic Features; 3.2.1 File Format, UTR #10.
 635
 636 Overrides a default order or defines additional collation elements
 637
 638   entry => <<'ENTRIES', # use the UCA file format
 639 00E6 ; [.0861.0020.0002.00E6] [.08B1.0020.0002.00E6] # ligature <ae> as <a><e>
 640 0063 0068 ; [.0893.0020.0002.0063]      # "ch" in traditional Spanish
 641 0043 0068 ; [.0893.0020.0008.0043]      # "Ch" in traditional Spanish
 642 ENTRIES
 643
 644 =item ignoreName
 645
 646 =item ignoreChar
 647
 648 -- see Completely Ignorable, 3.2.2 Alternate Weighting, UTR #10.
 649
 650 Makes the entry in the table ignorable.
 651 If a collation element is ignorable,
 652 it is ignored as if the element had been deleted from there.
 653
 654 E.g. when 'a' and 'e' are ignorable,
 655 'element' is equal to 'lament' (or 'lmnt').
 656
 657 =item level
 658
 659 -- see 4.3 Form a sort key for each string, UTR #10.
 660
 661 Set the maximum level.
 662 Any higher levels than the specified one are ignored.
 663
 664   Level 1: alphabetic ordering
 665   Level 2: diacritic ordering
 666   Level 3: case ordering
 667   Level 4: tie-breaking (e.g. in the case when alternate is 'shifted')
 668
 669   ex.level => 2,
 670
 671 If omitted, the maximum is the 4th.
 672
 673 =item normalization
 674
 675 -- see 4.1 Normalize each input string, UTR #10.
 676
 677 If specified, strings are normalized before preparation of sort keys
 678 (the normalization is executed after preprocess).
 679
 680 As a form name, one of the following names must be used.
 681
 682   'C'  or 'NFC'  for Normalization Form C
 683   'D'  or 'NFD'  for Normalization Form D
 684   'KC' or 'NFKC' for Normalization Form KC
 685   'KD' or 'NFKD' for Normalization Form KD
 686
 687 If omitted, the string is put into Normalization Form D.
 688
 689 If C<undef> is passed explicitly as the value for this key,
 690 any normalization is not carried out (this may make tailoring easier
 691 if any normalization is not desired).
 692
 693 see B<CAVEAT>.
 694
 695 =item overrideCJK
 696
 697 -- see 7.1 Derived Collation Elements, UTR #10.
 698
 699 By default, mapping of CJK Unified Ideographs
 700 uses the Unicode codepoint order.
 701 But the mapping of CJK Unified Ideographs may be overrided.
 702
 703 ex. CJK Unified Ideographs in the JIS code point order.
 704
 705   overrideCJK => sub {
 706       my $u = shift;             # get a Unicode codepoint
 707       my $b = pack('n', $u);     # to UTF-16BE
 708       my $s = your_unicode_to_sjis_converter($b); # convert
 709       my $n = unpack('n', $s);   # convert sjis to short
 710       [ $n, 0x20, 0x2, $u ];     # return the collation element
 711   },
 712
 713 ex. ignores all CJK Unified Ideographs.
 714
 715   overrideCJK => sub {()}, # CODEREF returning empty list
 716
 717    # where ->eq("Pe\x{4E00}rl", "Perl") is true
 718    # as U+4E00 is a CJK Unified Ideograph and to be ignorable.
 719
 720 If C<undef> is passed explicitly as the value for this key,
 721 weights for CJK Unified Ideographs are treated as undefined.
 722 But assignment of weight for CJK Unified Ideographs
 723 in table or L<entry> is still valid.
 724
 725 =item overrideHangul
 726
 727 -- see 7.1 Derived Collation Elements, UTR #10.
 728
 729 By default, Hangul Syllables are decomposed into Hangul Jamo.
 730 But the mapping of Hangul Syllables may be overrided.
 731
 732 This tag works like L<overrideCJK>, so see there for examples.
 733
 734 If you want to override the mapping of Hangul Syllables,
 735 the Normalization Forms D and KD are not appropriate
 736 (they will be decomposed before overriding).
 737
 738 If C<undef> is passed explicitly as the value for this key,
 739 weight for Hangul Syllables is treated as undefined
 740 without decomposition into Hangul Jamo.
 741 But definition of weight for Hangul Syllables
 742 in table or L<entry> is still valid.
 743
 744 =item preprocess
 745
 746 -- see 5.1 Preprocessing, UTR #10.
 747
 748 If specified, the coderef is used to preprocess
 749 before the formation of sort keys.
 750
 751 ex. dropping English articles, such as "a" or "the".
 752 Then, "the pen" is before "a pencil".
 753
 754      preprocess => sub {
 755            my $str = shift;
 756            $str =~ s/\b(?:an?|the)\s+//gi;
 757            $str;
 758         },
 759
 760 =item rearrange
 761
 762 -- see 3.1.3 Rearrangement, UTR #10.
 763
 764 Characters that are not coded in logical order and to be rearranged.
 765 By default,
 766
 767     rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
 768
 769 If you want to disallow any rearrangement,
 770 pass C<undef> or C<[]> (a reference to an empty list)
 771 as the value for this key.
 772
 773 =item table
 774
 775 -- see 3.2 Default Unicode Collation Element Table, UTR #10.
 776
 777 You can use another element table if desired.
 778 The table file must be in your C<lib/Unicode/Collate> directory.
 779
 780 By default, the file C<lib/Unicode/Collate/allkeys.txt> is used.
 781
 782 If C<undef> is passed explicitly as the value for this key,
 783 no file is read (but you can define collation elements via L<entry>).
 784
 785 A typical way to define a collation element table
 786 without any file of table:
 787
 788    $onlyABC = Unicode::Collate->new(
 789        table => undef,
 790        entry => << 'ENTRIES',
 791 0061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A
 792 0041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A
 793 0062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B
 794 0042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B
 795 0063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C
 796 0043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C
 797 ENTRIES
 798     );
 799
 800 =item undefName
 801
 802 =item undefChar
 803
 804 -- see 6.3.4 Reducing the Repertoire, UTR #10.
 805
 806 Undefines the collation element as if it were unassigned in the table.
 807 This reduces the size of the table.
 808 If an unassigned character appears in the string to be collated,
 809 the sort key is made from its codepoint
 810 as a single-character collation element,
 811 as it is greater than any other assigned collation elements
 812 (in the codepoint order among the unassigned characters).
 813 But, it'd be better to ignore characters
 814 unfamiliar to you and maybe never used.
 815
 816 =item katakana_before_hiragana
 817
 818 =item upper_before_lower
 819
 820 -- see 6.6 Case Comparisons; 7.3.1 Tertiary Weight Table, UTR #10.
 821
 822 By default, lowercase is before uppercase
 823 and hiragana is before katakana.
 824
 825 If the tag is made true, this is reversed.
 826
 827 B<NOTE>: These tags simplemindedly assume
 828 any lowercase/uppercase or hiragana/katakana distinctions
 829 should occur in level 3, and their weights at level 3
 830 should be same as those mentioned in 7.3.1, UTR #10.
 831 If you define your collation elements which violates this,
 832 these tags doesn't work validly.
 833
 834 =back
 835
 836 =head2 Methods for Collation
 837
 838 =over 4
 839
 840 =item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
 841
 842 Sorts a list of strings.
 843
 844 =item C<$result = $Collator-E<gt>cmp($a, $b)>
 845
 846 Returns 1 (when C<$a> is greater than C<$b>)
 847 or 0 (when C<$a> is equal to C<$b>)
 848 or -1 (when C<$a> is lesser than C<$b>).
 849
 850 =item C<$result = $Collator-E<gt>eq($a, $b)>
 851
 852 =item C<$result = $Collator-E<gt>ne($a, $b)>
 853
 854 =item C<$result = $Collator-E<gt>lt($a, $b)>
 855
 856 =item C<$result = $Collator-E<gt>le($a, $b)>
 857
 858 =item C<$result = $Collator-E<gt>gt($a, $b)>
 859
 860 =item C<$result = $Collator-E<gt>ge($a, $b)>
 861
 862 They works like the same name operators as theirs.
 863
 864    eq : whether $a is equal to $b.
 865    ne : whether $a is not equal to $b.
 866    lt : whether $a is lesser than $b.
 867    le : whether $a is lesser than $b or equal to $b.
 868    gt : whether $a is greater than $b.
 869    ge : whether $a is greater than $b or equal to $b.
 870
 871 =item C<$sortKey = $Collator-E<gt>getSortKey($string)>
 872
 873 -- see 4.3 Form a sort key for each string, UTR #10.
 874
 875 Returns a sort key.
 876
 877 You compare the sort keys using a binary comparison
 878 and get the result of the comparison of the strings using UCA.
 879
 880    $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
 881
 882       is equivalent to
 883
 884    $Collator->cmp($a, $b)
 885
 886 =item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)>
 887
 888 Returns a string formalized to display a sort key.
 889 Weights are enclosed with C<'['> and C<']'>
 890 and level boundaries are denoted by C<'|'>.
 891
 892    use Unicode::Collate;
 893    my $c = Unicode::Collate->new();
 894    print $c->viewSortKey("Perl"),"\n";
 895
 896     # output:
 897     # [09B3 08B1 09CB 094F|0020 0020 0020 0020|0008 0002 0002 0002|FFFF FFFF FFFF FFFF]
 898     #  Level 1             Level 2             Level 3             Level 4
 899
 900 =item C<$position = $Collator-E<gt>index($string, $substring)>
 901
 902 =item C<($position, $length) = $Collator-E<gt>index($string, $substring)>
 903
 904 -- see 6.8 Searching, UTR #10.
 905
 906 If C<$substring> matches a part of C<$string>, returns
 907 the position of the first occurrence of the matching part in scalar context;
 908 in list context, returns a two-element list of
 909 the position and the length of the matching part.
 910
 911 B<Notice> that the length of the matching part may differ from
 912 the length of C<$substring>.
 913
 914 B<Note> that the position and the length are counted on the string
 915 after the process of preprocess, normalization, and rearrangement.
 916 Therefore, in case the specified string is not binary equal to
 917 the preprocessed/normalized/rearranged string, the position and the length
 918 may differ form those on the specified string. But it is guaranteed
 919 that, if matched, it returns a non-negative value as C<$position>.
 920
 921 If C<$substring> does not match any part of C<$string>,
 922 returns C<-1> in scalar context and
 923 an empty list in list context.
 924
 925 e.g. you say
 926
 927   my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
 928   my $str = "Ich mu\x{00DF} studieren.";
 929   my $sub = "m\x{00FC}ss";
 930   my $match;
 931   if (my($pos,$len) = $Collator->index($str, $sub)) {
 932       $match = substr($str, $pos, $len);
 933   }
 934
 935 and get C<"mu\x{00DF}"> in C<$match> since C<"mu>E<223>C<">
 936 is primary equal to C<"m>E<252>C<ss">.
 937
 938 =back
 939
 940 =head2 Other Methods
 941
 942 =over 4
 943
 944 =item UCA_Version
 945
 946 Returns the version number of Unicode Technical Standard 10
 947 this module consults.
 948
 949 =item Base_Unicode_Version
 950
 951 Returns the version number of the Unicode Standard
 952 this module is based on.
 953
 954 =back
 955
 956 =head2 EXPORT
 957
 958 None by default.
 959
 960 =head2 TODO
 961
 962 Unicode::Collate has not been ported to EBCDIC.  The code mostly would
 963 work just fine but a decision needs to be made: how the module should
 964 work in EBCDIC?  Should the low 256 characters be understood as
 965 Unicode or as EBCDIC code points?  Should one be chosen or should
 966 there be a way to do either?  Or should such translation be left
 967 outside the module for the user to do, for example by using
 968 Encode::from_to()?
 969 (or utf8::unicode_to_native()/utf8::native_to_unicode()?)
 970
 971 =head2 CAVEAT
 972
 973 Use of the C<normalization> parameter requires
 974 the B<Unicode::Normalize> module.
 975
 976 If you need not it (say, in the case when you need not
 977 handle any combining characters),
 978 assign C<normalization =E<gt> undef> explicitly.
 979
 980 -- see 6.5 Avoiding Normalization, UTR #10.
 981
 982 =head2 BUGS
 983
 984 C<index()> is an experimental method and
 985 its return value may be unreliable.
 986 The correct implementation for C<index()> must be based
 987 on Locale-Sensitive Support: Level 3 in UTR #18,
 988 F<Unicode Regular Expression Guidelines>.
 989
 990 See also 4.2 Locale-Dependent Graphemes in UTR #18.
 991
 992 =head1 AUTHOR
 993
 994 SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
 995
 996   http://homepage1.nifty.com/nomenclator/perl/
 997
 998   Copyright(C) 2001-2002, SADAHIRO Tomoyuki. Japan. All rights reserved.
 999
1000   This library is free software; you can redistribute it
1001   and/or modify it under the same terms as Perl itself.
1002
1003 =head1 SEE ALSO
1004
1005 =over 4
1006
1007 =item http://www.unicode.org/unicode/reports/tr10/
1008
1009 Unicode Collation Algorithm - UTR #10
1010
1011 =item http://www.unicode.org/unicode/reports/tr10/allkeys.txt
1012
1013 The Default Unicode Collation Element Table
1014
1015 =item http://www.unicode.org/unicode/reports/tr15/
1016
1017 Unicode Normalization Forms - UAX #15
1018
1019 =item http://www.unicode.org/unicode/reports/tr18
1020
1021 Unicode Regular Expression Guidelines - UTR #18
1022
1023 =item L<Unicode::Normalize>
1024
1025 =back
1026
1027 =cut