lib/Unicode/Collate.pm

   1 package Unicode::Collate;
   2
   3 BEGIN {
   4     if (ord("A") == 193) {
   5         die "Unicode::Collate not ported to EBCDIC\n";
   6     }
   7 }
   8
   9 use 5.006;
  10 use strict;
  11 use warnings;
  12 use Carp;
  13 use File::Spec;
  14
  15 require Exporter;
  16
  17 our $VERSION = '0.12';
  18 our $PACKAGE = __PACKAGE__;
  19
  20 our @ISA = qw(Exporter);
  21
  22 our %EXPORT_TAGS = ();
  23 our @EXPORT_OK = ();
  24 our @EXPORT = ();
  25
  26 (our $Path = $INC{'Unicode/Collate.pm'}) =~ s/\.pm$//;
  27 our $KeyFile = "allkeys.txt";
  28
  29 our $UNICODE_VERSION;
  30
  31 eval { require Unicode::UCD };
  32
  33 unless ($@) {
  34     $UNICODE_VERSION = Unicode::UCD::UnicodeVersion();
  35 }
  36 else { # XXX, Perl 5.6.1
  37     my($f, $fh);
  38     foreach my $d (@INC) {
  39         use File::Spec;
  40         $f = File::Spec->catfile($d, "unicode", "Unicode.301");
  41         if (open($fh, $f)) {
  42             $UNICODE_VERSION = '3.0.1';
  43             close $fh;
  44             last;
  45         }
  46     }
  47 }
  48
  49 our $getCombinClass; # coderef for combining class from Unicode::Normalize
  50
  51 use constant Min2      => 0x20;   # minimum weight at level 2
  52 use constant Min3      => 0x02;   # minimum weight at level 3
  53 use constant UNDEFINED => 0xFF80; # special value for undefined CE's
  54
  55 our $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
  56
  57 sub UCA_Version { "8.0" }
  58
  59 sub Base_Unicode_Version { $UNICODE_VERSION || 'unknown' }
  60
  61 ##
  62 ## constructor
  63 ##
  64 sub new
  65 {
  66     my $class = shift;
  67     my $self = bless { @_ }, $class;
  68
  69     # alternate lowercased
  70     $self->{alternate} =
  71         ! exists $self->{alternate} ? 'shifted' : lc($self->{alternate});
  72
  73     croak "$PACKAGE unknown alternate tag name: $self->{alternate}"
  74         unless $self->{alternate} eq 'blanked'
  75             || $self->{alternate} eq 'non-ignorable'
  76             || $self->{alternate} eq 'shifted'
  77             || $self->{alternate} eq 'shift-trimmed';
  78
  79     # collation level
  80     $self->{level} ||= 4;
  81
  82     croak "Illegal level lower than 1 (passed $self->{level})."
  83         if $self->{level} < 1;
  84     croak "A level higher than 4 (passed $self->{level}) is not supported."
  85         if 4 < $self->{level};
  86
  87     # overrideHangul and -CJK
  88     # If true: CODEREF used; '': default; undef: derived elements
  89     $self->{overrideHangul} = ''
  90         if ! exists $self->{overrideHangul};
  91     $self->{overrideCJK} = ''
  92         if ! exists $self->{overrideCJK};
  93
  94     # normalization form
  95     $self->{normalization} = 'D'
  96         if ! exists $self->{normalization};
  97     $self->{UNF} = undef;
  98
  99     if (defined $self->{normalization}) {
 100         eval { require Unicode::Normalize };
 101         croak "Unicode/Normalize.pm is required to normalize strings: $@"
 102             if $@;
 103
 104         Unicode::Normalize->import();
 105         $getCombinClass = \&Unicode::Normalize::getCombinClass
 106             if ! $getCombinClass;
 107
 108         $self->{UNF} =
 109             $self->{normalization} =~ /^(?:NF)?C$/  ? \&NFC :
 110             $self->{normalization} =~ /^(?:NF)?D$/  ? \&NFD :
 111             $self->{normalization} =~ /^(?:NF)?KC$/ ? \&NFKC :
 112             $self->{normalization} =~ /^(?:NF)?KD$/ ? \&NFKD :
 113           croak "$PACKAGE unknown normalization form name: "
 114                 . $self->{normalization};
 115     }
 116
 117     # Open a table file.
 118     # If undef is passed explicitly, no file is read.
 119     $self->{table} = $KeyFile
 120         if ! exists $self->{table};
 121     $self->read_table
 122         if defined $self->{table};
 123
 124     if ($self->{entry}) {
 125         $self->parseEntry($_) foreach split /\n/, $self->{entry};
 126     }
 127
 128     # backwards
 129     $self->{backwards} ||= [ ];
 130     $self->{backwards} = [ $self->{backwards} ]
 131         if ! ref $self->{backwards};
 132
 133     # rearrange
 134     $self->{rearrange} = $DefaultRearrange
 135         if ! exists $self->{rearrange};
 136     $self->{rearrange} = []
 137         if ! defined $self->{rearrange};
 138     croak "$PACKAGE: A list for rearrangement must be store in an ARRAYREF"
 139         if ! ref $self->{rearrange};
 140
 141     # keys of $self->{rearrangeHash} are $self->{rearrange}.
 142     $self->{rearrangeHash} = undef;
 143
 144     if (@{ $self->{rearrange} }) {
 145         @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
 146     }
 147
 148     return $self;
 149 }
 150
 151 sub read_table {
 152     my $self = shift;
 153     my $file = $self->{table} ne '' ? $self->{table} : $KeyFile;
 154
 155     my $filepath = File::Spec->catfile($Path, $file);
 156     open my $fk, "<$filepath"
 157         or croak "File does not exist at $filepath";
 158
 159     while (<$fk>) {
 160         next if /^\s*#/;
 161         if (/^\s*\@/) {
 162             if (/^\@version\s*(\S*)/) {
 163                 $self->{version} ||= $1;
 164             }
 165             elsif (/^\@alternate\s+(.*)/) {
 166                 $self->{alternate} ||= $1;
 167             }
 168             elsif (/^\@backwards\s+(.*)/) {
 169                 push @{ $self->{backwards} }, $1;
 170             }
 171             elsif (/^\@rearrange\s+(.*)/) {
 172                 push @{ $self->{rearrange} }, _getHexArray($1);
 173             }
 174             next;
 175         }
 176         $self->parseEntry($_);
 177     }
 178     close $fk;
 179 }
 180
 181
 182 ##
 183 ## get $line, parse it, and write an entry in $self
 184 ##
 185 sub parseEntry
 186 {
 187     my $self = shift;
 188     my $line = shift;
 189     my($name, $ele, @key);
 190
 191     return if $line !~ /^\s*[0-9A-Fa-f]/;
 192
 193     # removes comment and gets name
 194     $name = $1
 195         if $line =~ s/[#%]\s*(.*)//;
 196     return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
 197
 198     # gets element
 199     my($e, $k) = split /;/, $line;
 200     croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
 201         if ! $k;
 202
 203     my @e = _getHexArray($e);
 204     $ele = pack('U*', @e);
 205     return if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
 206
 207     # get sort key
 208     if (defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/ ||
 209         defined $self->{ignoreChar} && $ele  =~ /$self->{ignoreChar}/)
 210     {
 211         $self->{entries}{$ele} = $self->{ignored}{$ele} = 1;
 212     }
 213     else {
 214         my $combining = 1; # primary = 0, secondary != 0;
 215
 216         foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
 217             my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
 218             push @key, $self->altCE($var, _getHexArray($arr));
 219             $combining = 0 unless $key[-1][0] == 0 && $key[-1][1] != 0;
 220         }
 221         $self->{entries}{$ele} = \@key;
 222         $self->{combining}{$ele} = 1 if $combining;
 223     }
 224     $self->{maxlength}{ord $ele} = scalar @e if @e > 1;
 225 }
 226
 227
 228 ##
 229 ## arrayref CE = altCE(bool variable?, list[num] weights)
 230 ##
 231 sub altCE
 232 {
 233     my $self = shift;
 234     my $var  = shift;
 235     my @c    = @_;
 236
 237     $self->{alternate} eq 'blanked' ?
 238         $var ? [0,0,0,$c[3]] : \@c :
 239     $self->{alternate} eq 'non-ignorable' ?
 240         \@c :
 241     $self->{alternate} eq 'shifted' ?
 242         $var ? [0,0,0,$c[0] ] : [ @c[0..2], $c[0]+$c[1]+$c[2] ? 0xFFFF : 0 ] :
 243     $self->{alternate} eq 'shift-trimmed' ?
 244         $var ? [0,0,0,$c[0] ] : [ @c[0..2], 0 ] :
 245         croak "$PACKAGE unknown alternate name: $self->{alternate}";
 246 }
 247
 248 ##
 249 ## string hex_sortkey = splitCE(string arg)
 250 ##
 251 sub viewSortKey
 252 {
 253     my $self = shift;
 254     my $key  = $self->getSortKey(@_);
 255     my $view = join " ", map sprintf("%04X", $_), unpack 'n*', $key;
 256     $view =~ s/ ?0000 ?/|/g;
 257     return "[$view]";
 258 }
 259
 260
 261 ##
 262 ## list[strings] elements = splitCE(string arg)
 263 ##
 264 sub splitCE
 265 {
 266     my $self = shift;
 267     my $code = $self->{preprocess};
 268     my $norm = $self->{UNF};
 269     my $ent  = $self->{entries};
 270     my $max  = $self->{maxlength};
 271     my $reH  = $self->{rearrangeHash};
 272
 273     my $str = ref $code ? &$code(shift) : shift;
 274     $str = &$norm($str) if ref $norm;
 275
 276     my @src = unpack('U*', $str);
 277     my @buf;
 278
 279     # rearrangement
 280     if ($reH) {
 281         for (my $i = 0; $i < @src; $i++) {
 282             if (exists $reH->{ $src[$i] } && $i + 1 < @src) {
 283                 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]);
 284                 $i++;
 285             }
 286         }
 287     }
 288
 289     for (my $i = 0; $i < @src; $i++) {
 290         my $ch;
 291         my $u = $src[$i];
 292
 293         # non-characters
 294         next unless defined $u;
 295         next if $u < 0 || 0x10FFFF < $u    # out of range
 296             || (0xD800 <= $u && $u <= 0xDFFF); # unpaired surrogates
 297         my $four = $u & 0xFFFF;
 298         next if $four == 0xFFFE || $four == 0xFFFF;
 299
 300         if ($max->{$u}) { # contract
 301             for (my $j = $max->{$u}; $j >= 1; $j--) {
 302                 next unless $i+$j-1 < @src;
 303                 $ch = pack 'U*', @src[$i .. $i+$j-1];
 304                 $i += $j-1, last if $ent->{$ch};
 305             }
 306         } else {
 307             $ch = pack('U', $u);
 308         }
 309
 310         # with Combining Char (UTS#10, 4.2.1), here requires Unicode::Normalize.
 311         if ($getCombinClass && defined $ch) {
 312             for (my $j = $i+1; $j < @src; $j++) {
 313                 next unless defined $src[$j];
 314                 last unless $getCombinClass->( $src[$j] );
 315                 my $comb = pack 'U', $src[$j];
 316                 next if ! $ent->{ $ch.$comb };
 317                 $ch .= $comb;
 318                 $src[$j] = undef;
 319             }
 320         }
 321         push @buf, $ch;
 322     }
 323     wantarray ? @buf : \@buf;
 324 }
 325
 326
 327 ##
 328 ## list[arrayrefs] weight = getWt(string element)
 329 ##
 330 sub getWt
 331 {
 332     my $self = shift;
 333     my $ch   = shift;
 334     my $ent  = $self->{entries};
 335     my $ign  = $self->{ignored};
 336     my $cjk  = $self->{overrideCJK};
 337     my $hang = $self->{overrideHangul};
 338
 339     return if !defined $ch || $ign->{$ch}; # ignored
 340     return @{ $ent->{$ch} } if $ent->{$ch};
 341     my $u = unpack('U', $ch);
 342
 343     if (0xAC00 <= $u && $u <= 0xD7A3) { # is_Hangul
 344         return $hang
 345             ? &$hang($u)
 346             : defined $hang
 347                 ? map({
 348                         my $v = $_;
 349                         my $ar = $ent->{pack('U', $v)};
 350                         $ar ? @$ar : map($self->altCE(0,@$_), _derivCE($v));
 351                     } _decompHangul($u))
 352                 : map($self->altCE(0,@$_), _derivCE($u));
 353     }
 354     elsif (0x3400 <= $u && $u <= 0x4DB5 ||
 355            0x4E00 <= $u && $u <= 0x9FA5 ||
 356            0x20000 <= $u && $u <= 0x2A6D6) { # is_CJK
 357         return $cjk
 358             ? &$cjk($u)
 359             : defined $cjk && $u <= 0xFFFF
 360                 ? $self->altCE(0, ($u, 0x20, 0x02, $u))
 361                 : map($self->altCE(0,@$_), _derivCE($u));
 362     }
 363     else {
 364         return map($self->altCE(0,@$_), _derivCE($u));
 365     }
 366 }
 367
 368 ##
 369 ## int = index(string, substring)
 370 ##
 371 sub index
 372 {
 373     my $self = shift;
 374     my $lev  = $self->{level};
 375     my $comb = $self->{combining};
 376     my $str  = $self->splitCE(shift);
 377     my $sub  = $self->splitCE(shift);
 378
 379     return wantarray ? (0,0) : 0 if ! @$sub;
 380     return wantarray ?  ()  : -1 if ! @$str;
 381
 382     my @subWt = grep _ignorableAtLevel($_,$lev),
 383                 map $self->getWt($_), @$sub;
 384
 385     my(@strWt,@strPt);
 386     my $count = 0;
 387     for (my $i = 0; $i < @$str; $i++) {
 388         my $go_ahead = 0;
 389
 390         my @tmp = grep _ignorableAtLevel($_,$lev), $self->getWt($str->[$i]);
 391         $go_ahead += length $str->[$i];
 392
 393         # /*XXX*/ still broken.
 394         # index("e\x{300}", "e") should be 'no match' at level 2 or higher
 395         # as "e\x{300}" is a *single* grapheme cluster and not equal to "e".
 396
 397         # go ahead as far as we find a combining character;
 398         while ($i + 1 < @$str &&
 399               (! defined $str->[$i+1] || $comb->{ $str->[$i+1] }) ) {
 400             $i++;
 401             $go_ahead += length $str->[$i];
 402             next if ! defined $str->[$i];
 403             push @tmp,
 404                 grep _ignorableAtLevel($_,$lev), $self->getWt($str->[$i]);
 405         }
 406
 407         push @strWt, @tmp;
 408         push @strPt, ($count) x @tmp;
 409         $count += $go_ahead;
 410
 411         while (@strWt >= @subWt) {
 412             if (_eqArray(\@strWt, \@subWt, $lev)) {
 413                 my $pos = $strPt[0];
 414                 return wantarray ? ($pos, $count-$pos) : $pos;
 415             }
 416             shift @strWt;
 417             shift @strPt;
 418         }
 419     }
 420     return wantarray ? () : -1;
 421 }
 422
 423 ##
 424 ## bool _eqArray(arrayref, arrayref, level)
 425 ##
 426 sub _eqArray($$$)
 427 {
 428     my $a   = shift; # length $a >= length $b;
 429     my $b   = shift;
 430     my $lev = shift;
 431     for my $v (0..$lev-1) {
 432         for my $c (0..@$b-1){
 433             return if $a->[$c][$v] != $b->[$c][$v];
 434         }
 435     }
 436     return 1;
 437 }
 438
 439
 440 ##
 441 ## bool _ignorableAtLevel(CE, level)
 442 ##
 443 sub _ignorableAtLevel($$)
 444 {
 445     my $ce = shift;
 446     return unless defined $ce;
 447     my $lv = shift;
 448     return ! grep { ! $ce->[$_] } 0..$lv-1;
 449 }
 450
 451
 452 ##
 453 ## string sortkey = getSortKey(string arg)
 454 ##
 455 sub getSortKey
 456 {
 457     my $self = shift;
 458     my $lev  = $self->{level};
 459     my $rCE  = $self->splitCE(shift); # get an arrayref
 460
 461     # weight arrays
 462     my @buf = grep defined(), map $self->getWt($_), @$rCE;
 463
 464     # make sort key
 465     my @ret = ([],[],[],[]);
 466     foreach my $v (0..$lev-1) {
 467         foreach my $b (@buf) {
 468             push @{ $ret[$v] }, $b->[$v] if $b->[$v];
 469         }
 470     }
 471     foreach (@{ $self->{backwards} }) {
 472         my $v = $_ - 1;
 473         @{ $ret[$v] } = reverse @{ $ret[$v] };
 474     }
 475
 476     # modification of tertiary weights
 477     if ($self->{upper_before_lower}) {
 478         foreach (@{ $ret[2] }) {
 479             if    (0x8 <= $_ && $_ <= 0xC) { $_ -= 6 } # lower
 480             elsif (0x2 <= $_ && $_ <= 0x6) { $_ += 6 } # upper
 481             elsif ($_ == 0x1C)             { $_ += 1 } # square upper
 482             elsif ($_ == 0x1D)             { $_ -= 1 } # square lower
 483         }
 484     }
 485     if ($self->{katakana_before_hiragana}) {
 486         foreach (@{ $ret[2] }) {
 487             if    (0x0F <= $_ && $_ <= 0x13) { $_ -= 2 } # katakana
 488             elsif (0x0D <= $_ && $_ <= 0x0E) { $_ += 5 } # hiragana
 489         }
 490     }
 491     join "\0\0", map pack('n*', @$_), @ret;
 492 }
 493
 494
 495 ##
 496 ## int compare = cmp(string a, string b)
 497 ##
 498 sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
 499 sub eq  { $_[0]->getSortKey($_[1]) eq  $_[0]->getSortKey($_[2]) }
 500 sub ne  { $_[0]->getSortKey($_[1]) ne  $_[0]->getSortKey($_[2]) }
 501 sub lt  { $_[0]->getSortKey($_[1]) lt  $_[0]->getSortKey($_[2]) }
 502 sub le  { $_[0]->getSortKey($_[1]) le  $_[0]->getSortKey($_[2]) }
 503 sub gt  { $_[0]->getSortKey($_[1]) gt  $_[0]->getSortKey($_[2]) }
 504 sub ge  { $_[0]->getSortKey($_[1]) ge  $_[0]->getSortKey($_[2]) }
 505
 506 ##
 507 ## list[strings] sorted = sort(list[strings] arg)
 508 ##
 509 sub sort {
 510     my $obj = shift;
 511     return
 512         map { $_->[1] }
 513             sort{ $a->[0] cmp $b->[0] }
 514                 map [ $obj->getSortKey($_), $_ ], @_;
 515 }
 516
 517 ##
 518 ## list[arrayrefs] CE = _derivCE(int codepoint)
 519 ##
 520 sub _derivCE {
 521     my $code = shift;
 522     my $a = UNDEFINED + ($code >> 15); # ok
 523     my $b = ($code & 0x7FFF) | 0x8000; # ok
 524 #   my $a = 0xFFC2 + ($code >> 15);    # ng
 525 #   my $b = $code & 0x7FFF | 0x1000;   # ng
 526     $b ? ([$a,2,1,$code],[$b,0,0,$code]) : [$a,2,1,$code];
 527 }
 528
 529 ##
 530 ## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
 531 ##
 532 sub _getHexArray { map hex, $_[0] =~ /([0-9a-fA-F]+)/g }
 533
 534 #
 535 # $code must be in Hangul syllable.
 536 # Check it before you enter here.
 537 #
 538 sub _decompHangul {
 539     my $code = shift;
 540     my $SIndex = $code - 0xAC00;
 541     my $LIndex = int( $SIndex / 588);
 542     my $VIndex = int(($SIndex % 588) / 28);
 543     my $TIndex =      $SIndex % 28;
 544     return (
 545         0x1100 + $LIndex,
 546         0x1161 + $VIndex,
 547         $TIndex ? (0x11A7 + $TIndex) : (),
 548     );
 549 }
 550
 551 1;
 552 __END__
 553
 554 =head1 NAME
 555
 556 Unicode::Collate - Unicode Collation Algorithm
 557
 558 =head1 SYNOPSIS
 559
 560   use Unicode::Collate;
 561
 562   #construct
 563   $Collator = Unicode::Collate->new(%tailoring);
 564
 565   #sort
 566   @sorted = $Collator->sort(@not_sorted);
 567
 568   #compare
 569   $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
 570
 571 =head1 DESCRIPTION
 572
 573 =head2 Constructor and Tailoring
 574
 575 The C<new> method returns a collator object.
 576
 577    $Collator = Unicode::Collate->new(
 578       alternate => $alternate,
 579       backwards => $levelNumber, # or \@levelNumbers
 580       entry => $element,
 581       normalization  => $normalization_form,
 582       ignoreName => qr/$ignoreName/,
 583       ignoreChar => qr/$ignoreChar/,
 584       katakana_before_hiragana => $bool,
 585       level => $collationLevel,
 586       overrideCJK => \&overrideCJK,
 587       overrideHangul => \&overrideHangul,
 588       preprocess => \&preprocess,
 589       rearrange => \@charList,
 590       table => $filename,
 591       undefName => qr/$undefName/,
 592       undefChar => qr/$undefChar/,
 593       upper_before_lower => $bool,
 594    );
 595    # if %tailoring is false (i.e. empty),
 596    # $Collator should do the default collation.
 597
 598 =over 4
 599
 600 =item alternate
 601
 602 -- see 3.2.2 Alternate Weighting, UTR #10.
 603
 604 This key allows to alternate weighting for variable collation elements,
 605 which are marked with an ASTERISK in the table
 606 (NOTE: Many punction marks and symbols are variable in F<allkeys.txt>).
 607
 608    alternate => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
 609
 610 These names are case-insensitive.
 611 By default (if specification is omitted), 'shifted' is adopted.
 612
 613    'Blanked'        Variable elements are ignorable at levels 1 through 3;
 614                     considered at the 4th level.
 615
 616    'Non-ignorable'  Variable elements are not reset to ignorable.
 617
 618    'Shifted'        Variable elements are ignorable at levels 1 through 3
 619                     their level 4 weight is replaced by the old level 1 weight.
 620                     Level 4 weight for Non-Variable elements is 0xFFFF.
 621
 622    'Shift-Trimmed'  Same as 'shifted', but all FFFF's at the 4th level
 623                     are trimmed.
 624
 625 =item backwards
 626
 627 -- see 3.1.2 French Accents, UTR #10.
 628
 629      backwards => $levelNumber or \@levelNumbers
 630
 631 Weights in reverse order; ex. level 2 (diacritic ordering) in French.
 632 If omitted, forwards at all the levels.
 633
 634 =item entry
 635
 636 -- see 3.1 Linguistic Features; 3.2.1 File Format, UTR #10.
 637
 638 Overrides a default order or defines additional collation elements
 639
 640   entry => <<'ENTRIES', # use the UCA file format
 641 00E6 ; [.0861.0020.0002.00E6] [.08B1.0020.0002.00E6] # ligature <ae> as <a><e>
 642 0063 0068 ; [.0893.0020.0002.0063]      # "ch" in traditional Spanish
 643 0043 0068 ; [.0893.0020.0008.0043]      # "Ch" in traditional Spanish
 644 ENTRIES
 645
 646 =item ignoreName
 647
 648 =item ignoreChar
 649
 650 -- see Completely Ignorable, 3.2.2 Alternate Weighting, UTR #10.
 651
 652 Makes the entry in the table ignorable.
 653 If a collation element is ignorable,
 654 it is ignored as if the element had been deleted from there.
 655
 656 E.g. when 'a' and 'e' are ignorable,
 657 'element' is equal to 'lament' (or 'lmnt').
 658
 659 =item level
 660
 661 -- see 4.3 Form a sort key for each string, UTR #10.
 662
 663 Set the maximum level.
 664 Any higher levels than the specified one are ignored.
 665
 666   Level 1: alphabetic ordering
 667   Level 2: diacritic ordering
 668   Level 3: case ordering
 669   Level 4: tie-breaking (e.g. in the case when alternate is 'shifted')
 670
 671   ex.level => 2,
 672
 673 If omitted, the maximum is the 4th.
 674
 675 =item normalization
 676
 677 -- see 4.1 Normalize each input string, UTR #10.
 678
 679 If specified, strings are normalized before preparation of sort keys
 680 (the normalization is executed after preprocess).
 681
 682 As a form name, one of the following names must be used.
 683
 684   'C'  or 'NFC'  for Normalization Form C
 685   'D'  or 'NFD'  for Normalization Form D
 686   'KC' or 'NFKC' for Normalization Form KC
 687   'KD' or 'NFKD' for Normalization Form KD
 688
 689 If omitted, the string is put into Normalization Form D.
 690
 691 If C<undef> is passed explicitly as the value for this key,
 692 any normalization is not carried out (this may make tailoring easier
 693 if any normalization is not desired).
 694
 695 see B<CAVEAT>.
 696
 697 =item overrideCJK
 698
 699 -- see 7.1 Derived Collation Elements, UTR #10.
 700
 701 By default, mapping of CJK Unified Ideographs
 702 uses the Unicode codepoint order.
 703 But the mapping of CJK Unified Ideographs may be overrided.
 704
 705 ex. CJK Unified Ideographs in the JIS code point order.
 706
 707   overrideCJK => sub {
 708       my $u = shift;             # get a Unicode codepoint
 709       my $b = pack('n', $u);     # to UTF-16BE
 710       my $s = your_unicode_to_sjis_converter($b); # convert
 711       my $n = unpack('n', $s);   # convert sjis to short
 712       [ $n, 0x20, 0x2, $u ];     # return the collation element
 713   },
 714
 715 ex. ignores all CJK Unified Ideographs.
 716
 717   overrideCJK => sub {()}, # CODEREF returning empty list
 718
 719    # where ->eq("Pe\x{4E00}rl", "Perl") is true
 720    # as U+4E00 is a CJK Unified Ideograph and to be ignorable.
 721
 722 If C<undef> is passed explicitly as the value for this key,
 723 weights for CJK Unified Ideographs are treated as undefined.
 724 But assignment of weight for CJK Unified Ideographs
 725 in table or L<entry> is still valid.
 726
 727 =item overrideHangul
 728
 729 -- see 7.1 Derived Collation Elements, UTR #10.
 730
 731 By default, Hangul Syllables are decomposed into Hangul Jamo.
 732 But the mapping of Hangul Syllables may be overrided.
 733
 734 This tag works like L<overrideCJK>, so see there for examples.
 735
 736 If you want to override the mapping of Hangul Syllables,
 737 the Normalization Forms D and KD are not appropriate
 738 (they will be decomposed before overriding).
 739
 740 If C<undef> is passed explicitly as the value for this key,
 741 weight for Hangul Syllables is treated as undefined
 742 without decomposition into Hangul Jamo.
 743 But definition of weight for Hangul Syllables
 744 in table or L<entry> is still valid.
 745
 746 =item preprocess
 747
 748 -- see 5.1 Preprocessing, UTR #10.
 749
 750 If specified, the coderef is used to preprocess
 751 before the formation of sort keys.
 752
 753 ex. dropping English articles, such as "a" or "the".
 754 Then, "the pen" is before "a pencil".
 755
 756      preprocess => sub {
 757            my $str = shift;
 758            $str =~ s/\b(?:an?|the)\s+//gi;
 759            $str;
 760         },
 761
 762 =item rearrange
 763
 764 -- see 3.1.3 Rearrangement, UTR #10.
 765
 766 Characters that are not coded in logical order and to be rearranged.
 767 By default,
 768
 769     rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
 770
 771 If you want to disallow any rearrangement,
 772 pass C<undef> or C<[]> (a reference to an empty list)
 773 as the value for this key.
 774
 775 =item table
 776
 777 -- see 3.2 Default Unicode Collation Element Table, UTR #10.
 778
 779 You can use another element table if desired.
 780 The table file must be in your C<lib/Unicode/Collate> directory.
 781
 782 By default, the file C<lib/Unicode/Collate/allkeys.txt> is used.
 783
 784 If C<undef> is passed explicitly as the value for this key,
 785 no file is read (but you can define collation elements via L<entry>).
 786
 787 A typical way to define a collation element table
 788 without any file of table:
 789
 790    $onlyABC = Unicode::Collate->new(
 791        table => undef,
 792        entry => << 'ENTRIES',
 793 0061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A
 794 0041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A
 795 0062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B
 796 0042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B
 797 0063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C
 798 0043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C
 799 ENTRIES
 800     );
 801
 802 =item undefName
 803
 804 =item undefChar
 805
 806 -- see 6.3.4 Reducing the Repertoire, UTR #10.
 807
 808 Undefines the collation element as if it were unassigned in the table.
 809 This reduces the size of the table.
 810 If an unassigned character appears in the string to be collated,
 811 the sort key is made from its codepoint
 812 as a single-character collation element,
 813 as it is greater than any other assigned collation elements
 814 (in the codepoint order among the unassigned characters).
 815 But, it'd be better to ignore characters
 816 unfamiliar to you and maybe never used.
 817
 818 =item katakana_before_hiragana
 819
 820 =item upper_before_lower
 821
 822 -- see 6.6 Case Comparisons; 7.3.1 Tertiary Weight Table, UTR #10.
 823
 824 By default, lowercase is before uppercase
 825 and hiragana is before katakana.
 826
 827 If the tag is made true, this is reversed.
 828
 829 B<NOTE>: These tags simplemindedly assume
 830 any lowercase/uppercase or hiragana/katakana distinctions
 831 should occur in level 3, and their weights at level 3
 832 should be same as those mentioned in 7.3.1, UTR #10.
 833 If you define your collation elements which violates this,
 834 these tags doesn't work validly.
 835
 836 =back
 837
 838 =head2 Methods for Collation
 839
 840 =over 4
 841
 842 =item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
 843
 844 Sorts a list of strings.
 845
 846 =item C<$result = $Collator-E<gt>cmp($a, $b)>
 847
 848 Returns 1 (when C<$a> is greater than C<$b>)
 849 or 0 (when C<$a> is equal to C<$b>)
 850 or -1 (when C<$a> is lesser than C<$b>).
 851
 852 =item C<$result = $Collator-E<gt>eq($a, $b)>
 853
 854 =item C<$result = $Collator-E<gt>ne($a, $b)>
 855
 856 =item C<$result = $Collator-E<gt>lt($a, $b)>
 857
 858 =item C<$result = $Collator-E<gt>le($a, $b)>
 859
 860 =item C<$result = $Collator-E<gt>gt($a, $b)>
 861
 862 =item C<$result = $Collator-E<gt>ge($a, $b)>
 863
 864 They works like the same name operators as theirs.
 865
 866    eq : whether $a is equal to $b.
 867    ne : whether $a is not equal to $b.
 868    lt : whether $a is lesser than $b.
 869    le : whether $a is lesser than $b or equal to $b.
 870    gt : whether $a is greater than $b.
 871    ge : whether $a is greater than $b or equal to $b.
 872
 873 =item C<$sortKey = $Collator-E<gt>getSortKey($string)>
 874
 875 -- see 4.3 Form a sort key for each string, UTR #10.
 876
 877 Returns a sort key.
 878
 879 You compare the sort keys using a binary comparison
 880 and get the result of the comparison of the strings using UCA.
 881
 882    $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
 883
 884       is equivalent to
 885
 886    $Collator->cmp($a, $b)
 887
 888 =item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)>
 889
 890 Returns a string formalized to display a sort key.
 891 Weights are enclosed with C<'['> and C<']'>
 892 and level boundaries are denoted by C<'|'>.
 893
 894    use Unicode::Collate;
 895    my $c = Unicode::Collate->new();
 896    print $c->viewSortKey("Perl"),"\n";
 897
 898     # output:
 899     # [09B3 08B1 09CB 094F|0020 0020 0020 0020|0008 0002 0002 0002|FFFF FFFF FFFF FFFF]
 900     #  Level 1             Level 2             Level 3             Level 4
 901
 902 =item C<$position = $Collator-E<gt>index($string, $substring)>
 903
 904 =item C<($position, $length) = $Collator-E<gt>index($string, $substring)>
 905
 906 -- see 6.8 Searching, UTR #10.
 907
 908 If C<$substring> matches a part of C<$string>, returns
 909 the position of the first occurrence of the matching part in scalar context;
 910 in list context, returns a two-element list of
 911 the position and the length of the matching part.
 912
 913 B<Notice> that the length of the matching part may differ from
 914 the length of C<$substring>.
 915
 916 B<Note> that the position and the length are counted on the string
 917 after the process of preprocess, normalization, and rearrangement.
 918 Therefore, in case the specified string is not binary equal to
 919 the preprocessed/normalized/rearranged string, the position and the length
 920 may differ form those on the specified string. But it is guaranteed
 921 that, if matched, it returns a non-negative value as C<$position>.
 922
 923 If C<$substring> does not match any part of C<$string>,
 924 returns C<-1> in scalar context and
 925 an empty list in list context.
 926
 927 e.g. you say
 928
 929   my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
 930   my $str = "Ich mu\x{00DF} studieren.";
 931   my $sub = "m\x{00FC}ss";
 932   my $match;
 933   if (my($pos,$len) = $Collator->index($str, $sub)) {
 934       $match = substr($str, $pos, $len);
 935   }
 936
 937 and get C<"mu\x{00DF}"> in C<$match> since C<"mu>E<223>C<">
 938 is primary equal to C<"m>E<252>C<ss">.
 939
 940 =back
 941
 942 =head2 Other Methods
 943
 944 =over 4
 945
 946 =item UCA_Version
 947
 948 Returns the version number of Unicode Technical Standard 10
 949 this module consults.
 950
 951 =item Base_Unicode_Version
 952
 953 Returns the version number of the Unicode Standard
 954 this module is based on.
 955
 956 =back
 957
 958 =head2 EXPORT
 959
 960 None by default.
 961
 962 =head2 TODO
 963
 964 Unicode::Collate has not been ported to EBCDIC.  The code mostly would
 965 work just fine but a decision needs to be made: how the module should
 966 work in EBCDIC?  Should the low 256 characters be understood as
 967 Unicode or as EBCDIC code points?  Should one be chosen or should
 968 there be a way to do either?  Or should such translation be left
 969 outside the module for the user to do, for example by using
 970 Encode::from_to()?
 971 (or utf8::unicode_to_native()/utf8::native_to_unicode()?)
 972
 973 =head2 CAVEAT
 974
 975 Use of the C<normalization> parameter requires
 976 the B<Unicode::Normalize> module.
 977
 978 If you need not it (say, in the case when you need not
 979 handle any combining characters),
 980 assign C<normalization =E<gt> undef> explicitly.
 981
 982 -- see 6.5 Avoiding Normalization, UTR #10.
 983
 984 =head2 BUGS
 985
 986 C<index()> is an experimental method and
 987 its return value may be unreliable.
 988 The correct implementation for C<index()> must be based
 989 on Locale-Sensitive Support: Level 3 in UTR #18,
 990 F<Unicode Regular Expression Guidelines>.
 991
 992 See also 4.2 Locale-Dependent Graphemes in UTR #18.
 993
 994 =head1 AUTHOR
 995
 996 SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
 997
 998   http://homepage1.nifty.com/nomenclator/perl/
 999
1000   Copyright(C) 2001-2002, SADAHIRO Tomoyuki. Japan. All rights reserved.
1001
1002   This library is free software; you can redistribute it
1003   and/or modify it under the same terms as Perl itself.
1004
1005 =head1 SEE ALSO
1006
1007 =over 4
1008
1009 =item http://www.unicode.org/unicode/reports/tr10/
1010
1011 Unicode Collation Algorithm - UTR #10
1012
1013 =item http://www.unicode.org/unicode/reports/tr10/allkeys.txt
1014
1015 The Default Unicode Collation Element Table
1016
1017 =item http://www.unicode.org/unicode/reports/tr15/
1018
1019 Unicode Normalization Forms - UAX #15
1020
1021 =item http://www.unicode.org/unicode/reports/tr18
1022
1023 Unicode Regular Expression Guidelines - UTR #18
1024
1025 =item L<Unicode::Normalize>
1026
1027 =back
1028
1029 =cut