lib/Unicode/Collate.pm

   1 package Unicode::Collate;
   2
   3 BEGIN {
   4     if (ord("A") == 193) {
   5         die "Unicode::Collate not ported to EBCDIC\n";
   6     }
   7 }
   8
   9 use 5.006;
  10 use strict;
  11 use warnings;
  12 use Carp;
  13
  14 require Exporter;
  15
  16 our $VERSION = '0.11';
  17 our $PACKAGE = __PACKAGE__;
  18
  19 our @ISA = qw(Exporter);
  20
  21 our %EXPORT_TAGS = ();
  22 our @EXPORT_OK = ();
  23 our @EXPORT = ();
  24
  25 (our $Path = $INC{'Unicode/Collate.pm'}) =~ s/\.pm$//;
  26 our $KeyFile = "allkeys.txt";
  27
  28 our $UNICODE_VERSION;
  29
  30 {
  31     my($f, $fh);
  32     foreach my $d (@INC) {
  33         use File::Spec;
  34         $f = File::Spec->catfile($d, "unicore", "version");
  35         if (open($fh, $f)) {
  36             chomp($UNICODE_VERSION = <$fh>);
  37             close $fh;
  38             last;
  39         }
  40
  41         # XXX, Perl 5.6.1
  42         $f = File::Spec->catfile($d, "unicode", "Unicode.301");
  43         if (open($fh, $f)) {
  44             $UNICODE_VERSION = "3.0.1";
  45             close $fh;
  46             last;
  47         }
  48     }
  49 }
  50
  51 our $getCombinClass; # coderef for combining class from Unicode::Normalize
  52
  53 use constant Min2      => 0x20;   # minimum weight at level 2
  54 use constant Min3      => 0x02;   # minimum weight at level 3
  55 use constant UNDEFINED => 0xFF80; # special value for undefined CE's
  56
  57 our $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
  58
  59 sub UCA_Version { "8.0" }
  60
  61 sub Base_Unicode_Version { $UNICODE_VERSION }
  62
  63 ##
  64 ## constructor
  65 ##
  66 sub new
  67 {
  68     my $class = shift;
  69     my $self = bless { @_ }, $class;
  70
  71     # alternate lowercased
  72     $self->{alternate} =
  73         ! exists $self->{alternate} ? 'shifted' : lc($self->{alternate});
  74
  75     croak "$PACKAGE unknown alternate tag name: $self->{alternate}"
  76         unless $self->{alternate} eq 'blanked'
  77             || $self->{alternate} eq 'non-ignorable'
  78             || $self->{alternate} eq 'shifted'
  79             || $self->{alternate} eq 'shift-trimmed';
  80
  81     # collation level
  82     $self->{level} ||= 4;
  83
  84     croak "Illegal level lower than 1 (passed $self->{level})."
  85         if $self->{level} < 1;
  86     croak "A level higher than 4 (passed $self->{level}) is not supported."
  87         if 4 < $self->{level};
  88
  89     # overrideHangul and -CJK
  90     # If true: CODEREF used; '': default; undef: derived elements
  91     $self->{overrideHangul} = ''
  92         if ! exists $self->{overrideHangul};
  93     $self->{overrideCJK} = ''
  94         if ! exists $self->{overrideCJK};
  95
  96     # normalization form
  97     $self->{normalization} = 'D'
  98         if ! exists $self->{normalization};
  99     $self->{UNF} = undef;
 100
 101     if (defined $self->{normalization}) {
 102         eval { require Unicode::Normalize };
 103         croak "Unicode/Normalize.pm is required to normalize strings: $@"
 104             if $@;
 105
 106         Unicode::Normalize->import();
 107         $getCombinClass = \&Unicode::Normalize::getCombinClass
 108             if ! $getCombinClass;
 109
 110         $self->{UNF} =
 111             $self->{normalization} =~ /^(?:NF)?C$/  ? \&NFC :
 112             $self->{normalization} =~ /^(?:NF)?D$/  ? \&NFD :
 113             $self->{normalization} =~ /^(?:NF)?KC$/ ? \&NFKC :
 114             $self->{normalization} =~ /^(?:NF)?KD$/ ? \&NFKD :
 115           croak "$PACKAGE unknown normalization form name: "
 116                 . $self->{normalization};
 117     }
 118
 119     # Open a table file.
 120     # If undef is passed explicitly, no file is read.
 121     $self->{table} = $KeyFile
 122         if ! exists $self->{table};
 123     $self->read_table
 124         if defined $self->{table};
 125
 126     if ($self->{entry}) {
 127         $self->parseEntry($_) foreach split /\n/, $self->{entry};
 128     }
 129
 130     # backwards
 131     $self->{backwards} ||= [ ];
 132     $self->{backwards} = [ $self->{backwards} ]
 133         if ! ref $self->{backwards};
 134
 135     # rearrange
 136     $self->{rearrange} = $DefaultRearrange
 137         if ! exists $self->{rearrange};
 138     $self->{rearrange} = []
 139         if ! defined $self->{rearrange};
 140     croak "$PACKAGE: A list for rearrangement must be store in an ARRAYREF"
 141         if ! ref $self->{rearrange};
 142
 143     # keys of $self->{rearrangeHash} are $self->{rearrange}.
 144     $self->{rearrangeHash} = undef;
 145
 146     if (@{ $self->{rearrange} }) {
 147         @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
 148     }
 149
 150     return $self;
 151 }
 152
 153 sub read_table {
 154     my $self = shift;
 155     my $file = $self->{table} ne '' ? $self->{table} : $KeyFile;
 156
 157     open my $fk, "<$Path/$file"
 158         or croak "File does not exist at $Path/$file";
 159
 160     while (<$fk>) {
 161         next if /^\s*#/;
 162         if (/^\s*\@/) {
 163             if (/^\@version\s*(\S*)/) {
 164                 $self->{version} ||= $1;
 165             }
 166             elsif (/^\@alternate\s+(.*)/) {
 167                 $self->{alternate} ||= $1;
 168             }
 169             elsif (/^\@backwards\s+(.*)/) {
 170                 push @{ $self->{backwards} }, $1;
 171             }
 172             elsif (/^\@rearrange\s+(.*)/) {
 173                 push @{ $self->{rearrange} }, _getHexArray($1);
 174             }
 175             next;
 176         }
 177         $self->parseEntry($_);
 178     }
 179     close $fk;
 180 }
 181
 182
 183 ##
 184 ## get $line, parse it, and write an entry in $self
 185 ##
 186 sub parseEntry
 187 {
 188     my $self = shift;
 189     my $line = shift;
 190     my($name, $ele, @key);
 191
 192     return if $line !~ /^\s*[0-9A-Fa-f]/;
 193
 194     # removes comment and gets name
 195     $name = $1
 196         if $line =~ s/[#%]\s*(.*)//;
 197     return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
 198
 199     # gets element
 200     my($e, $k) = split /;/, $line;
 201     croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
 202         if ! $k;
 203
 204     my @e = _getHexArray($e);
 205     $ele = pack('U*', @e);
 206     return if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
 207
 208     # get sort key
 209     if (defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/ ||
 210         defined $self->{ignoreChar} && $ele  =~ /$self->{ignoreChar}/)
 211     {
 212         $self->{entries}{$ele} = $self->{ignored}{$ele} = 1;
 213     }
 214     else {
 215         my $combining = 1; # primary = 0, secondary != 0;
 216
 217         foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
 218             my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
 219             push @key, $self->altCE($var, _getHexArray($arr));
 220             $combining = 0 unless $key[-1][0] == 0 && $key[-1][1] != 0;
 221         }
 222         $self->{entries}{$ele} = \@key;
 223         $self->{combining}{$ele} = 1 if $combining;
 224     }
 225     $self->{maxlength}{ord $ele} = scalar @e if @e > 1;
 226 }
 227
 228
 229 ##
 230 ## arrayref CE = altCE(bool variable?, list[num] weights)
 231 ##
 232 sub altCE
 233 {
 234     my $self = shift;
 235     my $var  = shift;
 236     my @c    = @_;
 237
 238     $self->{alternate} eq 'blanked' ?
 239         $var ? [0,0,0,$c[3]] : \@c :
 240     $self->{alternate} eq 'non-ignorable' ?
 241         \@c :
 242     $self->{alternate} eq 'shifted' ?
 243         $var ? [0,0,0,$c[0] ] : [ @c[0..2], $c[0]+$c[1]+$c[2] ? 0xFFFF : 0 ] :
 244     $self->{alternate} eq 'shift-trimmed' ?
 245         $var ? [0,0,0,$c[0] ] : [ @c[0..2], 0 ] :
 246         croak "$PACKAGE unknown alternate name: $self->{alternate}";
 247 }
 248
 249 ##
 250 ## string hex_sortkey = splitCE(string arg)
 251 ##
 252 sub viewSortKey
 253 {
 254     my $self = shift;
 255     my $key  = $self->getSortKey(@_);
 256     my $view = join " ", map sprintf("%04X", $_), unpack 'n*', $key;
 257     $view =~ s/ ?0000 ?/|/g;
 258     return "[$view]";
 259 }
 260
 261
 262 ##
 263 ## list[strings] elements = splitCE(string arg)
 264 ##
 265 sub splitCE
 266 {
 267     my $self = shift;
 268     my $code = $self->{preprocess};
 269     my $norm = $self->{UNF};
 270     my $ent  = $self->{entries};
 271     my $max  = $self->{maxlength};
 272     my $reH  = $self->{rearrangeHash};
 273
 274     my $str = ref $code ? &$code(shift) : shift;
 275     $str = &$norm($str) if ref $norm;
 276
 277     my @src = unpack('U*', $str);
 278     my @buf;
 279
 280     # rearrangement
 281     if ($reH) {
 282         for (my $i = 0; $i < @src; $i++) {
 283             if (exists $reH->{ $src[$i] } && $i + 1 < @src) {
 284                 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]);
 285                 $i++;
 286             }
 287         }
 288     }
 289
 290     for (my $i = 0; $i < @src; $i++) {
 291         my $ch;
 292         my $u = $src[$i];
 293
 294         # non-characters
 295         next unless defined $u;
 296         next if $u < 0 || 0x10FFFF < $u    # out of range
 297             || (0xD800 <= $u && $u <= 0xDFFF); # unpaired surrogates
 298         my $four = $u & 0xFFFF;
 299         next if $four == 0xFFFE || $four == 0xFFFF;
 300
 301         if ($max->{$u}) { # contract
 302             for (my $j = $max->{$u}; $j >= 1; $j--) {
 303                 next unless $i+$j-1 < @src;
 304                 $ch = pack 'U*', @src[$i .. $i+$j-1];
 305                 $i += $j-1, last if $ent->{$ch};
 306             }
 307         } else {
 308             $ch = pack('U', $u);
 309         }
 310
 311         # with Combining Char (UTS#10, 4.2.1), here requires Unicode::Normalize.
 312         if ($getCombinClass && defined $ch) {
 313             for (my $j = $i+1; $j < @src; $j++) {
 314                 next unless defined $src[$j];
 315                 last unless $getCombinClass->( $src[$j] );
 316                 my $comb = pack 'U', $src[$j];
 317                 next if ! $ent->{ $ch.$comb };
 318                 $ch .= $comb;
 319                 $src[$j] = undef;
 320             }
 321         }
 322         push @buf, $ch;
 323     }
 324     wantarray ? @buf : \@buf;
 325 }
 326
 327
 328 ##
 329 ## list[arrayrefs] weight = getWt(string element)
 330 ##
 331 sub getWt
 332 {
 333     my $self = shift;
 334     my $ch   = shift;
 335     my $ent  = $self->{entries};
 336     my $ign  = $self->{ignored};
 337     my $cjk  = $self->{overrideCJK};
 338     my $hang = $self->{overrideHangul};
 339
 340     return if !defined $ch || $ign->{$ch}; # ignored
 341     return @{ $ent->{$ch} } if $ent->{$ch};
 342     my $u = unpack('U', $ch);
 343
 344     if (0xAC00 <= $u && $u <= 0xD7A3) { # is_Hangul
 345         return $hang
 346             ? &$hang($u)
 347             : defined $hang
 348                 ? map({
 349                         my $v = $_;
 350                         my $ar = $ent->{pack('U', $v)};
 351                         $ar ? @$ar : map($self->altCE(0,@$_), _derivCE($v));
 352                     } _decompHangul($u))
 353                 : map($self->altCE(0,@$_), _derivCE($u));
 354     }
 355     elsif (0x3400 <= $u && $u <= 0x4DB5 ||
 356            0x4E00 <= $u && $u <= 0x9FA5 ||
 357            0x20000 <= $u && $u <= 0x2A6D6) { # is_CJK
 358         return $cjk
 359             ? &$cjk($u)
 360             : defined $cjk && $u <= 0xFFFF
 361                 ? $self->altCE(0, ($u, 0x20, 0x02, $u))
 362                 : map($self->altCE(0,@$_), _derivCE($u));
 363     }
 364     else {
 365         return map($self->altCE(0,@$_), _derivCE($u));
 366     }
 367 }
 368
 369 ##
 370 ## int = index(string, substring)
 371 ##
 372 sub index
 373 {
 374     my $self = shift;
 375     my $lev  = $self->{level};
 376     my $comb = $self->{combining};
 377     my $str  = $self->splitCE(shift);
 378     my $sub  = $self->splitCE(shift);
 379
 380     return wantarray ? (0,0) : 0 if ! @$sub;
 381     return wantarray ?  ()  : -1 if ! @$str;
 382
 383     my @subWt = grep _ignorableAtLevel($_,$lev),
 384                 map $self->getWt($_), @$sub;
 385
 386     my(@strWt,@strPt);
 387     my $count = 0;
 388     for (my $i = 0; $i < @$str; $i++) {
 389         my $go_ahead = 0;
 390
 391         my @tmp = grep _ignorableAtLevel($_,$lev), $self->getWt($str->[$i]);
 392         $go_ahead += length $str->[$i];
 393
 394         # /*XXX*/ still broken.
 395         # index("e\x{300}", "e") should be 'no match' at level 2 or higher
 396         # as "e\x{300}" is a *single* grapheme cluster and not equal to "e".
 397
 398         # go ahead as far as we find a combining character;
 399         while ($i + 1 < @$str &&
 400               (! defined $str->[$i+1] || $comb->{ $str->[$i+1] }) ) {
 401             $i++;
 402             $go_ahead += length $str->[$i];
 403             next if ! defined $str->[$i];
 404             push @tmp,
 405                 grep _ignorableAtLevel($_,$lev), $self->getWt($str->[$i]);
 406         }
 407
 408         push @strWt, @tmp;
 409         push @strPt, ($count) x @tmp;
 410         $count += $go_ahead;
 411
 412         while (@strWt >= @subWt) {
 413             if (_eqArray(\@strWt, \@subWt, $lev)) {
 414                 my $pos = $strPt[0];
 415                 return wantarray ? ($pos, $count-$pos) : $pos;
 416             }
 417             shift @strWt;
 418             shift @strPt;
 419         }
 420     }
 421     return wantarray ? () : -1;
 422 }
 423
 424 ##
 425 ## bool _eqArray(arrayref, arrayref, level)
 426 ##
 427 sub _eqArray($$$)
 428 {
 429     my $a   = shift; # length $a >= length $b;
 430     my $b   = shift;
 431     my $lev = shift;
 432     for my $v (0..$lev-1) {
 433         for my $c (0..@$b-1){
 434             return if $a->[$c][$v] != $b->[$c][$v];
 435         }
 436     }
 437     return 1;
 438 }
 439
 440
 441 ##
 442 ## bool _ignorableAtLevel(CE, level)
 443 ##
 444 sub _ignorableAtLevel($$)
 445 {
 446     my $ce = shift;
 447     return unless defined $ce;
 448     my $lv = shift;
 449     return ! grep { ! $ce->[$_] } 0..$lv-1;
 450 }
 451
 452
 453 ##
 454 ## string sortkey = getSortKey(string arg)
 455 ##
 456 sub getSortKey
 457 {
 458     my $self = shift;
 459     my $lev  = $self->{level};
 460     my $rCE  = $self->splitCE(shift); # get an arrayref
 461
 462     # weight arrays
 463     my @buf = grep defined(), map $self->getWt($_), @$rCE;
 464
 465     # make sort key
 466     my @ret = ([],[],[],[]);
 467     foreach my $v (0..$lev-1) {
 468         foreach my $b (@buf) {
 469             push @{ $ret[$v] }, $b->[$v] if $b->[$v];
 470         }
 471     }
 472     foreach (@{ $self->{backwards} }) {
 473         my $v = $_ - 1;
 474         @{ $ret[$v] } = reverse @{ $ret[$v] };
 475     }
 476
 477     # modification of tertiary weights
 478     if ($self->{upper_before_lower}) {
 479         foreach (@{ $ret[2] }) {
 480             if    (0x8 <= $_ && $_ <= 0xC) { $_ -= 6 } # lower
 481             elsif (0x2 <= $_ && $_ <= 0x6) { $_ += 6 } # upper
 482             elsif ($_ == 0x1C)             { $_ += 1 } # square upper
 483             elsif ($_ == 0x1D)             { $_ -= 1 } # square lower
 484         }
 485     }
 486     if ($self->{katakana_before_hiragana}) {
 487         foreach (@{ $ret[2] }) {
 488             if    (0x0F <= $_ && $_ <= 0x13) { $_ -= 2 } # katakana
 489             elsif (0x0D <= $_ && $_ <= 0x0E) { $_ += 5 } # hiragana
 490         }
 491     }
 492     join "\0\0", map pack('n*', @$_), @ret;
 493 }
 494
 495
 496 ##
 497 ## int compare = cmp(string a, string b)
 498 ##
 499 sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
 500 sub eq  { $_[0]->getSortKey($_[1]) eq  $_[0]->getSortKey($_[2]) }
 501 sub ne  { $_[0]->getSortKey($_[1]) ne  $_[0]->getSortKey($_[2]) }
 502 sub lt  { $_[0]->getSortKey($_[1]) lt  $_[0]->getSortKey($_[2]) }
 503 sub le  { $_[0]->getSortKey($_[1]) le  $_[0]->getSortKey($_[2]) }
 504 sub gt  { $_[0]->getSortKey($_[1]) gt  $_[0]->getSortKey($_[2]) }
 505 sub ge  { $_[0]->getSortKey($_[1]) ge  $_[0]->getSortKey($_[2]) }
 506
 507 ##
 508 ## list[strings] sorted = sort(list[strings] arg)
 509 ##
 510 sub sort {
 511     my $obj = shift;
 512     return
 513         map { $_->[1] }
 514             sort{ $a->[0] cmp $b->[0] }
 515                 map [ $obj->getSortKey($_), $_ ], @_;
 516 }
 517
 518 ##
 519 ## list[arrayrefs] CE = _derivCE(int codepoint)
 520 ##
 521 sub _derivCE {
 522     my $code = shift;
 523     my $a = UNDEFINED + ($code >> 15); # ok
 524     my $b = ($code & 0x7FFF) | 0x8000; # ok
 525 #   my $a = 0xFFC2 + ($code >> 15);    # ng
 526 #   my $b = $code & 0x7FFF | 0x1000;   # ng
 527     $b ? ([$a,2,1,$code],[$b,0,0,$code]) : [$a,2,1,$code];
 528 }
 529
 530 ##
 531 ## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
 532 ##
 533 sub _getHexArray { map hex, $_[0] =~ /([0-9a-fA-F]+)/g }
 534
 535 #
 536 # $code must be in Hangul syllable.
 537 # Check it before you enter here.
 538 #
 539 sub _decompHangul {
 540     my $code = shift;
 541     my $SIndex = $code - 0xAC00;
 542     my $LIndex = int( $SIndex / 588);
 543     my $VIndex = int(($SIndex % 588) / 28);
 544     my $TIndex =      $SIndex % 28;
 545     return (
 546         0x1100 + $LIndex,
 547         0x1161 + $VIndex,
 548         $TIndex ? (0x11A7 + $TIndex) : (),
 549     );
 550 }
 551
 552 1;
 553 __END__
 554
 555 =head1 NAME
 556
 557 Unicode::Collate - Unicode Collation Algorithm
 558
 559 =head1 SYNOPSIS
 560
 561   use Unicode::Collate;
 562
 563   #construct
 564   $Collator = Unicode::Collate->new(%tailoring);
 565
 566   #sort
 567   @sorted = $Collator->sort(@not_sorted);
 568
 569   #compare
 570   $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
 571
 572 =head1 DESCRIPTION
 573
 574 =head2 Constructor and Tailoring
 575
 576 The C<new> method returns a collator object.
 577
 578    $Collator = Unicode::Collate->new(
 579       alternate => $alternate,
 580       backwards => $levelNumber, # or \@levelNumbers
 581       entry => $element,
 582       normalization  => $normalization_form,
 583       ignoreName => qr/$ignoreName/,
 584       ignoreChar => qr/$ignoreChar/,
 585       katakana_before_hiragana => $bool,
 586       level => $collationLevel,
 587       overrideCJK => \&overrideCJK,
 588       overrideHangul => \&overrideHangul,
 589       preprocess => \&preprocess,
 590       rearrange => \@charList,
 591       table => $filename,
 592       undefName => qr/$undefName/,
 593       undefChar => qr/$undefChar/,
 594       upper_before_lower => $bool,
 595    );
 596    # if %tailoring is false (i.e. empty),
 597    # $Collator should do the default collation.
 598
 599 =over 4
 600
 601 =item alternate
 602
 603 -- see 3.2.2 Alternate Weighting, UTR #10.
 604
 605 This key allows to alternate weighting for variable collation elements,
 606 which are marked with an ASTERISK in the table
 607 (NOTE: Many punction marks and symbols are variable in F<allkeys.txt>).
 608
 609    alternate => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
 610
 611 These names are case-insensitive.
 612 By default (if specification is omitted), 'shifted' is adopted.
 613
 614    'Blanked'        Variable elements are ignorable at levels 1 through 3;
 615                     considered at the 4th level.
 616
 617    'Non-ignorable'  Variable elements are not reset to ignorable.
 618
 619    'Shifted'        Variable elements are ignorable at levels 1 through 3
 620                     their level 4 weight is replaced by the old level 1 weight.
 621                     Level 4 weight for Non-Variable elements is 0xFFFF.
 622
 623    'Shift-Trimmed'  Same as 'shifted', but all FFFF's at the 4th level
 624                     are trimmed.
 625
 626 =item backwards
 627
 628 -- see 3.1.2 French Accents, UTR #10.
 629
 630      backwards => $levelNumber or \@levelNumbers
 631
 632 Weights in reverse order; ex. level 2 (diacritic ordering) in French.
 633 If omitted, forwards at all the levels.
 634
 635 =item entry
 636
 637 -- see 3.1 Linguistic Features; 3.2.1 File Format, UTR #10.
 638
 639 Overrides a default order or defines additional collation elements
 640
 641   entry => <<'ENTRIES', # use the UCA file format
 642 00E6 ; [.0861.0020.0002.00E6] [.08B1.0020.0002.00E6] # ligature <ae> as <a><e>
 643 0063 0068 ; [.0893.0020.0002.0063]      # "ch" in traditional Spanish
 644 0043 0068 ; [.0893.0020.0008.0043]      # "Ch" in traditional Spanish
 645 ENTRIES
 646
 647 =item ignoreName
 648
 649 =item ignoreChar
 650
 651 -- see Completely Ignorable, 3.2.2 Alternate Weighting, UTR #10.
 652
 653 Makes the entry in the table ignorable.
 654 If a collation element is ignorable,
 655 it is ignored as if the element had been deleted from there.
 656
 657 E.g. when 'a' and 'e' are ignorable,
 658 'element' is equal to 'lament' (or 'lmnt').
 659
 660 =item level
 661
 662 -- see 4.3 Form a sort key for each string, UTR #10.
 663
 664 Set the maximum level.
 665 Any higher levels than the specified one are ignored.
 666
 667   Level 1: alphabetic ordering
 668   Level 2: diacritic ordering
 669   Level 3: case ordering
 670   Level 4: tie-breaking (e.g. in the case when alternate is 'shifted')
 671
 672   ex.level => 2,
 673
 674 If omitted, the maximum is the 4th.
 675
 676 =item normalization
 677
 678 -- see 4.1 Normalize each input string, UTR #10.
 679
 680 If specified, strings are normalized before preparation of sort keys
 681 (the normalization is executed after preprocess).
 682
 683 As a form name, one of the following names must be used.
 684
 685   'C'  or 'NFC'  for Normalization Form C
 686   'D'  or 'NFD'  for Normalization Form D
 687   'KC' or 'NFKC' for Normalization Form KC
 688   'KD' or 'NFKD' for Normalization Form KD
 689
 690 If omitted, the string is put into Normalization Form D.
 691
 692 If C<undef> is passed explicitly as the value for this key,
 693 any normalization is not carried out (this may make tailoring easier
 694 if any normalization is not desired).
 695
 696 see B<CAVEAT>.
 697
 698 =item overrideCJK
 699
 700 -- see 7.1 Derived Collation Elements, UTR #10.
 701
 702 By default, mapping of CJK Unified Ideographs
 703 uses the Unicode codepoint order.
 704 But the mapping of CJK Unified Ideographs may be overrided.
 705
 706 ex. CJK Unified Ideographs in the JIS code point order.
 707
 708   overrideCJK => sub {
 709       my $u = shift;             # get a Unicode codepoint
 710       my $b = pack('n', $u);     # to UTF-16BE
 711       my $s = your_unicode_to_sjis_converter($b); # convert
 712       my $n = unpack('n', $s);   # convert sjis to short
 713       [ $n, 0x20, 0x2, $u ];     # return the collation element
 714   },
 715
 716 ex. ignores all CJK Unified Ideographs.
 717
 718   overrideCJK => sub {()}, # CODEREF returning empty list
 719
 720    # where ->eq("Pe\x{4E00}rl", "Perl") is true
 721    # as U+4E00 is a CJK Unified Ideograph and to be ignorable.
 722
 723 If C<undef> is passed explicitly as the value for this key,
 724 weights for CJK Unified Ideographs are treated as undefined.
 725 But assignment of weight for CJK Unified Ideographs
 726 in table or L<entry> is still valid.
 727
 728 =item overrideHangul
 729
 730 -- see 7.1 Derived Collation Elements, UTR #10.
 731
 732 By default, Hangul Syllables are decomposed into Hangul Jamo.
 733 But the mapping of Hangul Syllables may be overrided.
 734
 735 This tag works like L<overrideCJK>, so see there for examples.
 736
 737 If you want to override the mapping of Hangul Syllables,
 738 the Normalization Forms D and KD are not appropriate
 739 (they will be decomposed before overriding).
 740
 741 If C<undef> is passed explicitly as the value for this key,
 742 weight for Hangul Syllables is treated as undefined
 743 without decomposition into Hangul Jamo.
 744 But definition of weight for Hangul Syllables
 745 in table or L<entry> is still valid.
 746
 747 =item preprocess
 748
 749 -- see 5.1 Preprocessing, UTR #10.
 750
 751 If specified, the coderef is used to preprocess
 752 before the formation of sort keys.
 753
 754 ex. dropping English articles, such as "a" or "the".
 755 Then, "the pen" is before "a pencil".
 756
 757      preprocess => sub {
 758            my $str = shift;
 759            $str =~ s/\b(?:an?|the)\s+//gi;
 760            $str;
 761         },
 762
 763 =item rearrange
 764
 765 -- see 3.1.3 Rearrangement, UTR #10.
 766
 767 Characters that are not coded in logical order and to be rearranged.
 768 By default,
 769
 770     rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
 771
 772 If you want to disallow any rearrangement,
 773 pass C<undef> or C<[]> (a reference to an empty list)
 774 as the value for this key.
 775
 776 =item table
 777
 778 -- see 3.2 Default Unicode Collation Element Table, UTR #10.
 779
 780 You can use another element table if desired.
 781 The table file must be in your C<lib/Unicode/Collate> directory.
 782
 783 By default, the file C<lib/Unicode/Collate/allkeys.txt> is used.
 784
 785 If C<undef> is passed explicitly as the value for this key,
 786 no file is read (but you can define collation elements via L<entry>).
 787
 788 A typical way to define a collation element table
 789 without any file of table:
 790
 791    $onlyABC = Unicode::Collate->new(
 792        table => undef,
 793        entry => << 'ENTRIES',
 794 0061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A
 795 0041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A
 796 0062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B
 797 0042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B
 798 0063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C
 799 0043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C
 800 ENTRIES
 801     );
 802
 803 =item undefName
 804
 805 =item undefChar
 806
 807 -- see 6.3.4 Reducing the Repertoire, UTR #10.
 808
 809 Undefines the collation element as if it were unassigned in the table.
 810 This reduces the size of the table.
 811 If an unassigned character appears in the string to be collated,
 812 the sort key is made from its codepoint
 813 as a single-character collation element,
 814 as it is greater than any other assigned collation elements
 815 (in the codepoint order among the unassigned characters).
 816 But, it'd be better to ignore characters
 817 unfamiliar to you and maybe never used.
 818
 819 =item katakana_before_hiragana
 820
 821 =item upper_before_lower
 822
 823 -- see 6.6 Case Comparisons; 7.3.1 Tertiary Weight Table, UTR #10.
 824
 825 By default, lowercase is before uppercase
 826 and hiragana is before katakana.
 827
 828 If the tag is made true, this is reversed.
 829
 830 B<NOTE>: These tags simplemindedly assume
 831 any lowercase/uppercase or hiragana/katakana distinctions
 832 should occur in level 3, and their weights at level 3
 833 should be same as those mentioned in 7.3.1, UTR #10.
 834 If you define your collation elements which violates this,
 835 these tags doesn't work validly.
 836
 837 =back
 838
 839 =head2 Methods for Collation
 840
 841 =over 4
 842
 843 =item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
 844
 845 Sorts a list of strings.
 846
 847 =item C<$result = $Collator-E<gt>cmp($a, $b)>
 848
 849 Returns 1 (when C<$a> is greater than C<$b>)
 850 or 0 (when C<$a> is equal to C<$b>)
 851 or -1 (when C<$a> is lesser than C<$b>).
 852
 853 =item C<$result = $Collator-E<gt>eq($a, $b)>
 854
 855 =item C<$result = $Collator-E<gt>ne($a, $b)>
 856
 857 =item C<$result = $Collator-E<gt>lt($a, $b)>
 858
 859 =item C<$result = $Collator-E<gt>le($a, $b)>
 860
 861 =item C<$result = $Collator-E<gt>gt($a, $b)>
 862
 863 =item C<$result = $Collator-E<gt>ge($a, $b)>
 864
 865 They works like the same name operators as theirs.
 866
 867    eq : whether $a is equal to $b.
 868    ne : whether $a is not equal to $b.
 869    lt : whether $a is lesser than $b.
 870    le : whether $a is lesser than $b or equal to $b.
 871    gt : whether $a is greater than $b.
 872    ge : whether $a is greater than $b or equal to $b.
 873
 874 =item C<$sortKey = $Collator-E<gt>getSortKey($string)>
 875
 876 -- see 4.3 Form a sort key for each string, UTR #10.
 877
 878 Returns a sort key.
 879
 880 You compare the sort keys using a binary comparison
 881 and get the result of the comparison of the strings using UCA.
 882
 883    $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
 884
 885       is equivalent to
 886
 887    $Collator->cmp($a, $b)
 888
 889 =item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)>
 890
 891 Returns a string formalized to display a sort key.
 892 Weights are enclosed with C<'['> and C<']'>
 893 and level boundaries are denoted by C<'|'>.
 894
 895    use Unicode::Collate;
 896    my $c = Unicode::Collate->new();
 897    print $c->viewSortKey("Perl"),"\n";
 898
 899     # output:
 900     # [09B3 08B1 09CB 094F|0020 0020 0020 0020|0008 0002 0002 0002|FFFF FFFF FFFF FFFF]
 901     #  Level 1             Level 2             Level 3             Level 4
 902
 903 =item C<$position = $Collator-E<gt>index($string, $substring)>
 904
 905 =item C<($position, $length) = $Collator-E<gt>index($string, $substring)>
 906
 907 -- see 6.8 Searching, UTR #10.
 908
 909 If C<$substring> matches a part of C<$string>, returns
 910 the position of the first occurrence of the matching part in scalar context;
 911 in list context, returns a two-element list of
 912 the position and the length of the matching part.
 913
 914 B<Notice> that the length of the matching part may differ from
 915 the length of C<$substring>.
 916
 917 B<Note> that the position and the length are counted on the string
 918 after the process of preprocess, normalization, and rearrangement.
 919 Therefore, in case the specified string is not binary equal to
 920 the preprocessed/normalized/rearranged string, the position and the length
 921 may differ form those on the specified string. But it is guaranteed
 922 that, if matched, it returns a non-negative value as C<$position>.
 923
 924 If C<$substring> does not match any part of C<$string>,
 925 returns C<-1> in scalar context and
 926 an empty list in list context.
 927
 928 e.g. you say
 929
 930   my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
 931   my $str = "Ich mu\x{00DF} studieren.";
 932   my $sub = "m\x{00FC}ss";
 933   my $match;
 934   if (my($pos,$len) = $Collator->index($str, $sub)) {
 935       $match = substr($str, $pos, $len);
 936   }
 937
 938 and get C<"mu\x{00DF}"> in C<$match> since C<"mu>E<223>C<">
 939 is primary equal to C<"m>E<252>C<ss">.
 940
 941 =back
 942
 943 =head2 Other Methods
 944
 945 =over 4
 946
 947 =item UCA_Version
 948
 949 Returns the version number of Unicode Technical Standard 10
 950 this module consults.
 951
 952 =item Base_Unicode_Version
 953
 954 Returns the version number of the Unicode Standard
 955 this module is based on.
 956
 957 =back
 958
 959 =head2 EXPORT
 960
 961 None by default.
 962
 963 =head2 TODO
 964
 965 Unicode::Collate has not been ported to EBCDIC.  The code mostly would
 966 work just fine but a decision needs to be made: how the module should
 967 work in EBCDIC?  Should the low 256 characters be understood as
 968 Unicode or as EBCDIC code points?  Should one be chosen or should
 969 there be a way to do either?  Or should such translation be left
 970 outside the module for the user to do, for example by using
 971 Encode::from_to()?
 972 (or utf8::unicode_to_native()/utf8::native_to_unicode()?)
 973
 974 =head2 CAVEAT
 975
 976 Use of the C<normalization> parameter requires
 977 the B<Unicode::Normalize> module.
 978
 979 If you need not it (say, in the case when you need not
 980 handle any combining characters),
 981 assign C<normalization =E<gt> undef> explicitly.
 982
 983 -- see 6.5 Avoiding Normalization, UTR #10.
 984
 985 =head2 BUGS
 986
 987 C<index()> is an experimental method and
 988 its return value may be unreliable.
 989 The correct implementation for C<index()> must be based
 990 on Locale-Sensitive Support: Level 3 in UTR #18,
 991 F<Unicode Regular Expression Guidelines>.
 992
 993 See also 4.2 Locale-Dependent Graphemes in UTR #18.
 994
 995 =head1 AUTHOR
 996
 997 SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
 998
 999   http://homepage1.nifty.com/nomenclator/perl/
1000
1001   Copyright(C) 2001-2002, SADAHIRO Tomoyuki. Japan. All rights reserved.
1002
1003   This library is free software; you can redistribute it
1004   and/or modify it under the same terms as Perl itself.
1005
1006 =head1 SEE ALSO
1007
1008 =over 4
1009
1010 =item http://www.unicode.org/unicode/reports/tr10/
1011
1012 Unicode Collation Algorithm - UTR #10
1013
1014 =item http://www.unicode.org/unicode/reports/tr10/allkeys.txt
1015
1016 The Default Unicode Collation Element Table
1017
1018 =item http://www.unicode.org/unicode/reports/tr15/
1019
1020 Unicode Normalization Forms - UAX #15
1021
1022 =item http://www.unicode.org/unicode/reports/tr18
1023
1024 Unicode Regular Expression Guidelines - UTR #18
1025
1026 =item L<Unicode::Normalize>
1027
1028 =back
1029
1030 =cut