lib/Unicode/Collate.pm

   1 package Unicode::Collate;
   2
   3 BEGIN {
   4     unless ("A" eq pack('U', 0x41)) {
   5         die "Unicode::Collate cannot stringify a Unicode code point\n";
   6     }
   7 }
   8
   9 use 5.006;
  10 use strict;
  11 use warnings;
  12 use Carp;
  13 use File::Spec;
  14
  15 require Exporter;
  16
  17 our $VERSION = '0.28';
  18 our $PACKAGE = __PACKAGE__;
  19
  20 our @ISA = qw(Exporter);
  21
  22 our %EXPORT_TAGS = ();
  23 our @EXPORT_OK = ();
  24 our @EXPORT = ();
  25
  26 (our $Path = $INC{'Unicode/Collate.pm'}) =~ s/\.pm$//;
  27 our $KeyFile = "allkeys.txt";
  28
  29 our $UNICODE_VERSION;
  30
  31 eval { require Unicode::UCD };
  32
  33 unless ($@) {
  34     $UNICODE_VERSION = Unicode::UCD::UnicodeVersion();
  35 }
  36 else { # Perl 5.6.1
  37     my($f, $fh);
  38     foreach my $d (@INC) {
  39         $f = File::Spec->catfile($d, "unicode", "Unicode.301");
  40         if (open($fh, $f)) {
  41             $UNICODE_VERSION = '3.0.1';
  42             close $fh;
  43             last;
  44         }
  45     }
  46 }
  47
  48 # Perl's boolean
  49 use constant TRUE  => 1;
  50 use constant FALSE => "";
  51 use constant NOMATCHPOS => -1;
  52
  53 # A coderef to get combining class imported from Unicode::Normalize
  54 # (i.e. \&Unicode::Normalize::getCombinClass).
  55 # This is also used as a HAS_UNICODE_NORMALIZE flag.
  56 our $CVgetCombinClass;
  57
  58 # Supported Levels
  59 use constant MinLevel => 1;
  60 use constant MaxLevel => 4;
  61
  62 # Minimum weights at level 2 and 3, respectively
  63 use constant Min2Wt => 0x20;
  64 use constant Min3Wt => 0x02;
  65
  66 # Shifted weight at 4th level
  67 use constant Shift4Wt => 0xFFFF;
  68
  69 # Variable weight at 1st level.
  70 # This is a negative value but should be regarded as zero on collation.
  71 # This is for distinction of variable chars from level 3 ignorable chars.
  72 use constant Var1Wt => -1;
  73
  74
  75 # A boolean for Variable and 16-bit weights at 4 levels of Collation Element
  76 # PROBLEM: The Default Unicode Collation Element Table
  77 # has weights over 0xFFFF at the 4th level.
  78 # The tie-breaking in the variable weights
  79 # other than "shift" (as well as "shift-trimmed") is unreliable.
  80 use constant VCE_TEMPLATE => 'Cn4';
  81
  82 # A sort key: 16-bit weights
  83 # See also the PROBLEM on VCE_TEMPLATE above.
  84 use constant KEY_TEMPLATE => 'n*';
  85
  86 # Level separator in a sort key:
  87 # i.e. pack(KEY_TEMPLATE, 0)
  88 use constant LEVEL_SEP => "\0\0";
  89
  90 # As Unicode code point separator for hash keys.
  91 # A joined code point string (denoted by JCPS below)
  92 # like "65;768" is used for internal processing
  93 # instead of Perl's Unicode string like "\x41\x{300}",
  94 # as the native code point is different from the Unicode code point
  95 # on EBCDIC platform.
  96 # This character must not be included in any stringified
  97 # representation of an integer.
  98 use constant CODE_SEP => ';';
  99
 100 # boolean values of variable weights
 101 use constant NON_VAR => 0; # Non-Variable character
 102 use constant VAR     => 1; # Variable character
 103
 104 # Logical_Order_Exception in PropList.txt
 105 # TODO: synchronization with change of PropList.txt.
 106 our $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
 107
 108 sub UCA_Version { "9" }
 109
 110 sub Base_Unicode_Version { $UNICODE_VERSION || 'unknown' }
 111
 112 ######
 113
 114 sub pack_U {
 115     return pack('U*', @_);
 116 }
 117
 118 sub unpack_U {
 119     return unpack('U*', pack('U*').shift);
 120 }
 121
 122 ######
 123
 124 my (%AlternateOK);
 125 @AlternateOK{ qw/
 126     blanked  non-ignorable  shifted  shift-trimmed
 127   / } = ();
 128
 129 our @ChangeOK = qw/
 130     alternate backwards level normalization rearrange
 131     katakana_before_hiragana upper_before_lower
 132     overrideHangul overrideCJK preprocess UCA_Version
 133   /;
 134
 135 our @ChangeNG = qw/
 136     entry entries table maxlength
 137     ignoreChar ignoreName undefChar undefName
 138     versionTable alternateTable backwardsTable forwardsTable rearrangeTable
 139     derivCode normCode rearrangeHash L3_ignorable
 140     backwardsFlag
 141   /;
 142 # The hash key 'ignored' is deleted at v 0.21.
 143 # The hash key 'isShift' is deleted at v 0.23.
 144 # The hash key 'combining' is deleted at v 0.24.
 145
 146 my (%ChangeOK, %ChangeNG);
 147 @ChangeOK{ @ChangeOK } = ();
 148 @ChangeNG{ @ChangeNG } = ();
 149
 150 sub change {
 151     my $self = shift;
 152     my %hash = @_;
 153     my %old;
 154     foreach my $k (keys %hash) {
 155         if (exists $ChangeOK{$k}) {
 156             $old{$k} = $self->{$k};
 157             $self->{$k} = $hash{$k};
 158         }
 159         elsif (exists $ChangeNG{$k}) {
 160             croak "change of $k via change() is not allowed!";
 161         }
 162         # else => ignored
 163     }
 164     $self->checkCollator;
 165     return wantarray ? %old : $self;
 166 }
 167
 168 sub _checkLevel {
 169     my $level = shift;
 170     my $key   = shift;
 171     croak sprintf "Illegal level %d (in \$self->{%s}) lower than %d.",
 172         $level, $key, MinLevel if MinLevel > $level;
 173     croak sprintf "Unsupported level %d (in \$self->{%s}) higher than %d ",
 174         $level, $key, MaxLevel if MaxLevel < $level;
 175 }
 176
 177 sub checkCollator {
 178     my $self = shift;
 179     _checkLevel($self->{level}, "level");
 180
 181     $self->{derivCode} =
 182         $self->{UCA_Version} ==  8 ? \&_derivCE_8 :
 183         $self->{UCA_Version} ==  9 ? \&_derivCE_9 :
 184       croak "Illegal UCA version (passed $self->{UCA_Version}).";
 185
 186     $self->{alternate} = lc($self->{alternate});
 187     croak "$PACKAGE unknown alternate tag name: $self->{alternate}"
 188         unless exists $AlternateOK{ $self->{alternate} };
 189
 190     if (! defined $self->{backwards}) {
 191         $self->{backwardsFlag} = 0;
 192     }
 193     elsif (! ref $self->{backwards}) {
 194         _checkLevel($self->{backwards}, "backwards");
 195         $self->{backwardsFlag} = 1 << $self->{backwards};
 196     }
 197     else {
 198         my %level;
 199         $self->{backwardsFlag} = 0;
 200         for my $b (@{ $self->{backwards} }) {
 201             _checkLevel($b, "backwards");
 202             $level{$b} = 1;
 203         }
 204         for my $v (sort keys %level) {
 205             $self->{backwardsFlag} += 1 << $v;
 206         }
 207     }
 208
 209     $self->{rearrange} = []
 210         if ! defined $self->{rearrange};
 211     croak "$PACKAGE: A list for rearrangement must be store in an ARRAYREF"
 212         if ! ref $self->{rearrange};
 213
 214     # keys of $self->{rearrangeHash} are $self->{rearrange}.
 215     $self->{rearrangeHash} = undef;
 216
 217     if (@{ $self->{rearrange} }) {
 218         @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
 219     }
 220
 221     $self->{normCode} = undef;
 222
 223     if (defined $self->{normalization}) {
 224         eval { require Unicode::Normalize };
 225         croak "Unicode/Normalize.pm is required to normalize strings: $@"
 226             if $@;
 227
 228         $CVgetCombinClass = \&Unicode::Normalize::getCombinClass
 229             if ! $CVgetCombinClass;
 230
 231         if ($self->{normalization} ne 'prenormalized') {
 232             my $norm = $self->{normalization};
 233             $self->{normCode} = sub {
 234                 Unicode::Normalize::normalize($norm, shift);
 235             };
 236             eval { $self->{normCode}->("") }; # try
 237             $@ and croak "$PACKAGE unknown normalization form name: $norm";
 238         }
 239     }
 240     return;
 241 }
 242
 243 sub new
 244 {
 245     my $class = shift;
 246     my $self = bless { @_ }, $class;
 247
 248     # If undef is passed explicitly, no file is read.
 249     $self->{table} = $KeyFile if ! exists $self->{table};
 250     $self->read_table if defined $self->{table};
 251
 252     if ($self->{entry}) {
 253         $self->parseEntry($_) foreach split /\n/, $self->{entry};
 254     }
 255
 256     $self->{level} ||= MaxLevel;
 257     $self->{UCA_Version} ||= UCA_Version();
 258
 259     $self->{overrideHangul} = ''
 260         if ! exists $self->{overrideHangul};
 261     $self->{overrideCJK} = ''
 262         if ! exists $self->{overrideCJK};
 263     $self->{normalization} = 'NFD'
 264         if ! exists $self->{normalization};
 265     $self->{alternate} = $self->{alternateTable} || 'shifted'
 266         if ! exists $self->{alternate};
 267     $self->{rearrange} = $self->{rearrangeTable} || $DefaultRearrange
 268         if ! exists $self->{rearrange};
 269     $self->{backwards} = $self->{backwardsTable}
 270         if ! exists $self->{backwards};
 271
 272     $self->checkCollator;
 273
 274     return $self;
 275 }
 276
 277 sub read_table {
 278     my $self = shift;
 279     my $file = $self->{table} ne '' ? $self->{table} : $KeyFile;
 280
 281     my $filepath = File::Spec->catfile($Path, $file);
 282     open my $fk, "<$filepath"
 283         or croak "File does not exist at $filepath";
 284
 285     while (<$fk>) {
 286         next if /^\s*#/;
 287         if (/^\s*\@/) {
 288             if    (/^\s*\@version\s*(\S*)/) {
 289                 $self->{versionTable} ||= $1;
 290             }
 291             elsif (/^\s*\@alternate\s+(\S*)/) {
 292                 $self->{alternateTable} ||= $1;
 293             }
 294             elsif (/^\s*\@backwards\s+(\S*)/) {
 295                 push @{ $self->{backwardsTable} }, $1;
 296             }
 297             elsif (/^\s*\@forwards\s+(\S*)/) { # parhaps no use
 298                 push @{ $self->{forwardsTable} }, $1;
 299             }
 300             elsif (/^\s*\@rearrange\s+(.*)/) { # (\S*) is NG
 301                 push @{ $self->{rearrangeTable} }, _getHexArray($1);
 302             }
 303             next;
 304         }
 305         $self->parseEntry($_);
 306     }
 307     close $fk;
 308 }
 309
 310
 311 ##
 312 ## get $line, parse it, and write an entry in $self
 313 ##
 314 sub parseEntry
 315 {
 316     my $self = shift;
 317     my $line = shift;
 318     my($name, $entry, @uv, @key);
 319
 320     return if $line !~ /^\s*[0-9A-Fa-f]/;
 321
 322     # removes comment and gets name
 323     $name = $1
 324         if $line =~ s/[#%]\s*(.*)//;
 325     return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
 326
 327     # gets element
 328     my($e, $k) = split /;/, $line;
 329     croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
 330         if ! $k;
 331
 332     @uv = _getHexArray($e);
 333     return if !@uv;
 334
 335     $entry = join(CODE_SEP, @uv); # in JCPS
 336
 337     if (defined $self->{undefChar} || defined $self->{ignoreChar}) {
 338         my $ele = pack_U(@uv);
 339
 340         # regarded as if it were not entried in the table
 341         return
 342             if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
 343
 344         # replaced as completely ignorable
 345         $k = '[.0000.0000.0000.0000]'
 346             if defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/;
 347     }
 348
 349     # replaced as completely ignorable
 350     $k = '[.0000.0000.0000.0000]'
 351         if defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/;
 352
 353     my $is_L3_ignorable = TRUE;
 354
 355     foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
 356         my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
 357         my @wt = _getHexArray($arr);
 358         push @key, pack(VCE_TEMPLATE, $var, @wt);
 359         $is_L3_ignorable = FALSE
 360             if $wt[0] + $wt[1] + $wt[2] != 0;
 361           # if $arr !~ /[1-9A-Fa-f]/; NG
 362           # Conformance Test shows L3-ignorable is completely ignorable.
 363         # For expansion, an entry $is_L3_ignorable
 364         # if and only if "all" CEs are [.0000.0000.0000].
 365     }
 366
 367     $self->{entries}{$entry} = \@key;
 368
 369     $self->{L3_ignorable}{$uv[0]} = TRUE
 370         if @uv == 1 && $is_L3_ignorable;
 371
 372     # Contraction is to be considered in the range of this maxlength.
 373     $self->{maxlength}{$uv[0]} = scalar @uv
 374         if @uv > 1;
 375 }
 376
 377
 378 ##
 379 ## arrayref[weights] = altCE(VCE)
 380 ##
 381 sub altCE
 382 {
 383     my $self = shift;
 384     my($var, @wt) = unpack(VCE_TEMPLATE, shift);
 385
 386     $self->{alternate} eq 'blanked' ?
 387         $var ? [Var1Wt, 0, 0, $wt[3]] : \@wt :
 388     $self->{alternate} eq 'non-ignorable' ?
 389         \@wt :
 390     $self->{alternate} eq 'shifted' ?
 391         $var ? [Var1Wt, 0, 0, $wt[0] ]
 392              : [ @wt[0..2], $wt[0]+$wt[1]+$wt[2] ? Shift4Wt : 0 ] :
 393     $self->{alternate} eq 'shift-trimmed' ?
 394         $var ? [Var1Wt, 0, 0, $wt[0] ] : [ @wt[0..2], 0 ] :
 395         croak "$PACKAGE unknown alternate name: $self->{alternate}";
 396 }
 397
 398 sub viewSortKey
 399 {
 400     my $self = shift;
 401     $self->visualizeSortKey($self->getSortKey(@_));
 402 }
 403
 404 sub visualizeSortKey
 405 {
 406     my $self = shift;
 407     my $view = join " ", map sprintf("%04X", $_), unpack(KEY_TEMPLATE, shift);
 408
 409     if ($self->{UCA_Version} <= 8) {
 410         $view =~ s/ ?0000 ?/|/g;
 411     } else {
 412         $view =~ s/\b0000\b/|/g;
 413     }
 414     return "[$view]";
 415 }
 416
 417
 418 ##
 419 ## arrayref of JCPS   = splitCE(string to be collated)
 420 ## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitCE(string, true)
 421 ##
 422 sub splitCE
 423 {
 424     my $self = shift;
 425     my $wLen = $_[1];
 426
 427     my $code = $self->{preprocess};
 428     my $norm = $self->{normCode};
 429     my $ent  = $self->{entries};
 430     my $max  = $self->{maxlength};
 431     my $reH  = $self->{rearrangeHash};
 432     my $ign  = $self->{L3_ignorable};
 433     my $ver9 = $self->{UCA_Version} > 8;
 434
 435     my ($str, @buf);
 436
 437     if ($wLen) {
 438         $code and croak "Preprocess breaks character positions. "
 439                         . "Don't use with index(), match(), etc.";
 440         $norm and croak "Normalization breaks character positions. "
 441                         . "Don't use with index(), match(), etc.";
 442         $str = $_[0];
 443     }
 444     else {
 445         $str = $_[0];
 446         $str = &$code($str) if ref $code;
 447         $str = &$norm($str) if ref $norm;
 448     }
 449
 450     # get array of Unicode code point of string.
 451     my @src = unpack_U($str);
 452
 453     # rearrangement:
 454     # Character positions are not kept if rearranged,
 455     # then neglected if $wLen is true.
 456     if ($reH && ! $wLen) {
 457         for (my $i = 0; $i < @src; $i++) {
 458             if (exists $reH->{ $src[$i] } && $i + 1 < @src) {
 459                 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]);
 460                 $i++;
 461             }
 462         }
 463     }
 464
 465     if ($ver9) {
 466         # To remove a character marked as a completely ignorable.
 467         for (my $i = 0; $i < @src; $i++) {
 468             $src[$i] = undef if $ign->{ $src[$i] };
 469         }
 470     }
 471
 472     for (my $i = 0; $i < @src; $i++) {
 473         next if _isNonCharacter($src[$i]);
 474
 475         my $i_orig = $i;
 476         my $ce = $src[$i];
 477
 478         if ($max->{$ce}) { # contract
 479             my $temp_ce = $ce;
 480             my $ceLen = 1;
 481             my $maxLen = $max->{$ce};
 482
 483             for (my $p = $i + 1; $ceLen < $maxLen && $p < @src; $p++) {
 484                 next if ! defined $src[$p];
 485                 $temp_ce .= CODE_SEP . $src[$p];
 486                 $ceLen++;
 487                 if ($ent->{$temp_ce}) {
 488                     $ce = $temp_ce;
 489                     $i = $p;
 490                 }
 491             }
 492
 493         # not-contiguous contraction with Combining Char (cf. UTS#10, S2.1).
 494         # This process requires Unicode::Normalize.
 495         # If "normalize" is undef, here should be skipped *always*
 496         # (in spite of bool value of $CVgetCombinClass),
 497         # since canonical ordering cannot be expected.
 498         # Blocked combining character should not be contracted.
 499
 500             if ($self->{normalization})
 501             # $self->{normCode} is false in the case of "prenormalized".
 502             {
 503                 my $preCC = 0;
 504                 my $curCC = 0;
 505
 506                 for (my $p = $i + 1; $p < @src; $p++) {
 507                     next if ! defined $src[$p];
 508                     $curCC = $CVgetCombinClass->($src[$p]);
 509                     last unless $curCC;
 510                     my $tail = CODE_SEP . $src[$p];
 511                     if ($preCC != $curCC && $ent->{$ce.$tail}) {
 512                         $ce .= $tail;
 513                         $src[$p] = undef;
 514                     } else {
 515                         $preCC = $curCC;
 516                     }
 517                 }
 518             }
 519         }
 520
 521         if ($wLen) {
 522             for (my $p = $i + 1; $p < @src; $p++) {
 523                 last if defined $src[$p];
 524                 $i = $p;
 525             }
 526         }
 527
 528         push @buf, $wLen ? [$ce, $i_orig, $i + 1] : $ce;
 529     }
 530     return \@buf;
 531 }
 532
 533
 534 ##
 535 ## list of arrayrefs of weights = getWt(JCPS)
 536 ##
 537 sub getWt
 538 {
 539     my $self = shift;
 540     my $ce   = shift;
 541     my $ent  = $self->{entries};
 542     my $der  = $self->{derivCode};
 543
 544     return if !defined $ce;
 545     return map($self->altCE($_), @{ $ent->{$ce} })
 546         if $ent->{$ce};
 547
 548     # CE must not be a contraction, then it's a code point.
 549     my $u = $ce;
 550
 551     if (0xAC00 <= $u && $u <= 0xD7A3) { # is Hangul Syllale
 552         my $hang = $self->{overrideHangul};
 553         my @hangulCE;
 554         if ($hang) {
 555             @hangulCE = map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$hang($u));
 556         }
 557         elsif (!defined $hang) {
 558             @hangulCE = $der->($u);
 559         }
 560         else {
 561             my $max  = $self->{maxlength};
 562             my @decH = _decompHangul($u);
 563
 564             if (@decH == 2) {
 565                 my $contract = join(CODE_SEP, @decH);
 566                 @decH = ($contract) if $ent->{$contract};
 567             } else { # must be <@decH == 3>
 568                 if ($max->{$decH[0]}) {
 569                     my $contract = join(CODE_SEP, @decH);
 570                     if ($ent->{$contract}) {
 571                         @decH = ($contract);
 572                     } else {
 573                         $contract = join(CODE_SEP, @decH[0,1]);
 574                         $ent->{$contract} and @decH = ($contract, $decH[2]);
 575                     }
 576                     # even if V's ignorable, LT contraction is not supported.
 577                     # If such a situatution were required, NFD should be used.
 578                 }
 579                 if (@decH == 3 && $max->{$decH[1]}) {
 580                     my $contract = join(CODE_SEP, @decH[1,2]);
 581                     $ent->{$contract} and @decH = ($decH[0], $contract);
 582                 }
 583             }
 584
 585             @hangulCE = map({
 586                     $ent->{$_} ? @{ $ent->{$_} } : $der->($_);
 587                 } @decH);
 588         }
 589         return map $self->altCE($_), @hangulCE;
 590     }
 591     elsif (0x3400 <= $u && $u <= 0x4DB5 ||
 592            0x4E00 <= $u && $u <= 0x9FA5 ||
 593            0x20000 <= $u && $u <= 0x2A6D6) # CJK Ideograph
 594     {
 595         my $cjk  = $self->{overrideCJK};
 596         return map $self->altCE($_),
 597             $cjk
 598                 ? map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$cjk($u))
 599                 : defined $cjk && $self->{UCA_Version} <= 8 && $u < 0x10000
 600                     ? pack(VCE_TEMPLATE, NON_VAR, $u, Min2Wt, Min3Wt, $u)
 601                     : $der->($u);
 602     }
 603     else {
 604         return map $self->altCE($_), $der->($u);
 605     }
 606 }
 607
 608
 609 ##
 610 ## string sortkey = getSortKey(string arg)
 611 ##
 612 sub getSortKey
 613 {
 614     my $self = shift;
 615     my $lev  = $self->{level};
 616     my $rCE  = $self->splitCE(shift); # get an arrayref of JCPS
 617     my $ver9 = $self->{UCA_Version} > 8;
 618     my $v2i  = $self->{alternate} ne 'non-ignorable';
 619
 620     # weight arrays
 621     my (@buf, $last_is_variable);
 622
 623     foreach my $wt (map $self->getWt($_), @$rCE) {
 624         if ($v2i && $ver9) {
 625             if ($wt->[0] == 0) { # ignorable
 626                 next if $last_is_variable;
 627             } else {
 628                 $last_is_variable = ($wt->[0] == Var1Wt);
 629             }
 630         }
 631         push @buf, $wt;
 632     }
 633
 634     # make sort key
 635     my @ret = ([],[],[],[]);
 636     foreach my $v (0..$lev-1) {
 637         foreach my $b (@buf) {
 638             push @{ $ret[$v] }, $b->[$v]
 639                 if 0 < $b->[$v];
 640         }
 641     }
 642
 643     # modification of tertiary weights
 644     if ($self->{upper_before_lower}) {
 645         foreach (@{ $ret[2] }) {
 646             if    (0x8 <= $_ && $_ <= 0xC) { $_ -= 6 } # lower
 647             elsif (0x2 <= $_ && $_ <= 0x6) { $_ += 6 } # upper
 648             elsif ($_ == 0x1C)             { $_ += 1 } # square upper
 649             elsif ($_ == 0x1D)             { $_ -= 1 } # square lower
 650         }
 651     }
 652     if ($self->{katakana_before_hiragana}) {
 653         foreach (@{ $ret[2] }) {
 654             if    (0x0F <= $_ && $_ <= 0x13) { $_ -= 2 } # katakana
 655             elsif (0x0D <= $_ && $_ <= 0x0E) { $_ += 5 } # hiragana
 656         }
 657     }
 658
 659     if ($self->{backwardsFlag}) {
 660         for (my $v = MinLevel; $v <= MaxLevel; $v++) {
 661             if ($self->{backwardsFlag} & (1 << $v)) {
 662                 @{ $ret[$v-1] } = reverse @{ $ret[$v-1] };
 663             }
 664         }
 665     }
 666
 667     join LEVEL_SEP, map pack(KEY_TEMPLATE, @$_), @ret;
 668 }
 669
 670
 671 ##
 672 ## int compare = cmp(string a, string b)
 673 ##
 674 sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
 675 sub eq  { $_[0]->getSortKey($_[1]) eq  $_[0]->getSortKey($_[2]) }
 676 sub ne  { $_[0]->getSortKey($_[1]) ne  $_[0]->getSortKey($_[2]) }
 677 sub lt  { $_[0]->getSortKey($_[1]) lt  $_[0]->getSortKey($_[2]) }
 678 sub le  { $_[0]->getSortKey($_[1]) le  $_[0]->getSortKey($_[2]) }
 679 sub gt  { $_[0]->getSortKey($_[1]) gt  $_[0]->getSortKey($_[2]) }
 680 sub ge  { $_[0]->getSortKey($_[1]) ge  $_[0]->getSortKey($_[2]) }
 681
 682 ##
 683 ## list[strings] sorted = sort(list[strings] arg)
 684 ##
 685 sub sort {
 686     my $obj = shift;
 687     return
 688         map { $_->[1] }
 689             sort{ $a->[0] cmp $b->[0] }
 690                 map [ $obj->getSortKey($_), $_ ], @_;
 691 }
 692
 693
 694 sub _derivCE_9 {
 695     my $u = shift;
 696     my $base =
 697         (0x4E00 <= $u && $u <= 0x9FA5)
 698             ? 0xFB40 : # CJK
 699         (0x3400 <= $u && $u <= 0x4DB5 || 0x20000 <= $u && $u <= 0x2A6D6)
 700             ? 0xFB80   # CJK ext.
 701             : 0xFBC0;  # others
 702
 703     my $aaaa = $base + ($u >> 15);
 704     my $bbbb = ($u & 0x7FFF) | 0x8000;
 705     return
 706         pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
 707         pack(VCE_TEMPLATE, NON_VAR, $bbbb,      0,      0, $u);
 708 }
 709
 710 sub _derivCE_8 {
 711     my $code = shift;
 712     my $aaaa =  0xFF80 + ($code >> 15);
 713     my $bbbb = ($code & 0x7FFF) | 0x8000;
 714     return
 715         pack(VCE_TEMPLATE, NON_VAR, $aaaa, 2, 1, $code),
 716         pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $code);
 717 }
 718
 719 ##
 720 ## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
 721 ##
 722 sub _getHexArray { map hex, $_[0] =~ /([0-9a-fA-F]+)/g }
 723
 724 #
 725 # $code *must* be in Hangul syllable.
 726 # Check it before you enter here.
 727 #
 728 sub _decompHangul {
 729     my $code = shift;
 730     my $SIndex = $code - 0xAC00;
 731     my $LIndex = int( $SIndex / 588);
 732     my $VIndex = int(($SIndex % 588) / 28);
 733     my $TIndex =      $SIndex % 28;
 734     return (
 735         0x1100 + $LIndex,
 736         0x1161 + $VIndex,
 737         $TIndex ? (0x11A7 + $TIndex) : (),
 738     );
 739 }
 740
 741 sub _isNonCharacter {
 742     my $code = shift;
 743     return ! defined $code                      # removed
 744         || ($code < 0 || 0x10FFFF < $code)      # out of range
 745         || (($code & 0xFFFE) == 0xFFFE)         # ??FFF[EF] (cf. utf8.c)
 746         || (0xD800 <= $code && $code <= 0xDFFF) # unpaired surrogates
 747         || (0xFDD0 <= $code && $code <= 0xFDEF) # other non-characters
 748     ;
 749 }
 750
 751
 752 ##
 753 ## bool _nonIgnorAtLevel(arrayref weights, int level)
 754 ##
 755 sub _nonIgnorAtLevel($$)
 756 {
 757     my $wt = shift;
 758     return if ! defined $wt;
 759     my $lv = shift;
 760     return grep($wt->[$_-1] != 0, MinLevel..$lv) ? TRUE : FALSE;
 761 }
 762
 763 ##
 764 ## bool _eqArray(
 765 ##    arrayref of arrayref[weights] source,
 766 ##    arrayref of arrayref[weights] substr,
 767 ##    int level)
 768 ## * comparison of graphemes vs graphemes.
 769 ##   @$source >= @$substr must be true (check it before call this);
 770 ##
 771 sub _eqArray($$$)
 772 {
 773     my $source = shift;
 774     my $substr = shift;
 775     my $lev = shift;
 776
 777     for my $g (0..@$substr-1){
 778         # Do the $g'th graphemes have the same number of AV weigths?
 779         return if @{ $source->[$g] } != @{ $substr->[$g] };
 780
 781         for my $w (0..@{ $substr->[$g] }-1) {
 782             for my $v (0..$lev-1) {
 783                 return if $source->[$g][$w][$v] != $substr->[$g][$w][$v];
 784             }
 785         }
 786     }
 787     return 1;
 788 }
 789
 790 ##
 791 ## (int position, int length)
 792 ## int position = index(string, substring, position, [undoc'ed grobal])
 793 ##
 794 ## With "grobal" (only for the list context),
 795 ##  returns list of arrayref[position, length].
 796 ##
 797 sub index
 798 {
 799     my $self  = shift;
 800     my $str   = shift;
 801     my $len   = length($str);
 802     my $subCE = $self->splitCE(shift);
 803     my $pos   = @_ ? shift : 0;
 804        $pos   = 0 if $pos < 0;
 805     my $grob  = shift;
 806
 807     my $lev   = $self->{level};
 808     my $ver9  = $self->{UCA_Version} > 8;
 809     my $v2i   = $self->{alternate} ne 'non-ignorable';
 810
 811     if (! @$subCE) {
 812         my $temp = $pos <= 0 ? 0 : $len <= $pos ? $len : $pos;
 813         return $grob
 814             ? map([$_, 0], $temp..$len)
 815             : wantarray ? ($temp,0) : $temp;
 816     }
 817     if ($len < $pos) {
 818         return wantarray ? () : NOMATCHPOS;
 819     }
 820     my $strCE = $self->splitCE($pos ? substr($str, $pos) : $str, TRUE);
 821     if (! @$strCE) {
 822         return wantarray ? () : NOMATCHPOS;
 823     }
 824     my $last_is_variable;
 825     my(@strWt, @iniPos, @finPos, @subWt, @g_ret);
 826
 827     $last_is_variable = FALSE;
 828     for my $wt (map $self->getWt($_), @$subCE) {
 829         my $to_be_pushed = _nonIgnorAtLevel($wt,$lev);
 830
 831         if ($v2i && $ver9) {
 832             if ($wt->[0] == 0) {
 833                 $to_be_pushed = FALSE if $last_is_variable;
 834             } else {
 835                 $last_is_variable = ($wt->[0] == Var1Wt);
 836             }
 837         }
 838
 839         if (@subWt && $wt->[0] == 0) {
 840             push @{ $subWt[-1] }, $wt if $to_be_pushed;
 841         } else {
 842             $wt->[0] = 0 if $wt->[0] == Var1Wt;
 843             push @subWt, [ $wt ];
 844         }
 845     }
 846
 847     my $count = 0;
 848     my $end = @$strCE - 1;
 849
 850     $last_is_variable = FALSE;
 851
 852     for (my $i = 0; $i <= $end; ) { # no $i++
 853         my $found_base = 0;
 854
 855         # fetch a grapheme
 856         while ($i <= $end && $found_base == 0) {
 857             for my $wt ($self->getWt($strCE->[$i][0])) {
 858                 my $to_be_pushed = _nonIgnorAtLevel($wt,$lev);
 859
 860                 if ($v2i && $ver9) {
 861                     if ($wt->[0] == 0) {
 862                         $to_be_pushed = FALSE if $last_is_variable;
 863                     } else {
 864                         $last_is_variable = ($wt->[0] == Var1Wt);
 865                     }
 866                 }
 867
 868                 if (@strWt && $wt->[0] == 0) {
 869                     push @{ $strWt[-1] }, $wt if $to_be_pushed;
 870                     $finPos[-1] = $strCE->[$i][2];
 871                 } elsif ($to_be_pushed) {
 872                     $wt->[0] = 0 if $wt->[0] == Var1Wt;
 873                     push @strWt,  [ $wt ];
 874                     push @iniPos, $found_base ? NOMATCHPOS : $strCE->[$i][1];
 875                     $finPos[-1] = NOMATCHPOS if $found_base;
 876                     push @finPos, $strCE->[$i][2];
 877                     $found_base++;
 878                 }
 879                 # else ===> no-op
 880             }
 881             $i++;
 882         }
 883
 884         # try to match
 885         while ( @strWt > @subWt || (@strWt == @subWt && $i > $end) ) {
 886             if ($iniPos[0] != NOMATCHPOS &&
 887                     $finPos[$#subWt] != NOMATCHPOS &&
 888                         _eqArray(\@strWt, \@subWt, $lev)) {
 889                 my $temp = $iniPos[0] + $pos;
 890
 891                 if ($grob) {
 892                     push @g_ret, [$temp, $finPos[$#subWt] - $iniPos[0]];
 893                     splice @strWt,  0, $#subWt;
 894                     splice @iniPos, 0, $#subWt;
 895                     splice @finPos, 0, $#subWt;
 896                 }
 897                 else {
 898                     return wantarray
 899                         ? ($temp, $finPos[$#subWt] - $iniPos[0])
 900                         :  $temp;
 901                 }
 902             }
 903             shift @strWt;
 904             shift @iniPos;
 905             shift @finPos;
 906         }
 907     }
 908
 909     return $grob
 910         ? @g_ret
 911         : wantarray ? () : NOMATCHPOS;
 912 }
 913
 914 ##
 915 ## scalarref to matching part = match(string, substring)
 916 ##
 917 sub match
 918 {
 919     my $self = shift;
 920     if (my($pos,$len) = $self->index($_[0], $_[1])) {
 921         my $temp = substr($_[0], $pos, $len);
 922         return wantarray ? $temp : \$temp;
 923         # An lvalue ref \substr should be avoided,
 924         # since its value is affected by modification of its referent.
 925     }
 926     else {
 927         return;
 928     }
 929 }
 930
 931 ##
 932 ## arrayref matching parts = gmatch(string, substring)
 933 ##
 934 sub gmatch
 935 {
 936     my $self = shift;
 937     my $str  = shift;
 938     my $sub  = shift;
 939     return map substr($str, $_->[0], $_->[1]),
 940                 $self->index($str, $sub, 0, 'g');
 941 }
 942
 943 ##
 944 ## bool subst'ed = subst(string, substring, replace)
 945 ##
 946 sub subst
 947 {
 948     my $self = shift;
 949     my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
 950
 951     if (my($pos,$len) = $self->index($_[0], $_[1])) {
 952         if ($code) {
 953             my $mat = substr($_[0], $pos, $len);
 954             substr($_[0], $pos, $len, $code->($mat));
 955         } else {
 956             substr($_[0], $pos, $len, $_[2]);
 957         }
 958         return TRUE;
 959     }
 960     else {
 961         return FALSE;
 962     }
 963 }
 964
 965 ##
 966 ## int count = gsubst(string, substring, replace)
 967 ##
 968 sub gsubst
 969 {
 970     my $self = shift;
 971     my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
 972     my $cnt = 0;
 973
 974     # Replacement is carried out from the end, then use reverse.
 975     for my $pos_len (reverse $self->index($_[0], $_[1], 0, 'g')) {
 976         if ($code) {
 977             my $mat = substr($_[0], $pos_len->[0], $pos_len->[1]);
 978             substr($_[0], $pos_len->[0], $pos_len->[1], $code->($mat));
 979         } else {
 980             substr($_[0], $pos_len->[0], $pos_len->[1], $_[2]);
 981         }
 982         $cnt++;
 983     }
 984     return $cnt;
 985 }
 986
 987 1;
 988 __END__
 989
 990 =head1 NAME
 991
 992 Unicode::Collate - Unicode Collation Algorithm
 993
 994 =head1 SYNOPSIS
 995
 996   use Unicode::Collate;
 997
 998   #construct
 999   $Collator = Unicode::Collate->new(%tailoring);
1000
1001   #sort
1002   @sorted = $Collator->sort(@not_sorted);
1003
1004   #compare
1005   $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
1006
1007 =head1 DESCRIPTION
1008
1009 This module is an implementation
1010 of Unicode Technical Standard #10 (UTS #10)
1011 "Unicode Collation Algorithm."
1012
1013 =head2 Constructor and Tailoring
1014
1015 The C<new> method returns a collator object.
1016
1017    $Collator = Unicode::Collate->new(
1018       UCA_Version => $UCA_Version,
1019       alternate => $alternate,
1020       backwards => $levelNumber, # or \@levelNumbers
1021       entry => $element,
1022       normalization  => $normalization_form,
1023       ignoreName => qr/$ignoreName/,
1024       ignoreChar => qr/$ignoreChar/,
1025       katakana_before_hiragana => $bool,
1026       level => $collationLevel,
1027       overrideCJK => \&overrideCJK,
1028       overrideHangul => \&overrideHangul,
1029       preprocess => \&preprocess,
1030       rearrange => \@charList,
1031       table => $filename,
1032       undefName => qr/$undefName/,
1033       undefChar => qr/$undefChar/,
1034       upper_before_lower => $bool,
1035    );
1036    # if %tailoring is false (i.e. empty),
1037    # $Collator should do the default collation.
1038
1039 =over 4
1040
1041 =item UCA_Version
1042
1043 If the version number of the older UCA is given,
1044 the older behavior of that version is emulated on collating.
1045 If omitted, the return value of C<UCA_Version()> is used.
1046
1047 The supported version: 8 or 9.
1048
1049 B<This parameter may be removed in the future version,
1050 as switching the algorithm would affect the performance.>
1051
1052 =item alternate
1053
1054 -- see 3.2.2 Variable Weighting, UTS #10.
1055
1056 (the title in UCA version 8: Alternate Weighting)
1057
1058 This key allows to alternate weighting for variable collation elements,
1059 which are marked with an ASTERISK in the table
1060 (NOTE: Many punction marks and symbols are variable in F<allkeys.txt>).
1061
1062    alternate => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
1063
1064 These names are case-insensitive.
1065 By default (if specification is omitted), 'shifted' is adopted.
1066
1067    'Blanked'        Variable elements are made ignorable at levels 1 through 3;
1068                     considered at the 4th level.
1069
1070    'Non-ignorable'  Variable elements are not reset to ignorable.
1071
1072    'Shifted'        Variable elements are made ignorable at levels 1 through 3
1073                     their level 4 weight is replaced by the old level 1 weight.
1074                     Level 4 weight for Non-Variable elements is 0xFFFF.
1075
1076    'Shift-Trimmed'  Same as 'shifted', but all FFFF's at the 4th level
1077                     are trimmed.
1078
1079 =item backwards
1080
1081 -- see 3.1.2 French Accents, UTS #10.
1082
1083      backwards => $levelNumber or \@levelNumbers
1084
1085 Weights in reverse order; ex. level 2 (diacritic ordering) in French.
1086 If omitted, forwards at all the levels.
1087
1088 =item entry
1089
1090 -- see 3.1 Linguistic Features; 3.2.1 File Format, UTS #10.
1091
1092 Overrides a default order or defines additional collation elements
1093
1094   entry => <<'ENTRIES', # use the UCA file format
1095 00E6 ; [.0861.0020.0002.00E6] [.08B1.0020.0002.00E6] # ligature <ae> as <a><e>
1096 0063 0068 ; [.0893.0020.0002.0063]      # "ch" in traditional Spanish
1097 0043 0068 ; [.0893.0020.0008.0043]      # "Ch" in traditional Spanish
1098 ENTRIES
1099
1100 B<NOTE:> The code point in the UCA file format (before C<';'>)
1101 B<must> be a Unicode code point, but not a native code point.
1102 So C<0063> must always denote C<U+0063>,
1103 but not a character of C<"\x63">.
1104
1105 =item ignoreName
1106
1107 =item ignoreChar
1108
1109 -- see Completely Ignorable, 3.2.2 Variable Weighting, UTS #10.
1110
1111 Makes the entry in the table completely ignorable;
1112 i.e. as if the weights were zero at all level.
1113
1114 E.g. when 'a' and 'e' are ignorable,
1115 'element' is equal to 'lament' (or 'lmnt').
1116
1117 =item level
1118
1119 -- see 4.3 Form a sort key for each string, UTS #10.
1120
1121 Set the maximum level.
1122 Any higher levels than the specified one are ignored.
1123
1124   Level 1: alphabetic ordering
1125   Level 2: diacritic ordering
1126   Level 3: case ordering
1127   Level 4: tie-breaking (e.g. in the case when alternate is 'shifted')
1128
1129   ex.level => 2,
1130
1131 If omitted, the maximum is the 4th.
1132
1133 =item normalization
1134
1135 -- see 4.1 Normalize each input string, UTS #10.
1136
1137 If specified, strings are normalized before preparation of sort keys
1138 (the normalization is executed after preprocess).
1139
1140 A form name C<Unicode::Normalize::normalize()> accepts will be applied
1141 as C<$normalization_form>.
1142 Acceptable names include C<'NFD'>, C<'NFC'>, C<'NFKD'>, and C<'NFKC'>.
1143 See C<Unicode::Normalize::normalize()> for detail.
1144 If omitted, C<'NFD'> is used.
1145
1146 L<normalization> is performed after L<preprocess> (if defined).
1147
1148 Furthermore, special values, C<undef> and C<"prenormalized">, can be used,
1149 though they are not concerned with C<Unicode::Normalize::normalize()>.
1150
1151 If C<undef> (not a string C<"undef">) is passed explicitly
1152 as the value for this key,
1153 any normalization is not carried out (this may make tailoring easier
1154 if any normalization is not desired).
1155 Under C<(normalization =E<gt> undef)>, only contiguous contractions
1156 are resolved; e.g. C<A-cedilla-ring> would be primary equal to C<A>,
1157 even if C<A-ring> (and C<A-ring-cedilla>) is ordered after C<Z>.
1158 In this point,
1159 C<(normalization =E<gt> undef, preprocess =E<gt> sub { NFD(shift) })>
1160 B<is not> equivalent to C<(normalization =E<gt> 'NFD')>.
1161
1162 In the case of C<(normalization =E<gt> "prenormalized")>,
1163 any normalization is not performed, but
1164 non-contiguous contractions with combining characters are performed.
1165 Therefore
1166 C<(normalization =E<gt> 'prenormalized', preprocess =E<gt> sub { NFD(shift) })>
1167 B<is> equivalent to C<(normalization =E<gt> 'NFD')>.
1168 If source strings are finely prenormalized,
1169 C<(normalization =E<gt> 'prenormalized')> may save time for normalization.
1170
1171 Except C<(normalization =E<gt> undef)>,
1172 B<Unicode::Normalize> is required (see also B<CAVEAT>).
1173
1174 =item overrideCJK
1175
1176 -- see 7.1 Derived Collation Elements, UTS #10.
1177
1178 By default, mapping of CJK Unified Ideographs
1179 uses the Unicode codepoint order.
1180 But the mapping of CJK Unified Ideographs may be overrided.
1181
1182 ex. CJK Unified Ideographs in the JIS code point order.
1183
1184   overrideCJK => sub {
1185       my $u = shift;             # get a Unicode codepoint
1186       my $b = pack('n', $u);     # to UTF-16BE
1187       my $s = your_unicode_to_sjis_converter($b); # convert
1188       my $n = unpack('n', $s);   # convert sjis to short
1189       [ $n, 0x20, 0x2, $u ];     # return the collation element
1190   },
1191
1192 ex. ignores all CJK Unified Ideographs.
1193
1194   overrideCJK => sub {()}, # CODEREF returning empty list
1195
1196    # where ->eq("Pe\x{4E00}rl", "Perl") is true
1197    # as U+4E00 is a CJK Unified Ideograph and to be ignorable.
1198
1199 If C<undef> is passed explicitly as the value for this key,
1200 weights for CJK Unified Ideographs are treated as undefined.
1201 But assignment of weight for CJK Unified Ideographs
1202 in table or L<entry> is still valid.
1203
1204 =item overrideHangul
1205
1206 -- see 7.1 Derived Collation Elements, UTS #10.
1207
1208 By default, Hangul Syllables are decomposed into Hangul Jamo.
1209 But the mapping of Hangul Syllables may be overrided.
1210
1211 This tag works like L<overrideCJK>, so see there for examples.
1212
1213 If you want to override the mapping of Hangul Syllables,
1214 the Normalization Forms D and KD are not appropriate
1215 (they will be decomposed before overriding).
1216
1217 If C<undef> is passed explicitly as the value for this key,
1218 weight for Hangul Syllables is treated as undefined
1219 without decomposition into Hangul Jamo.
1220 But definition of weight for Hangul Syllables
1221 in table or L<entry> is still valid.
1222
1223 =item preprocess
1224
1225 -- see 5.1 Preprocessing, UTS #10.
1226
1227 If specified, the coderef is used to preprocess
1228 before the formation of sort keys.
1229
1230 ex. dropping English articles, such as "a" or "the".
1231 Then, "the pen" is before "a pencil".
1232
1233      preprocess => sub {
1234            my $str = shift;
1235            $str =~ s/\b(?:an?|the)\s+//gi;
1236            return $str;
1237         },
1238
1239 L<preprocess> is performed before L<normalization> (if defined).
1240
1241 =item rearrange
1242
1243 -- see 3.1.3 Rearrangement, UTS #10.
1244
1245 Characters that are not coded in logical order and to be rearranged.
1246 By default,
1247
1248     rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
1249
1250 If you want to disallow any rearrangement,
1251 pass C<undef> or C<[]> (a reference to an empty list)
1252 as the value for this key.
1253
1254 B<According to the version 9 of UCA, this parameter shall not be used;
1255 but it is not warned at present.>
1256
1257 =item table
1258
1259 -- see 3.2 Default Unicode Collation Element Table, UTS #10.
1260
1261 You can use another element table if desired.
1262 The table file must be put into a directory
1263 where F<Unicode/Collate.pm> is installed.
1264 E.g. in F<perl/lib/Unicode/Collate> directory
1265 when you have F<perl/lib/Unicode/Collate.pm>.
1266
1267 By default, the filename F<"allkeys.txt"> is used.
1268
1269 If C<undef> is passed explicitly as the value for this key,
1270 no file is read (but you can define collation elements via L<entry>).
1271
1272 A typical way to define a collation element table
1273 without any file of table:
1274
1275    $onlyABC = Unicode::Collate->new(
1276        table => undef,
1277        entry => << 'ENTRIES',
1278 0061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A
1279 0041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A
1280 0062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B
1281 0042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B
1282 0063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C
1283 0043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C
1284 ENTRIES
1285     );
1286
1287 =item undefName
1288
1289 =item undefChar
1290
1291 -- see 6.3.4 Reducing the Repertoire, UTS #10.
1292
1293 Undefines the collation element as if it were unassigned in the table.
1294 This reduces the size of the table.
1295 If an unassigned character appears in the string to be collated,
1296 the sort key is made from its codepoint
1297 as a single-character collation element,
1298 as it is greater than any other assigned collation elements
1299 (in the codepoint order among the unassigned characters).
1300 But, it'd be better to ignore characters
1301 unfamiliar to you and maybe never used.
1302
1303 =item katakana_before_hiragana
1304
1305 =item upper_before_lower
1306
1307 -- see 6.6 Case Comparisons; 7.3.1 Tertiary Weight Table, UTS #10.
1308
1309 By default, lowercase is before uppercase
1310 and hiragana is before katakana.
1311
1312 If the tag is made true, this is reversed.
1313
1314 B<NOTE>: These tags simplemindedly assume
1315 any lowercase/uppercase or hiragana/katakana distinctions
1316 must occur in level 3, and their weights at level 3
1317 must be same as those mentioned in 7.3.1, UTS #10.
1318 If you define your collation elements which violate this requirement,
1319 these tags don't work validly.
1320
1321 =back
1322
1323 =head2 Methods for Collation
1324
1325 =over 4
1326
1327 =item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
1328
1329 Sorts a list of strings.
1330
1331 =item C<$result = $Collator-E<gt>cmp($a, $b)>
1332
1333 Returns 1 (when C<$a> is greater than C<$b>)
1334 or 0 (when C<$a> is equal to C<$b>)
1335 or -1 (when C<$a> is lesser than C<$b>).
1336
1337 =item C<$result = $Collator-E<gt>eq($a, $b)>
1338
1339 =item C<$result = $Collator-E<gt>ne($a, $b)>
1340
1341 =item C<$result = $Collator-E<gt>lt($a, $b)>
1342
1343 =item C<$result = $Collator-E<gt>le($a, $b)>
1344
1345 =item C<$result = $Collator-E<gt>gt($a, $b)>
1346
1347 =item C<$result = $Collator-E<gt>ge($a, $b)>
1348
1349 They works like the same name operators as theirs.
1350
1351    eq : whether $a is equal to $b.
1352    ne : whether $a is not equal to $b.
1353    lt : whether $a is lesser than $b.
1354    le : whether $a is lesser than $b or equal to $b.
1355    gt : whether $a is greater than $b.
1356    ge : whether $a is greater than $b or equal to $b.
1357
1358 =item C<$sortKey = $Collator-E<gt>getSortKey($string)>
1359
1360 -- see 4.3 Form a sort key for each string, UTS #10.
1361
1362 Returns a sort key.
1363
1364 You compare the sort keys using a binary comparison
1365 and get the result of the comparison of the strings using UCA.
1366
1367    $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
1368
1369       is equivalent to
1370
1371    $Collator->cmp($a, $b)
1372
1373 =item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)>
1374
1375    use Unicode::Collate;
1376    my $c = Unicode::Collate->new();
1377    print $c->viewSortKey("Perl"),"\n";
1378
1379    # output:
1380    # [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF]
1381    #  Level 1               Level 2               Level 3               Level 4
1382
1383     (If C<UCA_Version> is 8, the output is slightly different.)
1384
1385 =back
1386
1387 =head2 Methods for Searching
1388
1389 B<DISCLAIMER:> If C<preprocess> or C<normalization> tag is true
1390 for C<$Collator>, calling these methods (C<index>, C<match>, C<gmatch>,
1391 C<subst>, C<gsubst>) is croaked,
1392 as the position and the length might differ
1393 from those on the specified string.
1394 (And the C<rearrange> tag is neglected.)
1395
1396 The C<match>, C<gmatch>, C<subst>, C<gsubst> methods work
1397 like C<m//>, C<m//g>, C<s///>, C<s///g>, respectively,
1398 but they are not aware of any pattern, but only a literal substring.
1399
1400 =over 4
1401
1402 =item C<$position = $Collator-E<gt>index($string, $substring[, $position])>
1403
1404 =item C<($position, $length) = $Collator-E<gt>index($string, $substring[, $position])>
1405
1406 If C<$substring> matches a part of C<$string>, returns
1407 the position of the first occurrence of the matching part in scalar context;
1408 in list context, returns a two-element list of
1409 the position and the length of the matching part.
1410
1411 If C<$substring> does not match any part of C<$string>,
1412 returns C<-1> in scalar context and
1413 an empty list in list context.
1414
1415 e.g. you say
1416
1417   my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1418                                      # (normalization => undef) is REQUIRED.
1419   my $str = "Ich muß studieren Perl.";
1420   my $sub = "MÜSS";
1421   my $match;
1422   if (my($pos,$len) = $Collator->index($str, $sub)) {
1423       $match = substr($str, $pos, $len);
1424   }
1425
1426 and get C<"muß"> in C<$match> since C<"muß">
1427 is primary equal to C<"MÜSS">.
1428
1429 =item C<$match_ref = $Collator-E<gt>match($string, $substring)>
1430
1431 =item C<($match)   = $Collator-E<gt>match($string, $substring)>
1432
1433 If C<$substring> matches a part of C<$string>, in scalar context, returns
1434 B<a reference to> the first occurrence of the matching part
1435 (C<$match_ref> is always true if matches,
1436 since every reference is B<true>);
1437 in list context, returns the first occurrence of the matching part.
1438
1439 If C<$substring> does not match any part of C<$string>,
1440 returns C<undef> in scalar context and
1441 an empty list in list context.
1442
1443 e.g.
1444
1445     if ($match_ref = $Collator->match($str, $sub)) { # scalar context
1446         print "matches [$$match_ref].\n";
1447     } else {
1448         print "doesn't match.\n";
1449     }
1450
1451      or
1452
1453     if (($match) = $Collator->match($str, $sub)) { # list context
1454         print "matches [$match].\n";
1455     } else {
1456         print "doesn't match.\n";
1457     }
1458
1459 =item C<@match = $Collator-E<gt>gmatch($string, $substring)>
1460
1461 If C<$substring> matches a part of C<$string>, returns
1462 all the matching parts (or matching count in scalar context).
1463
1464 If C<$substring> does not match any part of C<$string>,
1465 returns an empty list.
1466
1467 =item C<$count = $Collator-E<gt>subst($string, $substring, $replacement)>
1468
1469 If C<$substring> matches a part of C<$string>,
1470 the first occurrence of the matching part is replaced by C<$replacement>
1471 (C<$string> is modified) and return C<$count> (always equals to C<1>).
1472
1473 C<$replacement> can be a C<CODEREF>,
1474 taking the matching part as an argument,
1475 and returning a string to replace the matching part
1476 (a bit similar to C<s/(..)/$coderef-E<gt>($1)/e>).
1477
1478 =item C<$count = $Collator-E<gt>gsubst($string, $substring, $replacement)>
1479
1480 If C<$substring> matches a part of C<$string>,
1481 all the occurrences of the matching part is replaced by C<$replacement>
1482 (C<$string> is modified) and return C<$count>.
1483
1484 C<$replacement> can be a C<CODEREF>,
1485 taking the matching part as an argument,
1486 and returning a string to replace the matching part
1487 (a bit similar to C<s/(..)/$coderef-E<gt>($1)/eg>).
1488
1489 e.g.
1490
1491   my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1492                                      # (normalization => undef) is REQUIRED.
1493   my $str = "Camel ass came\x{301}l CAMEL horse cAm\0E\0L...";
1494   $Collator->gsubst($str, "camel", sub { "<b>$_[0]</b>" });
1495
1496   # now $str is "<b>Camel</b> ass <b>came\x{301}l</b> <b>CAMEL</b> horse <b>cAm\0E\0L</b>...";
1497   # i.e., all the camels are made bold-faced.
1498
1499 =back
1500
1501 =head2 Other Methods
1502
1503 =over 4
1504
1505 =item C<%old_tailoring = $Collator-E<gt>change(%new_tailoring)>
1506
1507 Change the value of specified keys and returns the changed part.
1508
1509     $Collator = Unicode::Collate->new(level => 4);
1510
1511     $Collator->eq("perl", "PERL"); # false
1512
1513     %old = $Collator->change(level => 2); # returns (level => 4).
1514
1515     $Collator->eq("perl", "PERL"); # true
1516
1517     $Collator->change(%old); # returns (level => 2).
1518
1519     $Collator->eq("perl", "PERL"); # false
1520
1521 Not all C<(key,value)>s are allowed to be changed.
1522 See also C<@Unicode::Collate::ChangeOK> and C<@Unicode::Collate::ChangeNG>.
1523
1524 In the scalar context, returns the modified collator
1525 (but it is B<not> a clone from the original).
1526
1527     $Collator->change(level => 2)->eq("perl", "PERL"); # true
1528
1529     $Collator->eq("perl", "PERL"); # true; now max level is 2nd.
1530
1531     $Collator->change(level => 4)->eq("perl", "PERL"); # false
1532
1533 =item UCA_Version
1534
1535 Returns the version number of UTS #10 this module consults.
1536
1537 =item Base_Unicode_Version
1538
1539 Returns the version number of the Unicode Standard
1540 this module is based on.
1541
1542 =back
1543
1544 =head2 EXPORT
1545
1546 None by default.
1547
1548 =head2 CAVEAT
1549
1550 Use of the C<normalization> parameter requires
1551 the B<Unicode::Normalize> module.
1552
1553 If you need not it (say, in the case when you need not
1554 handle any combining characters),
1555 assign C<normalization =E<gt> undef> explicitly.
1556
1557 -- see 6.5 Avoiding Normalization, UTS #10.
1558
1559 =head2 Conformance Test
1560
1561 The Conformance Test for the UCA is provided
1562 in L<http://www.unicode.org/reports/tr10/CollationTest.html>
1563 and L<http://www.unicode.org/reports/tr10/CollationTest.zip>
1564
1565 For F<CollationTest_SHIFTED.txt>,
1566 a collator via C<Unicode::Collate-E<gt>new( )> should be used;
1567 for F<CollationTest_NON_IGNORABLE.txt>, a collator via
1568 C<Unicode::Collate-E<gt>new(alternate =E<gt> "non-ignorable", level =E<gt> 3)>.
1569
1570 B<Unicode::Normalize is required to try The Conformance Test.>
1571
1572 =head1 AUTHOR
1573
1574 SADAHIRO Tomoyuki, <SADAHIRO@cpan.org>
1575
1576   http://homepage1.nifty.com/nomenclator/perl/
1577
1578   Copyright(C) 2001-2003, SADAHIRO Tomoyuki. Japan. All rights reserved.
1579
1580   This library is free software; you can redistribute it
1581   and/or modify it under the same terms as Perl itself.
1582
1583 =head1 SEE ALSO
1584
1585 =over 4
1586
1587 =item http://www.unicode.org/reports/tr10/
1588
1589 Unicode Collation Algorithm - UTS #10
1590
1591 =item http://www.unicode.org/reports/tr10/allkeys.txt
1592
1593 The Default Unicode Collation Element Table
1594
1595 =item http://www.unicode.org/reports/tr10/CollationTest.html
1596 http://www.unicode.org/reports/tr10/CollationTest.zip
1597
1598 The latest versions of the conformance test for the UCA
1599
1600 =item http://www.unicode.org/reports/tr15/
1601
1602 Unicode Normalization Forms - UAX #15
1603
1604 =item L<Unicode::Normalize>
1605
1606 =back
1607
1608 =cut