lib/Unicode/Collate.pm

   1 package Unicode::Collate;
   2
   3 BEGIN {
   4     unless ("A" eq pack('U', 0x41) || "A" eq pack('U', ord("A"))) {
   5         die "Unicode::Collate cannot stringify a Unicode code point\n";
   6     }
   7 }
   8
   9 use 5.006;
  10 use strict;
  11 use warnings;
  12 use Carp;
  13 use File::Spec;
  14
  15 require Exporter;
  16
  17 our $VERSION = '0.24';
  18 our $PACKAGE = __PACKAGE__;
  19
  20 our @ISA = qw(Exporter);
  21
  22 our %EXPORT_TAGS = ();
  23 our @EXPORT_OK = ();
  24 our @EXPORT = ();
  25
  26 (our $Path = $INC{'Unicode/Collate.pm'}) =~ s/\.pm$//;
  27 our $KeyFile = "allkeys.txt";
  28
  29 our $UNICODE_VERSION;
  30
  31 eval { require Unicode::UCD };
  32
  33 unless ($@) {
  34     $UNICODE_VERSION = Unicode::UCD::UnicodeVersion();
  35 }
  36 else { # Perl 5.6.1
  37     my($f, $fh);
  38     foreach my $d (@INC) {
  39         $f = File::Spec->catfile($d, "unicode", "Unicode.301");
  40         if (open($fh, $f)) {
  41             $UNICODE_VERSION = '3.0.1';
  42             close $fh;
  43             last;
  44         }
  45     }
  46 }
  47
  48 # Perl's boolean
  49 use constant TRUE  => 1;
  50 use constant FALSE => "";
  51 use constant NOMATCHPOS => -1;
  52
  53 # A coderef to get combining class imported from Unicode::Normalize
  54 # (i.e. \&Unicode::Normalize::getCombinClass).
  55 # This is also used as a HAS_UNICODE_NORMALIZE flag.
  56 our $getCombinClass;
  57
  58 # Supported Levels
  59 use constant MinLevel => 1;
  60 use constant MaxLevel => 4;
  61
  62 # Minimum weights at level 2 and 3, respectively
  63 use constant Min2Wt => 0x20;
  64 use constant Min3Wt => 0x02;
  65
  66 # Shifted weight at 4th level
  67 use constant Shift4Wt => 0xFFFF;
  68
  69 # Variable weight at 1st level.
  70 # This is a negative value but should be regarded as zero on collation.
  71 # This is for distinction of variable chars from level 3 ignorable chars.
  72 use constant Var1Wt => -1;
  73
  74
  75 # A boolean for Variable and 16-bit weights at 4 levels of Collation Element
  76 # PROBLEM: The Default Unicode Collation Element Table
  77 # has weights over 0xFFFF at the 4th level.
  78 # The tie-breaking in the variable weights
  79 # other than "shift" (as well as "shift-trimmed") is unreliable.
  80 use constant VCE_TEMPLATE => 'Cn4';
  81
  82 # A sort key: 16-bit weights
  83 # See also the PROBLEM on VCE_TEMPLATE above.
  84 use constant KEY_TEMPLATE => 'n*';
  85
  86 # Level separator in a sort key:
  87 # i.e. pack(KEY_TEMPLATE, 0)
  88 use constant LEVEL_SEP => "\0\0";
  89
  90 # As Unicode code point separator for hash keys.
  91 # A joined code point string (denoted by JCPS below)
  92 # like "65;768" is used for internal processing
  93 # instead of Perl's Unicode string like "\x41\x{300}",
  94 # as the native code point is different from the Unicode code point
  95 # on EBCDIC platform.
  96 # This character must not be included in any stringified
  97 # representation of an integer.
  98 use constant CODE_SEP => ';';
  99
 100 # boolean values of variable weights
 101 use constant NON_VAR => 0; # Non-Variable character
 102 use constant VAR     => 1; # Variable character
 103
 104 # Logical_Order_Exception in PropList.txt
 105 # TODO: synchronization with change of PropList.txt.
 106 our $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
 107
 108 sub UCA_Version { "9" }
 109
 110 sub Base_Unicode_Version { $UNICODE_VERSION || 'unknown' }
 111
 112 ######
 113
 114 use constant UNICODE_FOR_PACK => ("A" eq pack('U', 0x41));
 115 use constant NATIVE_FOR_PACK  => ("A" eq pack('U', ord("A")));
 116
 117 use constant UNICODE_FOR_UNPACK => (0x41 == unpack('U', "A"));
 118 use constant NATIVE_FOR_UNPACK  => (ord("A") == unpack('U', "A"));
 119
 120 sub pack_U {
 121     return UNICODE_FOR_PACK
 122         ? pack('U*', @_)
 123         : NATIVE_FOR_PACK
 124             ? pack('U*', map utf8::unicode_to_native($_), @_)
 125             : die "$PACKAGE, a Unicode code point cannot be stringified.\n";
 126 }
 127
 128 sub unpack_U {
 129     return UNICODE_FOR_UNPACK
 130         ? unpack('U*', shift)
 131         : NATIVE_FOR_UNPACK
 132             ? map(utf8::native_to_unicode($_), unpack 'U*', shift)
 133             : die "$PACKAGE, a code point returned from unpack U " .
 134                 "cannot be converted into Unicode.\n";
 135 }
 136
 137 ######
 138
 139 my (%AlternateOK);
 140 @AlternateOK{ qw/
 141     blanked  non-ignorable  shifted  shift-trimmed
 142   / } = ();
 143
 144 our @ChangeOK = qw/
 145     alternate backwards level normalization rearrange
 146     katakana_before_hiragana upper_before_lower
 147     overrideHangul overrideCJK preprocess UCA_Version
 148   /;
 149
 150 our @ChangeNG = qw/
 151     entry entries table maxlength
 152     ignoreChar ignoreName undefChar undefName
 153     versionTable alternateTable backwardsTable forwardsTable rearrangeTable
 154     derivCode normCode rearrangeHash L3_ignorable
 155     backwardsFlag
 156   /;
 157 # The hash key 'ignored' is deleted at v 0.21.
 158 # The hash key 'isShift' is deleted at v 0.23.
 159 # The hash key 'combining' is deleted at v 0.24.
 160
 161 my (%ChangeOK, %ChangeNG);
 162 @ChangeOK{ @ChangeOK } = ();
 163 @ChangeNG{ @ChangeNG } = ();
 164
 165 sub change {
 166     my $self = shift;
 167     my %hash = @_;
 168     my %old;
 169     foreach my $k (keys %hash) {
 170         if (exists $ChangeOK{$k}) {
 171             $old{$k} = $self->{$k};
 172             $self->{$k} = $hash{$k};
 173         }
 174         elsif (exists $ChangeNG{$k}) {
 175             croak "change of $k via change() is not allowed!";
 176         }
 177         # else => ignored
 178     }
 179     $self->checkCollator;
 180     return wantarray ? %old : $self;
 181 }
 182
 183 sub _checkLevel {
 184     my $level = shift;
 185     my $key   = shift;
 186     croak sprintf "Illegal level %d (in \$self->{%s}) lower than %d.",
 187         $level, $key, MinLevel if MinLevel > $level;
 188     croak sprintf "Unsupported level %d (in \$self->{%s}) higher than %d ",
 189         $level, $key, MaxLevel if MaxLevel < $level;
 190 }
 191
 192 sub checkCollator {
 193     my $self = shift;
 194     _checkLevel($self->{level}, "level");
 195
 196     $self->{derivCode} =
 197         $self->{UCA_Version} ==  8 ? \&_derivCE_8 :
 198         $self->{UCA_Version} ==  9 ? \&_derivCE_9 :
 199       croak "Illegal UCA version (passed $self->{UCA_Version}).";
 200
 201     $self->{alternate} = lc($self->{alternate});
 202     croak "$PACKAGE unknown alternate tag name: $self->{alternate}"
 203         unless exists $AlternateOK{ $self->{alternate} };
 204
 205     if (! defined $self->{backwards}) {
 206         $self->{backwardsFlag} = 0;
 207     }
 208     elsif (! ref $self->{backwards}) {
 209         _checkLevel($self->{backwards}, "backwards");
 210         $self->{backwardsFlag} = 1 << $self->{backwards};
 211     }
 212     else {
 213         my %level;
 214         $self->{backwardsFlag} = 0;
 215         for my $b (@{ $self->{backwards} }) {
 216             _checkLevel($b, "backwards");
 217             $level{$b} = 1;
 218         }
 219         for my $v (sort keys %level) {
 220             $self->{backwardsFlag} += 1 << $v;
 221         }
 222     }
 223
 224     $self->{rearrange} = []
 225         if ! defined $self->{rearrange};
 226     croak "$PACKAGE: A list for rearrangement must be store in an ARRAYREF"
 227         if ! ref $self->{rearrange};
 228
 229     # keys of $self->{rearrangeHash} are $self->{rearrange}.
 230     $self->{rearrangeHash} = undef;
 231
 232     if (@{ $self->{rearrange} }) {
 233         @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
 234     }
 235
 236     $self->{normCode} = undef;
 237
 238     if (defined $self->{normalization}) {
 239         eval { require Unicode::Normalize };
 240         croak "Unicode/Normalize.pm is required to normalize strings: $@"
 241             if $@;
 242
 243         Unicode::Normalize->import();
 244         $getCombinClass = \&Unicode::Normalize::getCombinClass
 245             if ! $getCombinClass;
 246
 247         $self->{normCode} =
 248             $self->{normalization} =~ /^(?:NF)?C$/  ? \&NFC :
 249             $self->{normalization} =~ /^(?:NF)?D$/  ? \&NFD :
 250             $self->{normalization} =~ /^(?:NF)?KC$/ ? \&NFKC :
 251             $self->{normalization} =~ /^(?:NF)?KD$/ ? \&NFKD :
 252           croak "$PACKAGE unknown normalization form name: "
 253                 . $self->{normalization};
 254     }
 255     return;
 256 }
 257
 258 sub new
 259 {
 260     my $class = shift;
 261     my $self = bless { @_ }, $class;
 262
 263     # If undef is passed explicitly, no file is read.
 264     $self->{table} = $KeyFile if ! exists $self->{table};
 265     $self->read_table if defined $self->{table};
 266
 267     if ($self->{entry}) {
 268         $self->parseEntry($_) foreach split /\n/, $self->{entry};
 269     }
 270
 271     $self->{level} ||= MaxLevel;
 272     $self->{UCA_Version} ||= UCA_Version();
 273
 274     $self->{overrideHangul} = ''
 275         if ! exists $self->{overrideHangul};
 276     $self->{overrideCJK} = ''
 277         if ! exists $self->{overrideCJK};
 278     $self->{normalization} = 'D'
 279         if ! exists $self->{normalization};
 280     $self->{alternate} = $self->{alternateTable} || 'shifted'
 281         if ! exists $self->{alternate};
 282     $self->{rearrange} = $self->{rearrangeTable} || $DefaultRearrange
 283         if ! exists $self->{rearrange};
 284     $self->{backwards} = $self->{backwardsTable}
 285         if ! exists $self->{backwards};
 286
 287     $self->checkCollator;
 288
 289     return $self;
 290 }
 291
 292 sub read_table {
 293     my $self = shift;
 294     my $file = $self->{table} ne '' ? $self->{table} : $KeyFile;
 295
 296     my $filepath = File::Spec->catfile($Path, $file);
 297     open my $fk, "<$filepath"
 298         or croak "File does not exist at $filepath";
 299
 300     while (<$fk>) {
 301         next if /^\s*#/;
 302         if (/^\s*\@/) {
 303             if    (/^\s*\@version\s*(\S*)/) {
 304                 $self->{versionTable} ||= $1;
 305             }
 306             elsif (/^\s*\@alternate\s+(\S*)/) {
 307                 $self->{alternateTable} ||= $1;
 308             }
 309             elsif (/^\s*\@backwards\s+(\S*)/) {
 310                 push @{ $self->{backwardsTable} }, $1;
 311             }
 312             elsif (/^\s*\@forwards\s+(\S*)/) { # parhaps no use
 313                 push @{ $self->{forwardsTable} }, $1;
 314             }
 315             elsif (/^\s*\@rearrange\s+(.*)/) { # (\S*) is NG
 316                 push @{ $self->{rearrangeTable} }, _getHexArray($1);
 317             }
 318             next;
 319         }
 320         $self->parseEntry($_);
 321     }
 322     close $fk;
 323 }
 324
 325
 326 ##
 327 ## get $line, parse it, and write an entry in $self
 328 ##
 329 sub parseEntry
 330 {
 331     my $self = shift;
 332     my $line = shift;
 333     my($name, $entry, @uv, @key);
 334
 335     return if $line !~ /^\s*[0-9A-Fa-f]/;
 336
 337     # removes comment and gets name
 338     $name = $1
 339         if $line =~ s/[#%]\s*(.*)//;
 340     return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
 341
 342     # gets element
 343     my($e, $k) = split /;/, $line;
 344     croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
 345         if ! $k;
 346
 347     @uv = _getHexArray($e);
 348     return if !@uv;
 349
 350     $entry = join(CODE_SEP, @uv); # in JCPS
 351
 352     if (defined $self->{undefChar} || defined $self->{ignoreChar}) {
 353         my $ele = pack_U(@uv);
 354
 355         # regarded as if it were not entried in the table
 356         return
 357             if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
 358
 359         # replaced as completely ignorable
 360         $k = '[.0000.0000.0000.0000]'
 361             if defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/;
 362     }
 363
 364     # replaced as completely ignorable
 365     $k = '[.0000.0000.0000.0000]'
 366         if defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/;
 367
 368     my $is_L3_ignorable;
 369
 370     foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
 371         my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
 372         my @wt = _getHexArray($arr);
 373         push @key, pack(VCE_TEMPLATE, $var, @wt);
 374         $is_L3_ignorable = TRUE
 375             if $wt[0] + $wt[1] + $wt[2] == 0;
 376           # if $arr !~ /[1-9A-Fa-f]/; NG
 377           # Conformance Test shows L3-ignorable is completely ignorable.
 378     }
 379
 380     $self->{entries}{$entry} = \@key;
 381
 382     $self->{L3_ignorable}{$uv[0]} = TRUE
 383         if @uv == 1 && $is_L3_ignorable;
 384
 385     # Contraction is to be considered in the range of this maxlength.
 386     $self->{maxlength}{$uv[0]} = scalar @uv
 387         if @uv > 1;
 388 }
 389
 390
 391 ##
 392 ## arrayref[weights] = altCE(VCE)
 393 ##
 394 sub altCE
 395 {
 396     my $self = shift;
 397     my($var, @wt) = unpack(VCE_TEMPLATE, shift);
 398
 399     $self->{alternate} eq 'blanked' ?
 400         $var ? [Var1Wt, 0, 0, $wt[3]] : \@wt :
 401     $self->{alternate} eq 'non-ignorable' ?
 402         \@wt :
 403     $self->{alternate} eq 'shifted' ?
 404         $var ? [Var1Wt, 0, 0, $wt[0] ]
 405              : [ @wt[0..2], $wt[0]+$wt[1]+$wt[2] ? Shift4Wt : 0 ] :
 406     $self->{alternate} eq 'shift-trimmed' ?
 407         $var ? [Var1Wt, 0, 0, $wt[0] ] : [ @wt[0..2], 0 ] :
 408         croak "$PACKAGE unknown alternate name: $self->{alternate}";
 409 }
 410
 411 sub viewSortKey
 412 {
 413     my $self = shift;
 414     $self->visualizeSortKey($self->getSortKey(@_));
 415 }
 416
 417 sub visualizeSortKey
 418 {
 419     my $self = shift;
 420     my $view = join " ", map sprintf("%04X", $_), unpack(KEY_TEMPLATE, shift);
 421
 422     if ($self->{UCA_Version} <= 8) {
 423         $view =~ s/ ?0000 ?/|/g;
 424     } else {
 425         $view =~ s/\b0000\b/|/g;
 426     }
 427     return "[$view]";
 428 }
 429
 430
 431 ##
 432 ## arrayref of JCPS   = splitCE(string to be collated)
 433 ## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitCE(string, true)
 434 ##
 435 sub splitCE
 436 {
 437     my $self = shift;
 438     my $wLen = $_[1];
 439
 440     my $code = $self->{preprocess};
 441     my $norm = $self->{normCode};
 442     my $ent  = $self->{entries};
 443     my $max  = $self->{maxlength};
 444     my $reH  = $self->{rearrangeHash};
 445     my $ign  = $self->{L3_ignorable};
 446     my $ver9 = $self->{UCA_Version} > 8;
 447
 448     my ($str, @buf);
 449
 450     if ($wLen) {
 451         $code and croak "Preprocess breaks character positions. "
 452                         . "Don't use with index(), match(), etc.";
 453         $norm and croak "Normalization breaks character positions. "
 454                         . "Don't use with index(), match(), etc.";
 455         $str = $_[0];
 456     }
 457     else {
 458         $str = $_[0];
 459         $str = &$code($str) if ref $code;
 460         $str = &$norm($str) if ref $norm;
 461     }
 462
 463     # get array of Unicode code point of string.
 464     my @src = unpack_U($str);
 465
 466     # rearrangement:
 467     # Character positions are not kept if rearranged,
 468     # then neglected if $wLen is true.
 469     if ($reH && ! $wLen) {
 470         for (my $i = 0; $i < @src; $i++) {
 471             if (exists $reH->{ $src[$i] } && $i + 1 < @src) {
 472                 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]);
 473                 $i++;
 474             }
 475         }
 476     }
 477
 478     if ($ver9) {
 479         # To remove a character marked as a completely ignorable.
 480         for (my $i = 0; $i < @src; $i++) {
 481             $src[$i] = undef if $ign->{ $src[$i] };
 482         }
 483     }
 484
 485     for (my $i = 0; $i < @src; $i++) {
 486         next if _isNonCharacter($src[$i]);
 487
 488         my $i_orig = $i;
 489         my $ce = $src[$i];
 490
 491         if ($max->{$ce}) { # contract
 492             my $temp_ce = $ce;
 493
 494             for (my $p = $i + 1; $p < @src; $p++) {
 495                 next if ! defined $src[$p];
 496                 $temp_ce .= CODE_SEP . $src[$p];
 497                 if ($ent->{$temp_ce}) {
 498                     $ce = $temp_ce;
 499                     $i = $p;
 500                 }
 501             }
 502         }
 503
 504         # with Combining Char (UTS#10, 4.2.1).
 505         # requires Unicode::Normalize.
 506         # Not be $wLen, as not croaked due to $norm.
 507         if ($getCombinClass) {
 508             for (my $p = $i + 1; $p < @src; $p++) {
 509                 next if ! defined $src[$p];
 510                 last unless $getCombinClass->($src[$p]);
 511                 my $tail = CODE_SEP . $src[$p];
 512                 if ($ent->{$ce.$tail}) {
 513                     $ce .= $tail;
 514                     $src[$p] = undef;
 515                 }
 516             }
 517         }
 518
 519         if ($wLen) {
 520             for (my $p = $i + 1; $p < @src; $p++) {
 521                 last if defined $src[$p];
 522                 $i = $p;
 523             }
 524         }
 525
 526         push @buf, $wLen ? [$ce, $i_orig, $i + 1] : $ce;
 527     }
 528     return \@buf;
 529 }
 530
 531
 532 ##
 533 ## list of arrayrefs of weights = getWt(JCPS)
 534 ##
 535 sub getWt
 536 {
 537     my $self = shift;
 538     my $ce   = shift;
 539     my $ent  = $self->{entries};
 540     my $cjk  = $self->{overrideCJK};
 541     my $hang = $self->{overrideHangul};
 542     my $der  = $self->{derivCode};
 543
 544     return if !defined $ce;
 545     return map($self->altCE($_), @{ $ent->{$ce} })
 546         if $ent->{$ce};
 547
 548     # CE must not be a contraction, then it's a code point.
 549     my $u = $ce;
 550
 551     if (0xAC00 <= $u && $u <= 0xD7A3) { # is Hangul Syllale
 552         return map $self->altCE($_),
 553             $hang
 554                 ? map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$hang($u))
 555                 : defined $hang
 556                     ? map({
 557                             $ent->{$_} ? @{ $ent->{$_} } : $der->($_);
 558                         } _decompHangul($u))
 559                     : $der->($u);
 560     }
 561     elsif (0x3400 <= $u && $u <= 0x4DB5 ||
 562            0x4E00 <= $u && $u <= 0x9FA5 ||
 563            0x20000 <= $u && $u <= 0x2A6D6) { # CJK Ideograph
 564         return map $self->altCE($_),
 565             $cjk
 566                 ? map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$cjk($u))
 567                 : defined $cjk && $self->{UCA_Version} <= 8 && $u < 0x10000
 568                     ? pack(VCE_TEMPLATE, NON_VAR, $u, Min2Wt, Min3Wt, $u)
 569                     : $der->($u);
 570     }
 571     else {
 572         return map $self->altCE($_), $der->($u);
 573     }
 574 }
 575
 576
 577 ##
 578 ## string sortkey = getSortKey(string arg)
 579 ##
 580 sub getSortKey
 581 {
 582     my $self = shift;
 583     my $lev  = $self->{level};
 584     my $rCE  = $self->splitCE(shift); # get an arrayref of JCPS
 585     my $ver9 = $self->{UCA_Version} > 8;
 586     my $v2i  = $self->{alternate} ne 'non-ignorable';
 587
 588     # weight arrays
 589     my (@buf, $last_is_variable);
 590
 591     foreach my $wt (map $self->getWt($_), @$rCE) {
 592         if ($v2i && $ver9) {
 593             if ($wt->[0] == 0) { # ignorable
 594                 next if $last_is_variable;
 595             } else {
 596                 $last_is_variable = ($wt->[0] == Var1Wt);
 597             }
 598         }
 599         push @buf, $wt;
 600     }
 601
 602     # make sort key
 603     my @ret = ([],[],[],[]);
 604     foreach my $v (0..$lev-1) {
 605         foreach my $b (@buf) {
 606             push @{ $ret[$v] }, $b->[$v]
 607                 if 0 < $b->[$v];
 608         }
 609     }
 610
 611     # modification of tertiary weights
 612     if ($self->{upper_before_lower}) {
 613         foreach (@{ $ret[2] }) {
 614             if    (0x8 <= $_ && $_ <= 0xC) { $_ -= 6 } # lower
 615             elsif (0x2 <= $_ && $_ <= 0x6) { $_ += 6 } # upper
 616             elsif ($_ == 0x1C)             { $_ += 1 } # square upper
 617             elsif ($_ == 0x1D)             { $_ -= 1 } # square lower
 618         }
 619     }
 620     if ($self->{katakana_before_hiragana}) {
 621         foreach (@{ $ret[2] }) {
 622             if    (0x0F <= $_ && $_ <= 0x13) { $_ -= 2 } # katakana
 623             elsif (0x0D <= $_ && $_ <= 0x0E) { $_ += 5 } # hiragana
 624         }
 625     }
 626
 627     if ($self->{backwardsFlag}) {
 628         for (my $v = MinLevel; $v <= MaxLevel; $v++) {
 629             if ($self->{backwardsFlag} & (1 << $v)) {
 630                 @{ $ret[$v-1] } = reverse @{ $ret[$v-1] };
 631             }
 632         }
 633     }
 634
 635     join LEVEL_SEP, map pack(KEY_TEMPLATE, @$_), @ret;
 636 }
 637
 638
 639 ##
 640 ## int compare = cmp(string a, string b)
 641 ##
 642 sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
 643 sub eq  { $_[0]->getSortKey($_[1]) eq  $_[0]->getSortKey($_[2]) }
 644 sub ne  { $_[0]->getSortKey($_[1]) ne  $_[0]->getSortKey($_[2]) }
 645 sub lt  { $_[0]->getSortKey($_[1]) lt  $_[0]->getSortKey($_[2]) }
 646 sub le  { $_[0]->getSortKey($_[1]) le  $_[0]->getSortKey($_[2]) }
 647 sub gt  { $_[0]->getSortKey($_[1]) gt  $_[0]->getSortKey($_[2]) }
 648 sub ge  { $_[0]->getSortKey($_[1]) ge  $_[0]->getSortKey($_[2]) }
 649
 650 ##
 651 ## list[strings] sorted = sort(list[strings] arg)
 652 ##
 653 sub sort {
 654     my $obj = shift;
 655     return
 656         map { $_->[1] }
 657             sort{ $a->[0] cmp $b->[0] }
 658                 map [ $obj->getSortKey($_), $_ ], @_;
 659 }
 660
 661
 662 sub _derivCE_9 {
 663     my $u = shift;
 664     my $base =
 665         (0x4E00 <= $u && $u <= 0x9FA5)
 666             ? 0xFB40 : # CJK
 667         (0x3400 <= $u && $u <= 0x4DB5 || 0x20000 <= $u && $u <= 0x2A6D6)
 668             ? 0xFB80   # CJK ext.
 669             : 0xFBC0;  # others
 670
 671     my $aaaa = $base + ($u >> 15);
 672     my $bbbb = ($u & 0x7FFF) | 0x8000;
 673     return
 674         pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
 675         pack(VCE_TEMPLATE, NON_VAR, $bbbb,      0,      0, $u);
 676 }
 677
 678 sub _derivCE_8 {
 679     my $code = shift;
 680     my $aaaa =  0xFF80 + ($code >> 15);
 681     my $bbbb = ($code & 0x7FFF) | 0x8000;
 682     return
 683         pack(VCE_TEMPLATE, NON_VAR, $aaaa, 2, 1, $code),
 684         pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $code);
 685 }
 686
 687 ##
 688 ## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
 689 ##
 690 sub _getHexArray { map hex, $_[0] =~ /([0-9a-fA-F]+)/g }
 691
 692 #
 693 # $code *must* be in Hangul syllable.
 694 # Check it before you enter here.
 695 #
 696 sub _decompHangul {
 697     my $code = shift;
 698     my $SIndex = $code - 0xAC00;
 699     my $LIndex = int( $SIndex / 588);
 700     my $VIndex = int(($SIndex % 588) / 28);
 701     my $TIndex =      $SIndex % 28;
 702     return (
 703         0x1100 + $LIndex,
 704         0x1161 + $VIndex,
 705         $TIndex ? (0x11A7 + $TIndex) : (),
 706     );
 707 }
 708
 709 sub _isNonCharacter {
 710     my $code = shift;
 711     return ! defined $code                      # removed
 712         || ($code < 0 || 0x10FFFF < $code)      # out of range
 713         || (($code & 0xFFFE) == 0xFFFE)         # ??FFF[EF] (cf. utf8.c)
 714         || (0xD800 <= $code && $code <= 0xDFFF) # unpaired surrogates
 715         || (0xFDD0 <= $code && $code <= 0xFDEF) # other non-characters
 716     ;
 717 }
 718
 719
 720 ##
 721 ## bool _nonIgnorAtLevel(arrayref weights, int level)
 722 ##
 723 sub _nonIgnorAtLevel($$)
 724 {
 725     my $wt = shift;
 726     return if ! defined $wt;
 727     my $lv = shift;
 728     return grep($wt->[$_-1] != 0, MinLevel..$lv) ? TRUE : FALSE;
 729 }
 730
 731 ##
 732 ## bool _eqArray(
 733 ##    arrayref of arrayref[weights] source,
 734 ##    arrayref of arrayref[weights] substr,
 735 ##    int level)
 736 ## * comparison of graphemes vs graphemes.
 737 ##   @$source >= @$substr must be true (check it before call this);
 738 ##
 739 sub _eqArray($$$)
 740 {
 741     my $source = shift;
 742     my $substr = shift;
 743     my $lev = shift;
 744
 745     for my $g (0..@$substr-1){
 746         # Do the $g'th graphemes have the same number of AV weigths?
 747         return if @{ $source->[$g] } != @{ $substr->[$g] };
 748
 749         for my $w (0..@{ $substr->[$g] }-1) {
 750             for my $v (0..$lev-1) {
 751                 return if $source->[$g][$w][$v] != $substr->[$g][$w][$v];
 752             }
 753         }
 754     }
 755     return 1;
 756 }
 757
 758 ##
 759 ## (int position, int length)
 760 ## int position = index(string, substring, position, [undoc'ed grobal])
 761 ##
 762 ## With "grobal" (only for the list context),
 763 ##  returns list of arrayref[position, length].
 764 ##
 765 sub index
 766 {
 767     my $self  = shift;
 768     my $str   = shift;
 769     my $len   = length($str);
 770     my $subCE = $self->splitCE(shift);
 771     my $pos   = @_ ? shift : 0;
 772        $pos   = 0 if $pos < 0;
 773     my $grob  = shift;
 774
 775     my $lev   = $self->{level};
 776     my $ver9  = $self->{UCA_Version} > 8;
 777     my $v2i   = $self->{alternate} ne 'non-ignorable';
 778
 779     if (! @$subCE) {
 780         my $temp = $pos <= 0 ? 0 : $len <= $pos ? $len : $pos;
 781         return $grob
 782             ? map([$_, 0], $temp..$len)
 783             : wantarray ? ($temp,0) : $temp;
 784     }
 785     if ($len < $pos) {
 786         return wantarray ? () : NOMATCHPOS;
 787     }
 788     my $strCE = $self->splitCE($pos ? substr($str, $pos) : $str, TRUE);
 789     if (! @$strCE) {
 790         return wantarray ? () : NOMATCHPOS;
 791     }
 792     my $last_is_variable;
 793     my(@strWt, @iniPos, @finPos, @subWt, @g_ret);
 794
 795     $last_is_variable = FALSE;
 796     for my $wt (map $self->getWt($_), @$subCE) {
 797         my $to_be_pushed = _nonIgnorAtLevel($wt,$lev);
 798
 799         if ($v2i && $ver9) {
 800             if ($wt->[0] == 0) {
 801                 $to_be_pushed = FALSE if $last_is_variable;
 802             } else {
 803                 $last_is_variable = ($wt->[0] == Var1Wt);
 804             }
 805         }
 806
 807         if (@subWt && $wt->[0] == 0) {
 808             push @{ $subWt[-1] }, $wt if $to_be_pushed;
 809         } else {
 810             $wt->[0] = 0 if $wt->[0] == Var1Wt;
 811             push @subWt, [ $wt ];
 812         }
 813     }
 814
 815     my $count = 0;
 816     my $end = @$strCE - 1;
 817
 818     $last_is_variable = FALSE;
 819
 820     for (my $i = 0; $i <= $end; ) { # no $i++
 821         my $found_base = 0;
 822
 823         # fetch a grapheme
 824         while ($i <= $end && $found_base == 0) {
 825             for my $wt ($self->getWt($strCE->[$i][0])) {
 826                 my $to_be_pushed = _nonIgnorAtLevel($wt,$lev);
 827
 828                 if ($v2i && $ver9) {
 829                     if ($wt->[0] == 0) {
 830                         $to_be_pushed = FALSE if $last_is_variable;
 831                     } else {
 832                         $last_is_variable = ($wt->[0] == Var1Wt);
 833                     }
 834                 }
 835
 836                 if (@strWt && $wt->[0] == 0) {
 837                     push @{ $strWt[-1] }, $wt if $to_be_pushed;
 838                     $finPos[-1] = $strCE->[$i][2];
 839                 } elsif ($to_be_pushed) {
 840                     $wt->[0] = 0 if $wt->[0] == Var1Wt;
 841                     push @strWt,  [ $wt ];
 842                     push @iniPos, $found_base ? NOMATCHPOS : $strCE->[$i][1];
 843                     $finPos[-1] = NOMATCHPOS if $found_base;
 844                     push @finPos, $strCE->[$i][2];
 845                     $found_base++;
 846                 }
 847                 # else ===> no-op
 848             }
 849             $i++;
 850         }
 851
 852         # try to match
 853         while ( @strWt > @subWt || (@strWt == @subWt && $i > $end) ) {
 854             if ($iniPos[0] != NOMATCHPOS &&
 855                     $finPos[$#subWt] != NOMATCHPOS &&
 856                         _eqArray(\@strWt, \@subWt, $lev)) {
 857                 my $temp = $iniPos[0] + $pos;
 858
 859                 if ($grob) {
 860                     push @g_ret, [$temp, $finPos[$#subWt] - $iniPos[0]];
 861                     splice @strWt,  0, $#subWt;
 862                     splice @iniPos, 0, $#subWt;
 863                     splice @finPos, 0, $#subWt;
 864                 }
 865                 else {
 866                     return wantarray
 867                         ? ($temp, $finPos[$#subWt] - $iniPos[0])
 868                         :  $temp;
 869                 }
 870             }
 871             shift @strWt;
 872             shift @iniPos;
 873             shift @finPos;
 874         }
 875     }
 876
 877     return $grob
 878         ? @g_ret
 879         : wantarray ? () : NOMATCHPOS;
 880 }
 881
 882 ##
 883 ## scalarref to matching part = match(string, substring)
 884 ##
 885 sub match
 886 {
 887     my $self = shift;
 888     if (my($pos,$len) = $self->index($_[0], $_[1])) {
 889         my $temp = substr($_[0], $pos, $len);
 890         return wantarray ? $temp : \$temp;
 891         # An lvalue ref \substr should be avoided,
 892         # since its value is affected by modification of its referent.
 893     }
 894     else {
 895         return;
 896     }
 897 }
 898
 899 ##
 900 ## arrayref matching parts = gmatch(string, substring)
 901 ##
 902 sub gmatch
 903 {
 904     my $self = shift;
 905     my $str  = shift;
 906     my $sub  = shift;
 907     return map substr($str, $_->[0], $_->[1]),
 908                 $self->index($str, $sub, 0, 'g');
 909 }
 910
 911 ##
 912 ## bool subst'ed = subst(string, substring, replace)
 913 ##
 914 sub subst
 915 {
 916     my $self = shift;
 917     my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
 918
 919     if (my($pos,$len) = $self->index($_[0], $_[1])) {
 920         if ($code) {
 921             my $mat = substr($_[0], $pos, $len);
 922             substr($_[0], $pos, $len, $code->($mat));
 923         } else {
 924             substr($_[0], $pos, $len, $_[2]);
 925         }
 926         return TRUE;
 927     }
 928     else {
 929         return FALSE;
 930     }
 931 }
 932
 933 ##
 934 ## int count = gsubst(string, substring, replace)
 935 ##
 936 sub gsubst
 937 {
 938     my $self = shift;
 939     my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
 940     my $cnt = 0;
 941
 942     # Replacement is carried out from the end, then use reverse.
 943     for my $pos_len (reverse $self->index($_[0], $_[1], 0, 'g')) {
 944         if ($code) {
 945             my $mat = substr($_[0], $pos_len->[0], $pos_len->[1]);
 946             substr($_[0], $pos_len->[0], $pos_len->[1], $code->($mat));
 947         } else {
 948             substr($_[0], $pos_len->[0], $pos_len->[1], $_[2]);
 949         }
 950         $cnt++;
 951     }
 952     return $cnt;
 953 }
 954
 955 1;
 956 __END__
 957
 958 =head1 NAME
 959
 960 Unicode::Collate - Unicode Collation Algorithm
 961
 962 =head1 SYNOPSIS
 963
 964   use Unicode::Collate;
 965
 966   #construct
 967   $Collator = Unicode::Collate->new(%tailoring);
 968
 969   #sort
 970   @sorted = $Collator->sort(@not_sorted);
 971
 972   #compare
 973   $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
 974
 975 =head1 DESCRIPTION
 976
 977 This module is an implementation
 978 of Unicode Technical Standard #10 (UTS #10)
 979 "Unicode Collation Algorithm."
 980
 981 =head2 Constructor and Tailoring
 982
 983 The C<new> method returns a collator object.
 984
 985    $Collator = Unicode::Collate->new(
 986       UCA_Version => $UCA_Version,
 987       alternate => $alternate,
 988       backwards => $levelNumber, # or \@levelNumbers
 989       entry => $element,
 990       normalization  => $normalization_form,
 991       ignoreName => qr/$ignoreName/,
 992       ignoreChar => qr/$ignoreChar/,
 993       katakana_before_hiragana => $bool,
 994       level => $collationLevel,
 995       overrideCJK => \&overrideCJK,
 996       overrideHangul => \&overrideHangul,
 997       preprocess => \&preprocess,
 998       rearrange => \@charList,
 999       table => $filename,
1000       undefName => qr/$undefName/,
1001       undefChar => qr/$undefChar/,
1002       upper_before_lower => $bool,
1003    );
1004    # if %tailoring is false (i.e. empty),
1005    # $Collator should do the default collation.
1006
1007 =over 4
1008
1009 =item UCA_Version
1010
1011 If the version number of the older UCA is given,
1012 the older behavior of that version is emulated on collating.
1013 If omitted, the return value of C<UCA_Version()> is used.
1014
1015 The supported version: 8 or 9.
1016
1017 B<This parameter may be removed in the future version,
1018 as switching the algorithm would affect the performance.>
1019
1020 =item alternate
1021
1022 -- see 3.2.2 Variable Weighting, UTS #10.
1023
1024 (the title in UCA version 8: Alternate Weighting)
1025
1026 This key allows to alternate weighting for variable collation elements,
1027 which are marked with an ASTERISK in the table
1028 (NOTE: Many punction marks and symbols are variable in F<allkeys.txt>).
1029
1030    alternate => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
1031
1032 These names are case-insensitive.
1033 By default (if specification is omitted), 'shifted' is adopted.
1034
1035    'Blanked'        Variable elements are made ignorable at levels 1 through 3;
1036                     considered at the 4th level.
1037
1038    'Non-ignorable'  Variable elements are not reset to ignorable.
1039
1040    'Shifted'        Variable elements are made ignorable at levels 1 through 3
1041                     their level 4 weight is replaced by the old level 1 weight.
1042                     Level 4 weight for Non-Variable elements is 0xFFFF.
1043
1044    'Shift-Trimmed'  Same as 'shifted', but all FFFF's at the 4th level
1045                     are trimmed.
1046
1047 =item backwards
1048
1049 -- see 3.1.2 French Accents, UTS #10.
1050
1051      backwards => $levelNumber or \@levelNumbers
1052
1053 Weights in reverse order; ex. level 2 (diacritic ordering) in French.
1054 If omitted, forwards at all the levels.
1055
1056 =item entry
1057
1058 -- see 3.1 Linguistic Features; 3.2.1 File Format, UTS #10.
1059
1060 Overrides a default order or defines additional collation elements
1061
1062   entry => <<'ENTRIES', # use the UCA file format
1063 00E6 ; [.0861.0020.0002.00E6] [.08B1.0020.0002.00E6] # ligature <ae> as <a><e>
1064 0063 0068 ; [.0893.0020.0002.0063]      # "ch" in traditional Spanish
1065 0043 0068 ; [.0893.0020.0008.0043]      # "Ch" in traditional Spanish
1066 ENTRIES
1067
1068 B<NOTE:> The code point in the UCA file format (before C<';'>)
1069 B<must> be a Unicode code point, but not a native code point.
1070 So C<0063> must always denote C<U+0063>,
1071 but not a character of C<"\x63">.
1072
1073 =item ignoreName
1074
1075 =item ignoreChar
1076
1077 -- see Completely Ignorable, 3.2.2 Variable Weighting, UTS #10.
1078
1079 Makes the entry in the table completely ignorable;
1080 i.e. as if the weights were zero at all level.
1081
1082 E.g. when 'a' and 'e' are ignorable,
1083 'element' is equal to 'lament' (or 'lmnt').
1084
1085 =item level
1086
1087 -- see 4.3 Form a sort key for each string, UTS #10.
1088
1089 Set the maximum level.
1090 Any higher levels than the specified one are ignored.
1091
1092   Level 1: alphabetic ordering
1093   Level 2: diacritic ordering
1094   Level 3: case ordering
1095   Level 4: tie-breaking (e.g. in the case when alternate is 'shifted')
1096
1097   ex.level => 2,
1098
1099 If omitted, the maximum is the 4th.
1100
1101 =item normalization
1102
1103 -- see 4.1 Normalize each input string, UTS #10.
1104
1105 If specified, strings are normalized before preparation of sort keys
1106 (the normalization is executed after preprocess).
1107
1108 As a form name, one of the following names must be used.
1109
1110   'C'  or 'NFC'  for Normalization Form C
1111   'D'  or 'NFD'  for Normalization Form D
1112   'KC' or 'NFKC' for Normalization Form KC
1113   'KD' or 'NFKD' for Normalization Form KD
1114
1115 If omitted, the string is put into Normalization Form D.
1116
1117 If C<undef> is passed explicitly as the value for this key,
1118 any normalization is not carried out (this may make tailoring easier
1119 if any normalization is not desired).
1120
1121 see B<CAVEAT>.
1122
1123 =item overrideCJK
1124
1125 -- see 7.1 Derived Collation Elements, UTS #10.
1126
1127 By default, mapping of CJK Unified Ideographs
1128 uses the Unicode codepoint order.
1129 But the mapping of CJK Unified Ideographs may be overrided.
1130
1131 ex. CJK Unified Ideographs in the JIS code point order.
1132
1133   overrideCJK => sub {
1134       my $u = shift;             # get a Unicode codepoint
1135       my $b = pack('n', $u);     # to UTF-16BE
1136       my $s = your_unicode_to_sjis_converter($b); # convert
1137       my $n = unpack('n', $s);   # convert sjis to short
1138       [ $n, 0x20, 0x2, $u ];     # return the collation element
1139   },
1140
1141 ex. ignores all CJK Unified Ideographs.
1142
1143   overrideCJK => sub {()}, # CODEREF returning empty list
1144
1145    # where ->eq("Pe\x{4E00}rl", "Perl") is true
1146    # as U+4E00 is a CJK Unified Ideograph and to be ignorable.
1147
1148 If C<undef> is passed explicitly as the value for this key,
1149 weights for CJK Unified Ideographs are treated as undefined.
1150 But assignment of weight for CJK Unified Ideographs
1151 in table or L<entry> is still valid.
1152
1153 =item overrideHangul
1154
1155 -- see 7.1 Derived Collation Elements, UTS #10.
1156
1157 By default, Hangul Syllables are decomposed into Hangul Jamo.
1158 But the mapping of Hangul Syllables may be overrided.
1159
1160 This tag works like L<overrideCJK>, so see there for examples.
1161
1162 If you want to override the mapping of Hangul Syllables,
1163 the Normalization Forms D and KD are not appropriate
1164 (they will be decomposed before overriding).
1165
1166 If C<undef> is passed explicitly as the value for this key,
1167 weight for Hangul Syllables is treated as undefined
1168 without decomposition into Hangul Jamo.
1169 But definition of weight for Hangul Syllables
1170 in table or L<entry> is still valid.
1171
1172 =item preprocess
1173
1174 -- see 5.1 Preprocessing, UTS #10.
1175
1176 If specified, the coderef is used to preprocess
1177 before the formation of sort keys.
1178
1179 ex. dropping English articles, such as "a" or "the".
1180 Then, "the pen" is before "a pencil".
1181
1182      preprocess => sub {
1183            my $str = shift;
1184            $str =~ s/\b(?:an?|the)\s+//gi;
1185            $str;
1186         },
1187
1188 =item rearrange
1189
1190 -- see 3.1.3 Rearrangement, UTS #10.
1191
1192 Characters that are not coded in logical order and to be rearranged.
1193 By default,
1194
1195     rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
1196
1197 If you want to disallow any rearrangement,
1198 pass C<undef> or C<[]> (a reference to an empty list)
1199 as the value for this key.
1200
1201 B<According to the version 9 of UCA, this parameter shall not be used;
1202 but it is not warned at present.>
1203
1204 =item table
1205
1206 -- see 3.2 Default Unicode Collation Element Table, UTS #10.
1207
1208 You can use another element table if desired.
1209 The table file must be in your C<lib/Unicode/Collate> directory.
1210
1211 By default, the file C<lib/Unicode/Collate/allkeys.txt> is used.
1212
1213 If C<undef> is passed explicitly as the value for this key,
1214 no file is read (but you can define collation elements via L<entry>).
1215
1216 A typical way to define a collation element table
1217 without any file of table:
1218
1219    $onlyABC = Unicode::Collate->new(
1220        table => undef,
1221        entry => << 'ENTRIES',
1222 0061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A
1223 0041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A
1224 0062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B
1225 0042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B
1226 0063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C
1227 0043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C
1228 ENTRIES
1229     );
1230
1231 =item undefName
1232
1233 =item undefChar
1234
1235 -- see 6.3.4 Reducing the Repertoire, UTS #10.
1236
1237 Undefines the collation element as if it were unassigned in the table.
1238 This reduces the size of the table.
1239 If an unassigned character appears in the string to be collated,
1240 the sort key is made from its codepoint
1241 as a single-character collation element,
1242 as it is greater than any other assigned collation elements
1243 (in the codepoint order among the unassigned characters).
1244 But, it'd be better to ignore characters
1245 unfamiliar to you and maybe never used.
1246
1247 =item katakana_before_hiragana
1248
1249 =item upper_before_lower
1250
1251 -- see 6.6 Case Comparisons; 7.3.1 Tertiary Weight Table, UTS #10.
1252
1253 By default, lowercase is before uppercase
1254 and hiragana is before katakana.
1255
1256 If the tag is made true, this is reversed.
1257
1258 B<NOTE>: These tags simplemindedly assume
1259 any lowercase/uppercase or hiragana/katakana distinctions
1260 must occur in level 3, and their weights at level 3
1261 must be same as those mentioned in 7.3.1, UTS #10.
1262 If you define your collation elements which violate this requirement,
1263 these tags don't work validly.
1264
1265 =back
1266
1267 =head2 Methods for Collation
1268
1269 =over 4
1270
1271 =item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
1272
1273 Sorts a list of strings.
1274
1275 =item C<$result = $Collator-E<gt>cmp($a, $b)>
1276
1277 Returns 1 (when C<$a> is greater than C<$b>)
1278 or 0 (when C<$a> is equal to C<$b>)
1279 or -1 (when C<$a> is lesser than C<$b>).
1280
1281 =item C<$result = $Collator-E<gt>eq($a, $b)>
1282
1283 =item C<$result = $Collator-E<gt>ne($a, $b)>
1284
1285 =item C<$result = $Collator-E<gt>lt($a, $b)>
1286
1287 =item C<$result = $Collator-E<gt>le($a, $b)>
1288
1289 =item C<$result = $Collator-E<gt>gt($a, $b)>
1290
1291 =item C<$result = $Collator-E<gt>ge($a, $b)>
1292
1293 They works like the same name operators as theirs.
1294
1295    eq : whether $a is equal to $b.
1296    ne : whether $a is not equal to $b.
1297    lt : whether $a is lesser than $b.
1298    le : whether $a is lesser than $b or equal to $b.
1299    gt : whether $a is greater than $b.
1300    ge : whether $a is greater than $b or equal to $b.
1301
1302 =item C<$sortKey = $Collator-E<gt>getSortKey($string)>
1303
1304 -- see 4.3 Form a sort key for each string, UTS #10.
1305
1306 Returns a sort key.
1307
1308 You compare the sort keys using a binary comparison
1309 and get the result of the comparison of the strings using UCA.
1310
1311    $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
1312
1313       is equivalent to
1314
1315    $Collator->cmp($a, $b)
1316
1317 =item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)>
1318
1319    use Unicode::Collate;
1320    my $c = Unicode::Collate->new();
1321    print $c->viewSortKey("Perl"),"\n";
1322
1323    # output:
1324    # [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF]
1325    #  Level 1               Level 2               Level 3               Level 4
1326
1327     (If C<UCA_Version> is 8, the output is slightly different.)
1328
1329 =back
1330
1331 =head2 Methods for Searching
1332
1333 B<DISCLAIMER:> If C<preprocess> or C<normalization> tag is true
1334 for C<$Collator>, calling these methods (C<index>, C<match>, C<gmatch>,
1335 C<subst>, C<gsubst>) is croaked,
1336 as the position and the length might differ
1337 from those on the specified string.
1338 (And the C<rearrange> tag is neglected.)
1339
1340 The C<match>, C<gmatch>, C<subst>, C<gsubst> methods work
1341 like C<m//>, C<m//g>, C<s///>, C<s///g>, respectively,
1342 but they are not aware of any pattern, but only a literal substring.
1343
1344 =over 4
1345
1346 =item C<$position = $Collator-E<gt>index($string, $substring[, $position])>
1347
1348 =item C<($position, $length) = $Collator-E<gt>index($string, $substring[, $position])>
1349
1350 If C<$substring> matches a part of C<$string>, returns
1351 the position of the first occurrence of the matching part in scalar context;
1352 in list context, returns a two-element list of
1353 the position and the length of the matching part.
1354
1355 If C<$substring> does not match any part of C<$string>,
1356 returns C<-1> in scalar context and
1357 an empty list in list context.
1358
1359 e.g. you say
1360
1361   my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1362                                      # (normalization => undef) is REQUIRED.
1363   my $str = "Ich muß studieren Perl.";
1364   my $sub = "MÜSS";
1365   my $match;
1366   if (my($pos,$len) = $Collator->index($str, $sub)) {
1367       $match = substr($str, $pos, $len);
1368   }
1369
1370 and get C<"muß"> in C<$match> since C<"muß">
1371 is primary equal to C<"MÜSS">.
1372
1373 =item C<$match_ref = $Collator-E<gt>match($string, $substring)>
1374
1375 =item C<($match)   = $Collator-E<gt>match($string, $substring)>
1376
1377 If C<$substring> matches a part of C<$string>, in scalar context, returns
1378 B<a reference to> the first occurrence of the matching part
1379 (C<$match_ref> is always true if matches,
1380 since every reference is B<true>);
1381 in list context, returns the first occurrence of the matching part.
1382
1383 If C<$substring> does not match any part of C<$string>,
1384 returns C<undef> in scalar context and
1385 an empty list in list context.
1386
1387 e.g.
1388
1389     if ($match_ref = $Collator->match($str, $sub)) { # scalar context
1390         print "matches [$$match_ref].\n";
1391     } else {
1392         print "doesn't match.\n";
1393     }
1394
1395      or
1396
1397     if (($match) = $Collator->match($str, $sub)) { # list context
1398         print "matches [$match].\n";
1399     } else {
1400         print "doesn't match.\n";
1401     }
1402
1403 =item C<@match = $Collator-E<gt>gmatch($string, $substring)>
1404
1405 If C<$substring> matches a part of C<$string>, returns
1406 all the matching parts (or matching count in scalar context).
1407
1408 If C<$substring> does not match any part of C<$string>,
1409 returns an empty list.
1410
1411 =item C<$count = $Collator-E<gt>subst($string, $substring, $replacement)>
1412
1413 If C<$substring> matches a part of C<$string>,
1414 the first occurrence of the matching part is replaced by C<$replacement>
1415 (C<$string> is modified) and return C<$count> (always equals to C<1>).
1416
1417 C<$replacement> can be a C<CODEREF>,
1418 taking the matching part as an argument,
1419 and returning a string to replace the matching part
1420 (a bit similar to C<s/(..)/$coderef-E<gt>($1)/e>).
1421
1422 =item C<$count = $Collator-E<gt>gsubst($string, $substring, $replacement)>
1423
1424 If C<$substring> matches a part of C<$string>,
1425 all the occurrences of the matching part is replaced by C<$replacement>
1426 (C<$string> is modified) and return C<$count>.
1427
1428 C<$replacement> can be a C<CODEREF>,
1429 taking the matching part as an argument,
1430 and returning a string to replace the matching part
1431 (a bit similar to C<s/(..)/$coderef-E<gt>($1)/eg>).
1432
1433 e.g.
1434
1435   my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1436                                      # (normalization => undef) is REQUIRED.
1437   my $str = "Camel ass came\x{301}l CAMEL horse cAm\0E\0L...";
1438   $Collator->gsubst($str, "camel", sub { "<b>$_[0]</b>" });
1439
1440   # now $str is "<b>Camel</b> ass <b>came\x{301}l</b> <b>CAMEL</b> horse <b>cAm\0E\0L</b>...";
1441   # i.e., all the camels are made bold-faced.
1442
1443 =back
1444
1445 =head2 Other Methods
1446
1447 =over 4
1448
1449 =item C<%old_tailoring = $Collator-E<gt>change(%new_tailoring)>
1450
1451 Change the value of specified keys and returns the changed part.
1452
1453     $Collator = Unicode::Collate->new(level => 4);
1454
1455     $Collator->eq("perl", "PERL"); # false
1456
1457     %old = $Collator->change(level => 2); # returns (level => 4).
1458
1459     $Collator->eq("perl", "PERL"); # true
1460
1461     $Collator->change(%old); # returns (level => 2).
1462
1463     $Collator->eq("perl", "PERL"); # false
1464
1465 Not all C<(key,value)>s are allowed to be changed.
1466 See also C<@Unicode::Collate::ChangeOK> and C<@Unicode::Collate::ChangeNG>.
1467
1468 In the scalar context, returns the modified collator
1469 (but it is B<not> a clone from the original).
1470
1471     $Collator->change(level => 2)->eq("perl", "PERL"); # true
1472
1473     $Collator->eq("perl", "PERL"); # true; now max level is 2nd.
1474
1475     $Collator->change(level => 4)->eq("perl", "PERL"); # false
1476
1477 =item UCA_Version
1478
1479 Returns the version number of UTS #10 this module consults.
1480
1481 =item Base_Unicode_Version
1482
1483 Returns the version number of the Unicode Standard
1484 this module is based on.
1485
1486 =back
1487
1488 =head2 EXPORT
1489
1490 None by default.
1491
1492 =head2 TODO
1493
1494 Unicode::Collate has not been ported to EBCDIC.
1495 IMHO, use of utf8::unicode_to_native()/utf8::native_to_unicode()
1496 at the proper postions should allow
1497 this module to work on EBCDIC platform...
1498
1499 =head2 CAVEAT
1500
1501 Use of the C<normalization> parameter requires
1502 the B<Unicode::Normalize> module.
1503
1504 If you need not it (say, in the case when you need not
1505 handle any combining characters),
1506 assign C<normalization =E<gt> undef> explicitly.
1507
1508 -- see 6.5 Avoiding Normalization, UTS #10.
1509
1510 =head2 Conformance Test
1511
1512 The Conformance Test for the UCA is provided
1513 in L<http://www.unicode.org/reports/tr10/CollationTest.html>
1514 and L<http://www.unicode.org/reports/tr10/CollationTest.zip>
1515
1516 For F<CollationTest_SHIFTED.txt>,
1517 a collator via C<Unicode::Collate-E<gt>new( )> should be used;
1518 for F<CollationTest_NON_IGNORABLE.txt>, a collator via
1519 C<Unicode::Collate-E<gt>new(alternate =E<gt> "non-ignorable", level =E<gt> 3)>.
1520
1521 B<Unicode::Normalize is required to try The Conformance Test.>
1522
1523 =head1 AUTHOR
1524
1525 SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
1526
1527   http://homepage1.nifty.com/nomenclator/perl/
1528
1529   Copyright(C) 2001-2002, SADAHIRO Tomoyuki. Japan. All rights reserved.
1530
1531   This library is free software; you can redistribute it
1532   and/or modify it under the same terms as Perl itself.
1533
1534 =head1 SEE ALSO
1535
1536 =over 4
1537
1538 =item http://www.unicode.org/reports/tr10/
1539
1540 Unicode Collation Algorithm - UTS #10
1541
1542 =item http://www.unicode.org/reports/tr10/allkeys.txt
1543
1544 The Default Unicode Collation Element Table
1545
1546 =item http://www.unicode.org/reports/tr10/CollationTest.html
1547 http://www.unicode.org/reports/tr10/CollationTest.zip
1548
1549 The latest versions of the conformance test for the UCA
1550
1551 =item http://www.unicode.org/reports/tr15/
1552
1553 Unicode Normalization Forms - UAX #15
1554
1555 =item L<Unicode::Normalize>
1556
1557 =back
1558
1559 =cut