lib/Unicode/Collate.pm

   1 package Unicode::Collate;
   2
   3 BEGIN {
   4     if (ord("A") == 193) {
   5         die "Unicode::Collate not ported to EBCDIC\n";
   6     }
   7 }
   8
   9 use 5.006;
  10 use strict;
  11 use warnings;
  12 use Carp;
  13 use File::Spec;
  14
  15 require Exporter;
  16
  17 # Supporting on EBCDIC platform is not tested.
  18 # Tester(s) welcome!
  19 our $IsEBCDIC = ord("A") != 0x41;
  20
  21 our $VERSION = '0.23';
  22 our $PACKAGE = __PACKAGE__;
  23
  24 our @ISA = qw(Exporter);
  25
  26 our %EXPORT_TAGS = ();
  27 our @EXPORT_OK = ();
  28 our @EXPORT = ();
  29
  30 (our $Path = $INC{'Unicode/Collate.pm'}) =~ s/\.pm$//;
  31 our $KeyFile = "allkeys.txt";
  32
  33 our $UNICODE_VERSION;
  34
  35 eval { require Unicode::UCD };
  36
  37 unless ($@) {
  38     $UNICODE_VERSION = Unicode::UCD::UnicodeVersion();
  39 }
  40 else { # XXX, Perl 5.6.1
  41     my($f, $fh);
  42     foreach my $d (@INC) {
  43         $f = File::Spec->catfile($d, "unicode", "Unicode.301");
  44         if (open($fh, $f)) {
  45             $UNICODE_VERSION = '3.0.1';
  46             close $fh;
  47             last;
  48         }
  49     }
  50 }
  51
  52 # Perl's boolean
  53 use constant TRUE  => 1;
  54 use constant FALSE => "";
  55 use constant NOMATCHPOS => -1;
  56
  57 # A coderef to get combining class imported from Unicode::Normalize
  58 # (i.e. \&Unicode::Normalize::getCombinClass).
  59 # This is also used as a HAS_UNICODE_NORMALIZE flag.
  60 our $getCombinClass;
  61
  62 # Minimum weights at level 2 and 3, respectively
  63 use constant Min2   => 0x20;
  64 use constant Min3   => 0x02;
  65
  66 # Shifted weight at 4th level
  67 use constant Shift4 => 0xFFFF;
  68
  69 # Variable weight at 1st level.
  70 # This is a negative value but should be regarded as zero on collation.
  71 # This is for distinction of variable chars from level 3 ignorable chars.
  72 use constant Var1 => -1;
  73
  74
  75 # A boolean for Variable and 16-bit weights at 4 levels of Collation Element
  76 # PROBLEM: The Default Unicode Collation Element Table
  77 # has weights over 0xFFFF at the 4th level.
  78 # The tie-breaking in the variable weights
  79 # other than "shift" (as well as "shift-trimmed") is unreliable.
  80 use constant VCE_TEMPLATE => 'Cn4';
  81
  82 # Unicode encoding of strings to be collated
  83 # TODO: 'N*' for UTF-32BE, 'V*' for UTF-32LE.
  84 use constant UTF_TEMPLATE => 'U*';
  85
  86 # A sort key: 16-bit weights
  87 # See also the PROBLEM on VCE_TEMPLATE above.
  88 use constant KEY_TEMPLATE => 'n*';
  89
  90 # Level separator in a sort key:
  91 # i.e. pack(KEY_TEMPLATE, 0)
  92 use constant LEVEL_SEP => "\0\0";
  93
  94 # As Unicode code point separator for hash keys.
  95 # A joined code point string (denoted by JCPS below)
  96 # like "65;768" is used for internal processing
  97 # instead of Perl's Unicode string like "\x41\x{300}",
  98 # as the native code point is different from the Unicode code point
  99 # on EBCDIC platform.
 100 # This character must not be included in any stringified
 101 # representation of an integer.
 102 use constant CODE_SEP => ';';
 103
 104 # boolean values of variable weights
 105 use constant NON_VAR => 0; # Non-Variable character
 106 use constant VAR     => 1; # Variable character
 107
 108 # Logical_Order_Exception in PropList.txt
 109 # TODO: synchronization with change of PropList.txt.
 110 our $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
 111
 112 sub UCA_Version { "9" }
 113
 114 sub Base_Unicode_Version { $UNICODE_VERSION || 'unknown' }
 115
 116 my (%AlternateOK);
 117 @AlternateOK{ qw/
 118     blanked  non-ignorable  shifted  shift-trimmed
 119   / } = ();
 120
 121 our @ChangeOK = qw/
 122     alternate backwards level normalization rearrange
 123     katakana_before_hiragana upper_before_lower
 124     overrideHangul overrideCJK preprocess UCA_Version
 125   /;
 126
 127 our @ChangeNG = qw/
 128     entry entries table combining maxlength
 129     ignoreChar ignoreName undefChar undefName
 130     versionTable alternateTable backwardsTable forwardsTable rearrangeTable
 131     derivCode normCode rearrangeHash L3_ignorable
 132   /;
 133 # The hash key 'ignored' is deleted at VERSION 0.21.
 134 # The hash key 'isShift' are deleted at VERSION 0.23.
 135
 136 my (%ChangeOK, %ChangeNG);
 137 @ChangeOK{ @ChangeOK } = ();
 138 @ChangeNG{ @ChangeNG } = ();
 139
 140 sub change {
 141     my $self = shift;
 142     my %hash = @_;
 143     my %old;
 144     foreach my $k (keys %hash) {
 145         if (exists $ChangeOK{$k}) {
 146             $old{$k} = $self->{$k};
 147             $self->{$k} = $hash{$k};
 148         }
 149         elsif (exists $ChangeNG{$k}) {
 150             croak "change of $k via change() is not allowed!";
 151         }
 152         # else => ignored
 153     }
 154     $self->checkCollator;
 155     return wantarray ? %old : $self;
 156 }
 157
 158 sub checkCollator {
 159     my $self = shift;
 160     croak "Illegal level lower than 1 (passed $self->{level})."
 161         if $self->{level} < 1;
 162     croak "A level higher than 4 (passed $self->{level}) is not supported."
 163         if 4 < $self->{level};
 164
 165     $self->{derivCode} =
 166         $self->{UCA_Version} ==  8 ? \&_derivCE_8 :
 167         $self->{UCA_Version} ==  9 ? \&_derivCE_9 :
 168       croak "Illegal UCA version (passed $self->{UCA_Version}).";
 169
 170     $self->{alternate} = lc($self->{alternate});
 171     croak "$PACKAGE unknown alternate tag name: $self->{alternate}"
 172         unless exists $AlternateOK{ $self->{alternate} };
 173
 174     $self->{backwards} = []
 175         if ! defined $self->{backwards};
 176     $self->{backwards} = [ $self->{backwards} ]
 177         if ! ref $self->{backwards};
 178
 179     $self->{rearrange} = []
 180         if ! defined $self->{rearrange};
 181     croak "$PACKAGE: A list for rearrangement must be store in an ARRAYREF"
 182         if ! ref $self->{rearrange};
 183
 184     # keys of $self->{rearrangeHash} are $self->{rearrange}.
 185     $self->{rearrangeHash} = undef;
 186
 187     if (@{ $self->{rearrange} }) {
 188         @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
 189     }
 190
 191     $self->{normCode} = undef;
 192
 193     if (defined $self->{normalization}) {
 194         eval { require Unicode::Normalize };
 195         croak "Unicode/Normalize.pm is required to normalize strings: $@"
 196             if $@;
 197
 198         Unicode::Normalize->import();
 199         $getCombinClass = \&Unicode::Normalize::getCombinClass
 200             if ! $getCombinClass;
 201
 202         $self->{normCode} =
 203             $self->{normalization} =~ /^(?:NF)?C$/  ? \&NFC :
 204             $self->{normalization} =~ /^(?:NF)?D$/  ? \&NFD :
 205             $self->{normalization} =~ /^(?:NF)?KC$/ ? \&NFKC :
 206             $self->{normalization} =~ /^(?:NF)?KD$/ ? \&NFKD :
 207           croak "$PACKAGE unknown normalization form name: "
 208                 . $self->{normalization};
 209     }
 210     return;
 211 }
 212
 213 sub new
 214 {
 215     my $class = shift;
 216     my $self = bless { @_ }, $class;
 217
 218     # If undef is passed explicitly, no file is read.
 219     $self->{table} = $KeyFile if ! exists $self->{table};
 220     $self->read_table if defined $self->{table};
 221
 222     if ($self->{entry}) {
 223         $self->parseEntry($_) foreach split /\n/, $self->{entry};
 224     }
 225
 226     $self->{level} ||= 4;
 227     $self->{UCA_Version} ||= UCA_Version();
 228
 229     $self->{overrideHangul} = ''
 230         if ! exists $self->{overrideHangul};
 231     $self->{overrideCJK} = ''
 232         if ! exists $self->{overrideCJK};
 233     $self->{normalization} = 'D'
 234         if ! exists $self->{normalization};
 235     $self->{alternate} = $self->{alternateTable} || 'shifted'
 236         if ! exists $self->{alternate};
 237     $self->{rearrange} = $self->{rearrangeTable} || $DefaultRearrange
 238         if ! exists $self->{rearrange};
 239     $self->{backwards} = $self->{backwardsTable}
 240         if ! exists $self->{backwards};
 241
 242     $self->checkCollator;
 243
 244     return $self;
 245 }
 246
 247 sub read_table {
 248     my $self = shift;
 249     my $file = $self->{table} ne '' ? $self->{table} : $KeyFile;
 250
 251     my $filepath = File::Spec->catfile($Path, $file);
 252     open my $fk, "<$filepath"
 253         or croak "File does not exist at $filepath";
 254
 255     while (<$fk>) {
 256         next if /^\s*#/;
 257         if (/^\s*\@/) {
 258             if    (/^\s*\@version\s*(\S*)/) {
 259                 $self->{versionTable} ||= $1;
 260             }
 261             elsif (/^\s*\@alternate\s+(\S*)/) {
 262                 $self->{alternateTable} ||= $1;
 263             }
 264             elsif (/^\s*\@backwards\s+(\S*)/) {
 265                 push @{ $self->{backwardsTable} }, $1;
 266             }
 267             elsif (/^\s*\@forwards\s+(\S*)/) { # parhaps no use
 268                 push @{ $self->{forwardsTable} }, $1;
 269             }
 270             elsif (/^\s*\@rearrange\s+(.*)/) { # (\S*) is NG
 271                 push @{ $self->{rearrangeTable} }, _getHexArray($1);
 272             }
 273             next;
 274         }
 275         $self->parseEntry($_);
 276     }
 277     close $fk;
 278 }
 279
 280
 281 ##
 282 ## get $line, parse it, and write an entry in $self
 283 ##
 284 sub parseEntry
 285 {
 286     my $self = shift;
 287     my $line = shift;
 288     my($name, $entry, @uv, @key);
 289
 290     return if $line !~ /^\s*[0-9A-Fa-f]/;
 291
 292     # removes comment and gets name
 293     $name = $1
 294         if $line =~ s/[#%]\s*(.*)//;
 295     return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
 296
 297     # gets element
 298     my($e, $k) = split /;/, $line;
 299     croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
 300         if ! $k;
 301
 302     @uv = _getHexArray($e);
 303     return if !@uv;
 304
 305     $entry = join(CODE_SEP, @uv); # in JCPS
 306
 307     if (defined $self->{undefChar} || defined $self->{ignoreChar}) {
 308         # Do not use UTF_TEMPLATE; Perl' RE is only for utf8.
 309         my $ele = $IsEBCDIC
 310             ? pack('U*', map utf8::unicode_to_native($_), @uv)
 311             : pack('U*', @uv);
 312
 313         # regarded as if it were not entried in the table
 314         return
 315             if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
 316
 317         # replaced as completely ignorable
 318         $k = '[.0000.0000.0000.0000]'
 319             if defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/;
 320     }
 321
 322     # replaced as completely ignorable
 323     $k = '[.0000.0000.0000.0000]'
 324         if defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/;
 325
 326     my $combining = TRUE; # primary = 0, secondary != 0;
 327     my $is_L3_ignorable;
 328
 329     foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
 330         my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
 331         my @wt = _getHexArray($arr);
 332         push @key, pack(VCE_TEMPLATE, $var, @wt);
 333         $combining = FALSE
 334             unless $wt[0] == 0 && $wt[1] != 0;
 335         $is_L3_ignorable = TRUE
 336             if $wt[0] + $wt[1] + $wt[2] == 0;
 337           # if $arr !~ /[1-9A-Fa-f]/; NG
 338           # Conformance Test shows L3-ignorable is completely ignorable.
 339     }
 340
 341     $self->{entries}{$entry} = \@key;
 342
 343     $self->{combining}{$entry} = TRUE
 344         if $combining;
 345
 346     # The key is a string representing a numeral code point.
 347
 348     $self->{L3_ignorable}{$uv[0]} = TRUE
 349         if @uv == 1 && $is_L3_ignorable;
 350
 351     # Contraction is to be considered in the range of this maxlength.
 352     $self->{maxlength}{$uv[0]} = scalar @uv
 353         if @uv > 1;
 354 }
 355
 356 ##
 357 ## arrayref[weights] = altCE(bool variable?, list[num] weights)
 358 ##
 359 sub altCE
 360 {
 361     my $self = shift;
 362     my($var, @wt) = unpack(VCE_TEMPLATE, shift);
 363
 364     $self->{alternate} eq 'blanked' ?
 365         $var ? [Var1, 0, 0, $wt[3]] : \@wt :
 366     $self->{alternate} eq 'non-ignorable' ?
 367         \@wt :
 368     $self->{alternate} eq 'shifted' ?
 369         $var ? [Var1, 0, 0, $wt[0] ]
 370              : [ @wt[0..2], $wt[0]+$wt[1]+$wt[2] ? Shift4 : 0 ] :
 371     $self->{alternate} eq 'shift-trimmed' ?
 372         $var ? [Var1, 0, 0, $wt[0] ] : [ @wt[0..2], 0 ] :
 373         croak "$PACKAGE unknown alternate name: $self->{alternate}";
 374 }
 375
 376 sub viewSortKey
 377 {
 378     my $self = shift;
 379     my $ver = $self->{UCA_Version};
 380
 381     my $key  = $self->getSortKey(@_);
 382     my $view = join " ", map sprintf("%04X", $_), unpack(KEY_TEMPLATE, $key);
 383
 384     if ($ver <= 8) {
 385         $view =~ s/ ?0000 ?/|/g;
 386     } else {
 387         $view =~ s/\b0000\b/|/g;
 388     }
 389     return "[$view]";
 390 }
 391
 392
 393 ##
 394 ## arrayref of JCPS   = splitCE(string to be collated)
 395 ## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitCE(string, true)
 396 ##
 397 sub splitCE
 398 {
 399     my $self = shift;
 400     my $wLen = $_[1];
 401
 402     my $code = $self->{preprocess};
 403     my $norm = $self->{normCode};
 404     my $ent  = $self->{entries};
 405     my $max  = $self->{maxlength};
 406     my $reH  = $self->{rearrangeHash};
 407     my $ign  = $self->{L3_ignorable};
 408     my $ver9 = $self->{UCA_Version} > 8;
 409
 410     my ($str, @buf);
 411
 412     if ($wLen) {
 413         $code and croak "Preprocess breaks character positions. "
 414                         . "Don't use with index(), match(), etc.";
 415         $norm and croak "Normalization breaks character positions. "
 416                         . "Don't use with index(), match(), etc.";
 417         $str = $_[0];
 418     }
 419     else {
 420         $str = $_[0];
 421         $str = &$code($str) if ref $code;
 422         $str = &$norm($str) if ref $norm;
 423     }
 424
 425     # get array of Unicode code point of string.
 426     my @src = $IsEBCDIC
 427         ? map(utf8::native_to_unicode($_), unpack UTF_TEMPLATE, $str)
 428         : unpack(UTF_TEMPLATE, $str);
 429
 430     # rearrangement:
 431     # Character positions are not kept if rearranged,
 432     # then neglected if $wLen is true.
 433     if ($reH && ! $wLen) {
 434         for (my $i = 0; $i < @src; $i++) {
 435             if (exists $reH->{ $src[$i] } && $i + 1 < @src) {
 436                 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]);
 437                 $i++;
 438             }
 439         }
 440     }
 441
 442     if ($ver9) {
 443         # To remove a character marked as a completely ignorable.
 444         for (my $i = 0; $i < @src; $i++) {
 445             $src[$i] = undef if $ign->{ $src[$i] };
 446         }
 447     }
 448
 449     for (my $i = 0; $i < @src; $i++) {
 450         next if _isNonCharacter($src[$i]);
 451
 452         my $i_orig = $i;
 453         my $ce = $src[$i];
 454
 455         if ($max->{$ce}) { # contract
 456             my $temp_ce = $ce;
 457
 458             for (my $p = $i + 1; $p < @src; $p++) {
 459                 next if ! defined $src[$p];
 460                 $temp_ce .= CODE_SEP . $src[$p];
 461                 if ($ent->{$temp_ce}) {
 462                     $ce = $temp_ce;
 463                     $i = $p;
 464                 }
 465             }
 466         }
 467
 468         # with Combining Char (UTS#10, 4.2.1).
 469         # requires Unicode::Normalize.
 470         # Not be $wLen, as not croaked due to $norm.
 471         if ($getCombinClass) {
 472             for (my $p = $i + 1; $p < @src; $p++) {
 473                 next if ! defined $src[$p];
 474                 last unless $getCombinClass->($src[$p]);
 475                 my $tail = CODE_SEP . $src[$p];
 476                 if ($ent->{$ce.$tail}) {
 477                     $ce .= $tail;
 478                     $src[$p] = undef;
 479                 }
 480             }
 481         }
 482
 483         if ($wLen) {
 484             for (my $p = $i + 1; $p < @src; $p++) {
 485                 last if defined $src[$p];
 486                 $i = $p;
 487             }
 488         }
 489
 490         push @buf, $wLen ? [$ce, $i_orig, $i + 1] : $ce;
 491     }
 492     return \@buf;
 493 }
 494
 495
 496 ##
 497 ## list of arrayrefs of weights = getWt(JCPS)
 498 ##
 499 sub getWt
 500 {
 501     my $self = shift;
 502     my $ce   = shift;
 503     my $ent  = $self->{entries};
 504     my $cjk  = $self->{overrideCJK};
 505     my $hang = $self->{overrideHangul};
 506     my $der  = $self->{derivCode};
 507
 508     return if !defined $ce;
 509     return map($self->altCE($_), @{ $ent->{$ce} })
 510         if $ent->{$ce};
 511
 512     # CE must not be a contraction, then it's a code point.
 513     my $u = $ce;
 514
 515     if (0xAC00 <= $u && $u <= 0xD7A3) { # is Hangul Syllale
 516         return map $self->altCE($_),
 517             $hang
 518                 ? map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$hang($u))
 519                 : defined $hang
 520                     ? map({
 521                             $ent->{$_} ? @{ $ent->{$_} } : $der->($_);
 522                         } _decompHangul($u))
 523                     : $der->($u);
 524     }
 525     elsif (0x3400 <= $u && $u <= 0x4DB5 ||
 526            0x4E00 <= $u && $u <= 0x9FA5 ||
 527            0x20000 <= $u && $u <= 0x2A6D6) { # CJK Ideograph
 528         return map $self->altCE($_),
 529             $cjk
 530                 ? map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$cjk($u))
 531                 : defined $cjk && $self->{UCA_Version} <= 8 && $u < 0x10000
 532                     ? pack(VCE_TEMPLATE, NON_VAR, $u, Min2, Min3, $u)
 533                     : $der->($u);
 534     }
 535     else {
 536         return map $self->altCE($_), $der->($u);
 537     }
 538 }
 539
 540
 541 ##
 542 ## string sortkey = getSortKey(string arg)
 543 ##
 544 sub getSortKey
 545 {
 546     my $self = shift;
 547     my $lev  = $self->{level};
 548     my $rCE  = $self->splitCE(shift); # get an arrayref of JCPS
 549     my $ver9 = $self->{UCA_Version} > 8;
 550     my $v2i  = $self->{alternate} ne 'non-ignorable';
 551
 552     # weight arrays
 553     my (@buf, $last_is_variable);
 554
 555     foreach my $wt (map $self->getWt($_), @$rCE) {
 556         if ($v2i && $ver9) {
 557             if ($wt->[0] == 0) { # ignorable
 558                 next if $last_is_variable;
 559             } else {
 560                 $last_is_variable = ($wt->[0] == Var1);
 561             }
 562         }
 563         push @buf, $wt;
 564     }
 565
 566     # make sort key
 567     my @ret = ([],[],[],[]);
 568     foreach my $v (0..$lev-1) {
 569         foreach my $b (@buf) {
 570             push @{ $ret[$v] }, $b->[$v]
 571                 if 0 < $b->[$v];
 572         }
 573     }
 574     foreach (@{ $self->{backwards} }) {
 575         my $v = $_ - 1;
 576         @{ $ret[$v] } = reverse @{ $ret[$v] };
 577     }
 578
 579     # modification of tertiary weights
 580     if ($self->{upper_before_lower}) {
 581         foreach (@{ $ret[2] }) {
 582             if    (0x8 <= $_ && $_ <= 0xC) { $_ -= 6 } # lower
 583             elsif (0x2 <= $_ && $_ <= 0x6) { $_ += 6 } # upper
 584             elsif ($_ == 0x1C)             { $_ += 1 } # square upper
 585             elsif ($_ == 0x1D)             { $_ -= 1 } # square lower
 586         }
 587     }
 588     if ($self->{katakana_before_hiragana}) {
 589         foreach (@{ $ret[2] }) {
 590             if    (0x0F <= $_ && $_ <= 0x13) { $_ -= 2 } # katakana
 591             elsif (0x0D <= $_ && $_ <= 0x0E) { $_ += 5 } # hiragana
 592         }
 593     }
 594     join LEVEL_SEP, map pack(KEY_TEMPLATE, @$_), @ret;
 595 }
 596
 597
 598 ##
 599 ## int compare = cmp(string a, string b)
 600 ##
 601 sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
 602 sub eq  { $_[0]->getSortKey($_[1]) eq  $_[0]->getSortKey($_[2]) }
 603 sub ne  { $_[0]->getSortKey($_[1]) ne  $_[0]->getSortKey($_[2]) }
 604 sub lt  { $_[0]->getSortKey($_[1]) lt  $_[0]->getSortKey($_[2]) }
 605 sub le  { $_[0]->getSortKey($_[1]) le  $_[0]->getSortKey($_[2]) }
 606 sub gt  { $_[0]->getSortKey($_[1]) gt  $_[0]->getSortKey($_[2]) }
 607 sub ge  { $_[0]->getSortKey($_[1]) ge  $_[0]->getSortKey($_[2]) }
 608
 609 ##
 610 ## list[strings] sorted = sort(list[strings] arg)
 611 ##
 612 sub sort {
 613     my $obj = shift;
 614     return
 615         map { $_->[1] }
 616             sort{ $a->[0] cmp $b->[0] }
 617                 map [ $obj->getSortKey($_), $_ ], @_;
 618 }
 619
 620
 621 sub _derivCE_9 {
 622     my $u = shift;
 623     my $base =
 624         (0x4E00 <= $u && $u <= 0x9FA5)
 625             ? 0xFB40 : # CJK
 626         (0x3400 <= $u && $u <= 0x4DB5 || 0x20000 <= $u && $u <= 0x2A6D6)
 627             ? 0xFB80   # CJK ext.
 628             : 0xFBC0;  # others
 629
 630     my $aaaa = $base + ($u >> 15);
 631     my $bbbb = ($u & 0x7FFF) | 0x8000;
 632     return
 633         pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2, Min3, $u),
 634         pack(VCE_TEMPLATE, NON_VAR, $bbbb,    0,    0, $u);
 635 }
 636
 637 sub _derivCE_8 {
 638     my $code = shift;
 639     my $aaaa =  0xFF80 + ($code >> 15);
 640     my $bbbb = ($code & 0x7FFF) | 0x8000;
 641     return
 642         pack(VCE_TEMPLATE, NON_VAR, $aaaa, 2, 1, $code),
 643         pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $code);
 644 }
 645
 646 ##
 647 ## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
 648 ##
 649 sub _getHexArray { map hex, $_[0] =~ /([0-9a-fA-F]+)/g }
 650
 651 #
 652 # $code *must* be in Hangul syllable.
 653 # Check it before you enter here.
 654 #
 655 sub _decompHangul {
 656     my $code = shift;
 657     my $SIndex = $code - 0xAC00;
 658     my $LIndex = int( $SIndex / 588);
 659     my $VIndex = int(($SIndex % 588) / 28);
 660     my $TIndex =      $SIndex % 28;
 661     return (
 662         0x1100 + $LIndex,
 663         0x1161 + $VIndex,
 664         $TIndex ? (0x11A7 + $TIndex) : (),
 665     );
 666 }
 667
 668 sub _isNonCharacter {
 669     my $code = shift;
 670     return ! defined $code                      # removed
 671         || ($code < 0 || 0x10FFFF < $code)      # out of range
 672         || (($code & 0xFFFE) == 0xFFFE)         # ??FFF[EF] (cf. utf8.c)
 673         || (0xD800 <= $code && $code <= 0xDFFF) # unpaired surrogates
 674         || (0xFDD0 <= $code && $code <= 0xFDEF) # other non-characters
 675     ;
 676 }
 677
 678
 679 ##
 680 ## bool _nonIgnorAtLevel(arrayref weights, int level)
 681 ##
 682 sub _nonIgnorAtLevel($$)
 683 {
 684     my $wt = shift;
 685     return if ! defined $wt;
 686     my $lv = shift;
 687     return grep($wt->[$_] != 0, 0..$lv-1) ? TRUE : FALSE;
 688 }
 689
 690 ##
 691 ## bool _eqArray(
 692 ##    arrayref of arrayref[weights] source,
 693 ##    arrayref of arrayref[weights] substr,
 694 ##    int level)
 695 ## * comparison of graphemes vs graphemes.
 696 ##   @$source >= @$substr must be true (check it before call this);
 697 ##
 698 sub _eqArray($$$)
 699 {
 700     my $source = shift;
 701     my $substr = shift;
 702     my $lev = shift;
 703
 704     for my $g (0..@$substr-1){
 705         # Do the $g'th graphemes have the same number of AV weigths?
 706         return if @{ $source->[$g] } != @{ $substr->[$g] };
 707
 708         for my $w (0..@{ $substr->[$g] }-1) {
 709             for my $v (0..$lev-1) {
 710                 return if $source->[$g][$w][$v] != $substr->[$g][$w][$v];
 711             }
 712         }
 713     }
 714     return 1;
 715 }
 716
 717 ##
 718 ## (int position, int length)
 719 ## int position = index(string, substring, position, [undoc'ed grobal])
 720 ##
 721 ## With "grobal" (only for the list context),
 722 ##  returns list of arrayref[position, length].
 723 ##
 724 sub index
 725 {
 726     my $self  = shift;
 727     my $str   = shift;
 728     my $len   = length($str);
 729     my $subCE = $self->splitCE(shift);
 730     my $pos   = @_ ? shift : 0;
 731        $pos   = 0 if $pos < 0;
 732     my $grob  = shift;
 733
 734     my $comb  = $self->{combining};
 735     my $lev   = $self->{level};
 736     my $ver9  = $self->{UCA_Version} > 8;
 737     my $v2i   = $self->{alternate} ne 'non-ignorable';
 738
 739     if (! @$subCE) {
 740         my $temp = $pos <= 0 ? 0 : $len <= $pos ? $len : $pos;
 741         return $grob
 742             ? map([$_, 0], $temp..$len)
 743             : wantarray ? ($temp,0) : $temp;
 744     }
 745     if ($len < $pos) {
 746         return wantarray ? () : NOMATCHPOS;
 747     }
 748     my $strCE = $self->splitCE($pos ? substr($str, $pos) : $str, TRUE);
 749     if (! @$strCE) {
 750         return wantarray ? () : NOMATCHPOS;
 751     }
 752     my $last_is_variable;
 753     my(@strWt, @iniPos, @finPos, @subWt, @g_ret);
 754
 755     $last_is_variable = FALSE;
 756     for my $wt (map $self->getWt($_), @$subCE) {
 757         my $to_be_pushed = _nonIgnorAtLevel($wt,$lev);
 758
 759         if ($v2i && $ver9) {
 760             if ($wt->[0] == 0) {
 761                 $to_be_pushed = FALSE if $last_is_variable;
 762             } else {
 763                 $last_is_variable = ($wt->[0] == Var1);
 764             }
 765         }
 766
 767         if (@subWt && $wt->[0] == 0) {
 768             push @{ $subWt[-1] }, $wt if $to_be_pushed;
 769         } else {
 770             $wt->[0] = 0 if $wt->[0] == Var1;
 771             push @subWt, [ $wt ];
 772         }
 773     }
 774
 775     my $count = 0;
 776     my $end = @$strCE - 1;
 777
 778     $last_is_variable = FALSE;
 779
 780     for (my $i = 0; $i <= $end; ) { # no $i++
 781         my $found_base = 0;
 782
 783         # fetch a grapheme
 784         while ($i <= $end && $found_base == 0) {
 785             for my $wt ($self->getWt($strCE->[$i][0])) {
 786                 my $to_be_pushed = _nonIgnorAtLevel($wt,$lev);
 787
 788                 if ($v2i && $ver9) {
 789                     if ($wt->[0] == 0) {
 790                         $to_be_pushed = FALSE if $last_is_variable;
 791                     } else {
 792                         $last_is_variable = ($wt->[0] == Var1);
 793                     }
 794                 }
 795
 796                 if (@strWt && $wt->[0] == 0) {
 797                     push @{ $strWt[-1] }, $wt if $to_be_pushed;
 798                     $finPos[-1] = $strCE->[$i][2];
 799                 } elsif ($to_be_pushed) {
 800                     $wt->[0] = 0 if $wt->[0] == Var1;
 801                     push @strWt,  [ $wt ];
 802                     push @iniPos, $found_base ? NOMATCHPOS : $strCE->[$i][1];
 803                     $finPos[-1] = NOMATCHPOS if $found_base;
 804                     push @finPos, $strCE->[$i][2];
 805                     $found_base++;
 806                 }
 807                 # else ===> no-op
 808             }
 809             $i++;
 810         }
 811
 812         # try to match
 813         while ( @strWt > @subWt || (@strWt == @subWt && $i > $end) ) {
 814             if ($iniPos[0] != NOMATCHPOS &&
 815                     $finPos[$#subWt] != NOMATCHPOS &&
 816                         _eqArray(\@strWt, \@subWt, $lev)) {
 817                 my $temp = $iniPos[0] + $pos;
 818
 819                 if ($grob) {
 820                     push @g_ret, [$temp, $finPos[$#subWt] - $iniPos[0]];
 821                     splice @strWt,  0, $#subWt;
 822                     splice @iniPos, 0, $#subWt;
 823                     splice @finPos, 0, $#subWt;
 824                 }
 825                 else {
 826                     return wantarray
 827                         ? ($temp, $finPos[$#subWt] - $iniPos[0])
 828                         :  $temp;
 829                 }
 830             }
 831             shift @strWt;
 832             shift @iniPos;
 833             shift @finPos;
 834         }
 835     }
 836
 837     return $grob
 838         ? @g_ret
 839         : wantarray ? () : NOMATCHPOS;
 840 }
 841
 842 ##
 843 ## scalarref to matching part = match(string, substring)
 844 ##
 845 sub match
 846 {
 847     my $self = shift;
 848     if (my($pos,$len) = $self->index($_[0], $_[1])) {
 849         my $temp = substr($_[0], $pos, $len);
 850         return wantarray ? $temp : \$temp;
 851         # An lvalue ref \substr should be avoided,
 852         # since its value is affected by modification of its referent.
 853     }
 854     else {
 855         return;
 856     }
 857 }
 858
 859 ##
 860 ## arrayref matching parts = gmatch(string, substring)
 861 ##
 862 sub gmatch
 863 {
 864     my $self = shift;
 865     my $str  = shift;
 866     my $sub  = shift;
 867     return map substr($str, $_->[0], $_->[1]),
 868                 $self->index($str, $sub, 0, 'g');
 869 }
 870
 871 ##
 872 ## bool subst'ed = subst(string, substring, replace)
 873 ##
 874 sub subst
 875 {
 876     my $self = shift;
 877     my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
 878
 879     if (my($pos,$len) = $self->index($_[0], $_[1])) {
 880         if ($code) {
 881             my $mat = substr($_[0], $pos, $len);
 882             substr($_[0], $pos, $len, $code->($mat));
 883         } else {
 884             substr($_[0], $pos, $len, $_[2]);
 885         }
 886         return TRUE;
 887     }
 888     else {
 889         return FALSE;
 890     }
 891 }
 892
 893 ##
 894 ## int count = gsubst(string, substring, replace)
 895 ##
 896 sub gsubst
 897 {
 898     my $self = shift;
 899     my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
 900     my $cnt = 0;
 901
 902     # Replacement is carried out from the end, then use reverse.
 903     for my $pos_len (reverse $self->index($_[0], $_[1], 0, 'g')) {
 904         if ($code) {
 905             my $mat = substr($_[0], $pos_len->[0], $pos_len->[1]);
 906             substr($_[0], $pos_len->[0], $pos_len->[1], $code->($mat));
 907         } else {
 908             substr($_[0], $pos_len->[0], $pos_len->[1], $_[2]);
 909         }
 910         $cnt++;
 911     }
 912     return $cnt;
 913 }
 914
 915 1;
 916 __END__
 917
 918 =head1 NAME
 919
 920 Unicode::Collate - Unicode Collation Algorithm
 921
 922 =head1 SYNOPSIS
 923
 924   use Unicode::Collate;
 925
 926   #construct
 927   $Collator = Unicode::Collate->new(%tailoring);
 928
 929   #sort
 930   @sorted = $Collator->sort(@not_sorted);
 931
 932   #compare
 933   $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
 934
 935 =head1 DESCRIPTION
 936
 937 This module is an implementation
 938 of Unicode Technical Standard #10 (UTS #10)
 939 "Unicode Collation Algorithm."
 940
 941 =head2 Constructor and Tailoring
 942
 943 The C<new> method returns a collator object.
 944
 945    $Collator = Unicode::Collate->new(
 946       UCA_Version => $UCA_Version,
 947       alternate => $alternate,
 948       backwards => $levelNumber, # or \@levelNumbers
 949       entry => $element,
 950       normalization  => $normalization_form,
 951       ignoreName => qr/$ignoreName/,
 952       ignoreChar => qr/$ignoreChar/,
 953       katakana_before_hiragana => $bool,
 954       level => $collationLevel,
 955       overrideCJK => \&overrideCJK,
 956       overrideHangul => \&overrideHangul,
 957       preprocess => \&preprocess,
 958       rearrange => \@charList,
 959       table => $filename,
 960       undefName => qr/$undefName/,
 961       undefChar => qr/$undefChar/,
 962       upper_before_lower => $bool,
 963    );
 964    # if %tailoring is false (i.e. empty),
 965    # $Collator should do the default collation.
 966
 967 =over 4
 968
 969 =item UCA_Version
 970
 971 If the version number of the older UCA is given,
 972 the older behavior of that version is emulated on collating.
 973 If omitted, the return value of C<UCA_Version()> is used.
 974
 975 The supported version: 8 or 9.
 976
 977 B<This parameter may be removed in the future version,
 978 as switching the algorithm would affect the performance.>
 979
 980 =item alternate
 981
 982 -- see 3.2.2 Variable Weighting, UTS #10.
 983
 984 (the title in UCA version 8: Alternate Weighting)
 985
 986 This key allows to alternate weighting for variable collation elements,
 987 which are marked with an ASTERISK in the table
 988 (NOTE: Many punction marks and symbols are variable in F<allkeys.txt>).
 989
 990    alternate => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
 991
 992 These names are case-insensitive.
 993 By default (if specification is omitted), 'shifted' is adopted.
 994
 995    'Blanked'        Variable elements are made ignorable at levels 1 through 3;
 996                     considered at the 4th level.
 997
 998    'Non-ignorable'  Variable elements are not reset to ignorable.
 999
1000    'Shifted'        Variable elements are made ignorable at levels 1 through 3
1001                     their level 4 weight is replaced by the old level 1 weight.
1002                     Level 4 weight for Non-Variable elements is 0xFFFF.
1003
1004    'Shift-Trimmed'  Same as 'shifted', but all FFFF's at the 4th level
1005                     are trimmed.
1006
1007 =item backwards
1008
1009 -- see 3.1.2 French Accents, UTS #10.
1010
1011      backwards => $levelNumber or \@levelNumbers
1012
1013 Weights in reverse order; ex. level 2 (diacritic ordering) in French.
1014 If omitted, forwards at all the levels.
1015
1016 =item entry
1017
1018 -- see 3.1 Linguistic Features; 3.2.1 File Format, UTS #10.
1019
1020 Overrides a default order or defines additional collation elements
1021
1022   entry => <<'ENTRIES', # use the UCA file format
1023 00E6 ; [.0861.0020.0002.00E6] [.08B1.0020.0002.00E6] # ligature <ae> as <a><e>
1024 0063 0068 ; [.0893.0020.0002.0063]      # "ch" in traditional Spanish
1025 0043 0068 ; [.0893.0020.0008.0043]      # "Ch" in traditional Spanish
1026 ENTRIES
1027
1028 B<NOTE:> The code point in the UCA file format (before C<';'>)
1029 B<must> be a Unicode code point, but not a native code point.
1030 So C<0063> must always denote C<U+0063>,
1031 but not a character of C<"\x63">.
1032
1033 =item ignoreName
1034
1035 =item ignoreChar
1036
1037 -- see Completely Ignorable, 3.2.2 Variable Weighting, UTS #10.
1038
1039 Makes the entry in the table completely ignorable;
1040 i.e. as if the weights were zero at all level.
1041
1042 E.g. when 'a' and 'e' are ignorable,
1043 'element' is equal to 'lament' (or 'lmnt').
1044
1045 =item level
1046
1047 -- see 4.3 Form a sort key for each string, UTS #10.
1048
1049 Set the maximum level.
1050 Any higher levels than the specified one are ignored.
1051
1052   Level 1: alphabetic ordering
1053   Level 2: diacritic ordering
1054   Level 3: case ordering
1055   Level 4: tie-breaking (e.g. in the case when alternate is 'shifted')
1056
1057   ex.level => 2,
1058
1059 If omitted, the maximum is the 4th.
1060
1061 =item normalization
1062
1063 -- see 4.1 Normalize each input string, UTS #10.
1064
1065 If specified, strings are normalized before preparation of sort keys
1066 (the normalization is executed after preprocess).
1067
1068 As a form name, one of the following names must be used.
1069
1070   'C'  or 'NFC'  for Normalization Form C
1071   'D'  or 'NFD'  for Normalization Form D
1072   'KC' or 'NFKC' for Normalization Form KC
1073   'KD' or 'NFKD' for Normalization Form KD
1074
1075 If omitted, the string is put into Normalization Form D.
1076
1077 If C<undef> is passed explicitly as the value for this key,
1078 any normalization is not carried out (this may make tailoring easier
1079 if any normalization is not desired).
1080
1081 see B<CAVEAT>.
1082
1083 =item overrideCJK
1084
1085 -- see 7.1 Derived Collation Elements, UTS #10.
1086
1087 By default, mapping of CJK Unified Ideographs
1088 uses the Unicode codepoint order.
1089 But the mapping of CJK Unified Ideographs may be overrided.
1090
1091 ex. CJK Unified Ideographs in the JIS code point order.
1092
1093   overrideCJK => sub {
1094       my $u = shift;             # get a Unicode codepoint
1095       my $b = pack('n', $u);     # to UTF-16BE
1096       my $s = your_unicode_to_sjis_converter($b); # convert
1097       my $n = unpack('n', $s);   # convert sjis to short
1098       [ $n, 0x20, 0x2, $u ];     # return the collation element
1099   },
1100
1101 ex. ignores all CJK Unified Ideographs.
1102
1103   overrideCJK => sub {()}, # CODEREF returning empty list
1104
1105    # where ->eq("Pe\x{4E00}rl", "Perl") is true
1106    # as U+4E00 is a CJK Unified Ideograph and to be ignorable.
1107
1108 If C<undef> is passed explicitly as the value for this key,
1109 weights for CJK Unified Ideographs are treated as undefined.
1110 But assignment of weight for CJK Unified Ideographs
1111 in table or L<entry> is still valid.
1112
1113 =item overrideHangul
1114
1115 -- see 7.1 Derived Collation Elements, UTS #10.
1116
1117 By default, Hangul Syllables are decomposed into Hangul Jamo.
1118 But the mapping of Hangul Syllables may be overrided.
1119
1120 This tag works like L<overrideCJK>, so see there for examples.
1121
1122 If you want to override the mapping of Hangul Syllables,
1123 the Normalization Forms D and KD are not appropriate
1124 (they will be decomposed before overriding).
1125
1126 If C<undef> is passed explicitly as the value for this key,
1127 weight for Hangul Syllables is treated as undefined
1128 without decomposition into Hangul Jamo.
1129 But definition of weight for Hangul Syllables
1130 in table or L<entry> is still valid.
1131
1132 =item preprocess
1133
1134 -- see 5.1 Preprocessing, UTS #10.
1135
1136 If specified, the coderef is used to preprocess
1137 before the formation of sort keys.
1138
1139 ex. dropping English articles, such as "a" or "the".
1140 Then, "the pen" is before "a pencil".
1141
1142      preprocess => sub {
1143            my $str = shift;
1144            $str =~ s/\b(?:an?|the)\s+//gi;
1145            $str;
1146         },
1147
1148 =item rearrange
1149
1150 -- see 3.1.3 Rearrangement, UTS #10.
1151
1152 Characters that are not coded in logical order and to be rearranged.
1153 By default,
1154
1155     rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
1156
1157 If you want to disallow any rearrangement,
1158 pass C<undef> or C<[]> (a reference to an empty list)
1159 as the value for this key.
1160
1161 B<According to the version 9 of UCA, this parameter shall not be used;
1162 but it is not warned at present.>
1163
1164 =item table
1165
1166 -- see 3.2 Default Unicode Collation Element Table, UTS #10.
1167
1168 You can use another element table if desired.
1169 The table file must be in your C<lib/Unicode/Collate> directory.
1170
1171 By default, the file C<lib/Unicode/Collate/allkeys.txt> is used.
1172
1173 If C<undef> is passed explicitly as the value for this key,
1174 no file is read (but you can define collation elements via L<entry>).
1175
1176 A typical way to define a collation element table
1177 without any file of table:
1178
1179    $onlyABC = Unicode::Collate->new(
1180        table => undef,
1181        entry => << 'ENTRIES',
1182 0061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A
1183 0041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A
1184 0062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B
1185 0042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B
1186 0063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C
1187 0043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C
1188 ENTRIES
1189     );
1190
1191 =item undefName
1192
1193 =item undefChar
1194
1195 -- see 6.3.4 Reducing the Repertoire, UTS #10.
1196
1197 Undefines the collation element as if it were unassigned in the table.
1198 This reduces the size of the table.
1199 If an unassigned character appears in the string to be collated,
1200 the sort key is made from its codepoint
1201 as a single-character collation element,
1202 as it is greater than any other assigned collation elements
1203 (in the codepoint order among the unassigned characters).
1204 But, it'd be better to ignore characters
1205 unfamiliar to you and maybe never used.
1206
1207 =item katakana_before_hiragana
1208
1209 =item upper_before_lower
1210
1211 -- see 6.6 Case Comparisons; 7.3.1 Tertiary Weight Table, UTS #10.
1212
1213 By default, lowercase is before uppercase
1214 and hiragana is before katakana.
1215
1216 If the tag is made true, this is reversed.
1217
1218 B<NOTE>: These tags simplemindedly assume
1219 any lowercase/uppercase or hiragana/katakana distinctions
1220 should occur in level 3, and their weights at level 3
1221 should be same as those mentioned in 7.3.1, UTS #10.
1222 If you define your collation elements which violates this,
1223 these tags don't work validly.
1224
1225 =back
1226
1227 =head2 Methods for Collation
1228
1229 =over 4
1230
1231 =item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
1232
1233 Sorts a list of strings.
1234
1235 =item C<$result = $Collator-E<gt>cmp($a, $b)>
1236
1237 Returns 1 (when C<$a> is greater than C<$b>)
1238 or 0 (when C<$a> is equal to C<$b>)
1239 or -1 (when C<$a> is lesser than C<$b>).
1240
1241 =item C<$result = $Collator-E<gt>eq($a, $b)>
1242
1243 =item C<$result = $Collator-E<gt>ne($a, $b)>
1244
1245 =item C<$result = $Collator-E<gt>lt($a, $b)>
1246
1247 =item C<$result = $Collator-E<gt>le($a, $b)>
1248
1249 =item C<$result = $Collator-E<gt>gt($a, $b)>
1250
1251 =item C<$result = $Collator-E<gt>ge($a, $b)>
1252
1253 They works like the same name operators as theirs.
1254
1255    eq : whether $a is equal to $b.
1256    ne : whether $a is not equal to $b.
1257    lt : whether $a is lesser than $b.
1258    le : whether $a is lesser than $b or equal to $b.
1259    gt : whether $a is greater than $b.
1260    ge : whether $a is greater than $b or equal to $b.
1261
1262 =item C<$sortKey = $Collator-E<gt>getSortKey($string)>
1263
1264 -- see 4.3 Form a sort key for each string, UTS #10.
1265
1266 Returns a sort key.
1267
1268 You compare the sort keys using a binary comparison
1269 and get the result of the comparison of the strings using UCA.
1270
1271    $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
1272
1273       is equivalent to
1274
1275    $Collator->cmp($a, $b)
1276
1277 =item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)>
1278
1279    use Unicode::Collate;
1280    my $c = Unicode::Collate->new();
1281    print $c->viewSortKey("Perl"),"\n";
1282
1283    # output:
1284    # [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF]
1285    #  Level 1               Level 2               Level 3               Level 4
1286
1287     (If C<UCA_Version> is 8, the output is slightly different.)
1288
1289 =back
1290
1291 =head2 Methods for Searching
1292
1293 B<DISCLAIMER:> If C<preprocess> or C<normalization> tag is true
1294 for C<$Collator>, calling these methods (C<index>, C<match>, C<gmatch>,
1295 C<subst>, C<gsubst>) is croaked,
1296 as the position and the length might differ
1297 from those on the specified string.
1298 (And the C<rearrange> tag is neglected.)
1299
1300 The C<match>, C<gmatch>, C<subst>, C<gsubst> methods work
1301 like C<m//>, C<m//g>, C<s///>, C<s///g>, respectively,
1302 but they are not aware of any pattern, but only a literal substring.
1303
1304 =over 4
1305
1306 =item C<$position = $Collator-E<gt>index($string, $substring[, $position])>
1307
1308 =item C<($position, $length) = $Collator-E<gt>index($string, $substring[, $position])>
1309
1310 If C<$substring> matches a part of C<$string>, returns
1311 the position of the first occurrence of the matching part in scalar context;
1312 in list context, returns a two-element list of
1313 the position and the length of the matching part.
1314
1315 If C<$substring> does not match any part of C<$string>,
1316 returns C<-1> in scalar context and
1317 an empty list in list context.
1318
1319 e.g. you say
1320
1321   my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1322                                      # (normalization => undef) is REQUIRED.
1323   my $str = "Ich muß studieren Perl.";
1324   my $sub = "MÜSS";
1325   my $match;
1326   if (my($pos,$len) = $Collator->index($str, $sub)) {
1327       $match = substr($str, $pos, $len);
1328   }
1329
1330 and get C<"muß"> in C<$match> since C<"muß">
1331 is primary equal to C<"MÜSS">.
1332
1333 =item C<$match_ref = $Collator-E<gt>match($string, $substring)>
1334
1335 =item C<($match)   = $Collator-E<gt>match($string, $substring)>
1336
1337 If C<$substring> matches a part of C<$string>, in scalar context, returns
1338 B<a reference to> the first occurrence of the matching part
1339 (C<$match_ref> is always true if matches,
1340 since every reference is B<true>);
1341 in list context, returns the first occurrence of the matching part.
1342
1343 If C<$substring> does not match any part of C<$string>,
1344 returns C<undef> in scalar context and
1345 an empty list in list context.
1346
1347 e.g.
1348
1349     if ($match_ref = $Collator->match($str, $sub)) { # scalar context
1350         print "matches [$$match_ref].\n";
1351     } else {
1352         print "doesn't match.\n";
1353     }
1354
1355      or
1356
1357     if (($match) = $Collator->match($str, $sub)) { # list context
1358         print "matches [$match].\n";
1359     } else {
1360         print "doesn't match.\n";
1361     }
1362
1363 =item C<@match = $Collator-E<gt>gmatch($string, $substring)>
1364
1365 If C<$substring> matches a part of C<$string>, returns
1366 all the matching parts (or matching count in scalar context).
1367
1368 If C<$substring> does not match any part of C<$string>,
1369 returns an empty list.
1370
1371 =item C<$count = $Collator-E<gt>subst($string, $substring, $replacement)>
1372
1373 If C<$substring> matches a part of C<$string>,
1374 the first occurrence of the matching part is replaced by C<$replacement>
1375 (C<$string> is modified) and return C<$count> (always equals to C<1>).
1376
1377 C<$replacement> can be a C<CODEREF>,
1378 taking the matching part as an argument,
1379 and returning a string to replace the matching part
1380 (a bit similar to C<s/(..)/$coderef-E<gt>($1)/e>).
1381
1382 =item C<$count = $Collator-E<gt>gsubst($string, $substring, $replacement)>
1383
1384 If C<$substring> matches a part of C<$string>,
1385 all the occurrences of the matching part is replaced by C<$replacement>
1386 (C<$string> is modified) and return C<$count>.
1387
1388 C<$replacement> can be a C<CODEREF>,
1389 taking the matching part as an argument,
1390 and returning a string to replace the matching part
1391 (a bit similar to C<s/(..)/$coderef-E<gt>($1)/eg>).
1392
1393 e.g.
1394
1395   my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1396                                      # (normalization => undef) is REQUIRED.
1397   my $str = "Camel ass came\x{301}l CAMEL horse cAm\0E\0L...";
1398   $Collator->gsubst($str, "camel", sub { "<b>$_[0]</b>" });
1399
1400   # now $str is "<b>Camel</b> ass <b>came\x{301}l</b> <b>CAMEL</b> horse <b>cAm\0E\0L</b>...";
1401   # i.e., all the camels are made bold-faced.
1402
1403 =back
1404
1405 =head2 Other Methods
1406
1407 =over 4
1408
1409 =item C<%old_tailoring = $Collator-E<gt>change(%new_tailoring)>
1410
1411 Change the value of specified keys and returns the changed part.
1412
1413     $Collator = Unicode::Collate->new(level => 4);
1414
1415     $Collator->eq("perl", "PERL"); # false
1416
1417     %old = $Collator->change(level => 2); # returns (level => 4).
1418
1419     $Collator->eq("perl", "PERL"); # true
1420
1421     $Collator->change(%old); # returns (level => 2).
1422
1423     $Collator->eq("perl", "PERL"); # false
1424
1425 Not all C<(key,value)>s are allowed to be changed.
1426 See also C<@Unicode::Collate::ChangeOK> and C<@Unicode::Collate::ChangeNG>.
1427
1428 In the scalar context, returns the modified collator
1429 (but it is B<not> a clone from the original).
1430
1431     $Collator->change(level => 2)->eq("perl", "PERL"); # true
1432
1433     $Collator->eq("perl", "PERL"); # true; now max level is 2nd.
1434
1435     $Collator->change(level => 4)->eq("perl", "PERL"); # false
1436
1437 =item UCA_Version
1438
1439 Returns the version number of UTS #10 this module consults.
1440
1441 =item Base_Unicode_Version
1442
1443 Returns the version number of the Unicode Standard
1444 this module is based on.
1445
1446 =back
1447
1448 =head2 EXPORT
1449
1450 None by default.
1451
1452 =head2 TODO
1453
1454 Unicode::Collate has not been ported to EBCDIC.
1455 IMHO, use of utf8::unicode_to_native()/utf8::native_to_unicode()
1456 at the proper postions should allow
1457 this module to work on EBCDIC platform...
1458
1459 =head2 CAVEAT
1460
1461 Use of the C<normalization> parameter requires
1462 the B<Unicode::Normalize> module.
1463
1464 If you need not it (say, in the case when you need not
1465 handle any combining characters),
1466 assign C<normalization =E<gt> undef> explicitly.
1467
1468 -- see 6.5 Avoiding Normalization, UTS #10.
1469
1470 =head2 Conformance Test
1471
1472 The Conformance Test for the UCA is provided
1473 in L<http://www.unicode.org/reports/tr10/CollationTest.html>
1474 and L<http://www.unicode.org/reports/tr10/CollationTest.zip>
1475
1476 For F<CollationTest_SHIFTED.txt>,
1477 a collator via C<Unicode::Collate-E<gt>new( )> should be used;
1478 for F<CollationTest_NON_IGNORABLE.txt>, a collator via
1479 C<Unicode::Collate-E<gt>new(alternate =E<gt> "non-ignorable", level =E<gt> 3)>.
1480
1481 B<Unicode::Normalize is required to try The Conformance Test.>
1482
1483 =head1 AUTHOR
1484
1485 SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
1486
1487   http://homepage1.nifty.com/nomenclator/perl/
1488
1489   Copyright(C) 2001-2002, SADAHIRO Tomoyuki. Japan. All rights reserved.
1490
1491   This library is free software; you can redistribute it
1492   and/or modify it under the same terms as Perl itself.
1493
1494 =head1 SEE ALSO
1495
1496 =over 4
1497
1498 =item http://www.unicode.org/reports/tr10/
1499
1500 Unicode Collation Algorithm - UTS #10
1501
1502 =item http://www.unicode.org/reports/tr10/allkeys.txt
1503
1504 The Default Unicode Collation Element Table
1505
1506 =item http://www.unicode.org/reports/tr10/CollationTest.html
1507 http://www.unicode.org/reports/tr10/CollationTest.zip
1508
1509 The latest versions of the conformance test for the UCA
1510
1511 =item http://www.unicode.org/reports/tr15/
1512
1513 Unicode Normalization Forms - UAX #15
1514
1515 =item L<Unicode::Normalize>
1516
1517 =back
1518
1519 =cut