lib/Unicode/Collate.pm

   1 package Unicode::Collate;
   2
   3 BEGIN {
   4     unless ("A" eq pack('U', 0x41)) {
   5         die "Unicode::Collate cannot stringify a Unicode code point\n";
   6     }
   7 }
   8
   9 use 5.006;
  10 use strict;
  11 use warnings;
  12 use Carp;
  13 use File::Spec;
  14
  15 no warnings 'utf8';
  16
  17 require Exporter;
  18
  19 our $VERSION = '0.31';
  20 our $PACKAGE = __PACKAGE__;
  21
  22 our @ISA = qw(Exporter);
  23
  24 our %EXPORT_TAGS = ();
  25 our @EXPORT_OK = ();
  26 our @EXPORT = ();
  27
  28 (our $Path = $INC{'Unicode/Collate.pm'}) =~ s/\.pm$//;
  29 our $KeyFile = "allkeys.txt";
  30
  31 # Perl's boolean
  32 use constant TRUE  => 1;
  33 use constant FALSE => "";
  34 use constant NOMATCHPOS => -1;
  35
  36 # A coderef to get combining class imported from Unicode::Normalize
  37 # (i.e. \&Unicode::Normalize::getCombinClass).
  38 # This is also used as a HAS_UNICODE_NORMALIZE flag.
  39 our $CVgetCombinClass;
  40
  41 # Supported Levels
  42 use constant MinLevel => 1;
  43 use constant MaxLevel => 4;
  44
  45 # Minimum weights at level 2 and 3, respectively
  46 use constant Min2Wt => 0x20;
  47 use constant Min3Wt => 0x02;
  48
  49 # Shifted weight at 4th level
  50 use constant Shift4Wt => 0xFFFF;
  51
  52 # Variable weight at 1st level.
  53 # This is a negative value but should be regarded as zero on collation.
  54 # This is for distinction of variable chars from level 3 ignorable chars.
  55 use constant Var1Wt => -1;
  56
  57
  58 # A boolean for Variable and 16-bit weights at 4 levels of Collation Element
  59 # PROBLEM: The Default Unicode Collation Element Table
  60 # has weights over 0xFFFF at the 4th level.
  61 # The tie-breaking in the variable weights
  62 # other than "shift" (as well as "shift-trimmed") is unreliable.
  63 use constant VCE_TEMPLATE => 'Cn4';
  64
  65 # A sort key: 16-bit weights
  66 # See also the PROBLEM on VCE_TEMPLATE above.
  67 use constant KEY_TEMPLATE => 'n*';
  68
  69 # Level separator in a sort key:
  70 # i.e. pack(KEY_TEMPLATE, 0)
  71 use constant LEVEL_SEP => "\0\0";
  72
  73 # As Unicode code point separator for hash keys.
  74 # A joined code point string (denoted by JCPS below)
  75 # like "65;768" is used for internal processing
  76 # instead of Perl's Unicode string like "\x41\x{300}",
  77 # as the native code point is different from the Unicode code point
  78 # on EBCDIC platform.
  79 # This character must not be included in any stringified
  80 # representation of an integer.
  81 use constant CODE_SEP => ';';
  82
  83 # boolean values of variable weights
  84 use constant NON_VAR => 0; # Non-Variable character
  85 use constant VAR     => 1; # Variable character
  86
  87 # specific code points
  88 use constant Hangul_LBase  => 0x1100;
  89 use constant Hangul_LIni   => 0x1100;
  90 use constant Hangul_LFin   => 0x1159;
  91 use constant Hangul_LFill  => 0x115F;
  92 use constant Hangul_VBase  => 0x1161;
  93 use constant Hangul_VIni   => 0x1160;
  94 use constant Hangul_VFin   => 0x11A2;
  95 use constant Hangul_TBase  => 0x11A7;
  96 use constant Hangul_TIni   => 0x11A8;
  97 use constant Hangul_TFin   => 0x11F9;
  98 use constant Hangul_TCount => 28;
  99 use constant Hangul_NCount => 588;
 100 use constant Hangul_SBase  => 0xAC00;
 101 use constant Hangul_SIni   => 0xAC00;
 102 use constant Hangul_SFin   => 0xD7A3;
 103 use constant CJK_UidIni    => 0x4E00;
 104 use constant CJK_UidFin    => 0x9FA5;
 105 use constant CJK_ExtAIni   => 0x3400;
 106 use constant CJK_ExtAFin   => 0x4DB5;
 107 use constant CJK_ExtBIni   => 0x20000;
 108 use constant CJK_ExtBFin   => 0x2A6D6;
 109 use constant BMP_Max       => 0xFFFF;
 110
 111 # Logical_Order_Exception in PropList.txt
 112 # TODO: synchronization with change of PropList.txt.
 113 our $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
 114
 115 sub UCA_Version { "11" }
 116
 117 sub Base_Unicode_Version { "4.0" }
 118
 119 ######
 120
 121 sub pack_U {
 122     return pack('U*', @_);
 123 }
 124
 125 sub unpack_U {
 126     return unpack('U*', pack('U*').shift);
 127 }
 128
 129 ######
 130
 131 my (%VariableOK);
 132 @VariableOK{ qw/
 133     blanked  non-ignorable  shifted  shift-trimmed
 134   / } = (); # keys lowercased
 135
 136 our @ChangeOK = qw/
 137     alternate backwards level normalization rearrange
 138     katakana_before_hiragana upper_before_lower
 139     overrideHangul overrideCJK preprocess UCA_Version
 140     hangul_terminator variable
 141   /;
 142
 143 our @ChangeNG = qw/
 144     entry mapping table maxlength
 145     ignoreChar ignoreName undefChar undefName variableTable
 146     versionTable alternateTable backwardsTable forwardsTable rearrangeTable
 147     derivCode normCode rearrangeHash L3_ignorable
 148     backwardsFlag
 149   /;
 150 # The hash key 'ignored' is deleted at v 0.21.
 151 # The hash key 'isShift' is deleted at v 0.23.
 152 # The hash key 'combining' is deleted at v 0.24.
 153 # The hash key 'entries' is deleted at v 0.30.
 154
 155 sub version {
 156     my $self = shift;
 157     return $self->{versionTable} || 'unknown';
 158 }
 159
 160 my (%ChangeOK, %ChangeNG);
 161 @ChangeOK{ @ChangeOK } = ();
 162 @ChangeNG{ @ChangeNG } = ();
 163
 164 sub change {
 165     my $self = shift;
 166     my %hash = @_;
 167     my %old;
 168     if (exists $hash{variable} && exists $hash{alternate}) {
 169         delete $hash{alternate};
 170     }
 171     elsif (!exists $hash{variable} && exists $hash{alternate}) {
 172         $hash{variable} = $hash{alternate};
 173     }
 174     foreach my $k (keys %hash) {
 175         if (exists $ChangeOK{$k}) {
 176             $old{$k} = $self->{$k};
 177             $self->{$k} = $hash{$k};
 178         }
 179         elsif (exists $ChangeNG{$k}) {
 180             croak "change of $k via change() is not allowed!";
 181         }
 182         # else => ignored
 183     }
 184     $self->checkCollator;
 185     return wantarray ? %old : $self;
 186 }
 187
 188 sub _checkLevel {
 189     my $level = shift;
 190     my $key   = shift;
 191     croak sprintf "Illegal level %d (in \$self->{%s}) lower than %d.",
 192         $level, $key, MinLevel if MinLevel > $level;
 193     croak sprintf "Unsupported level %d (in \$self->{%s}) higher than %d ",
 194         $level, $key, MaxLevel if MaxLevel < $level;
 195 }
 196
 197 my %DerivCode = (
 198     8 => \&_derivCE_8,
 199     9 => \&_derivCE_9,
 200    11 => \&_derivCE_9, # 11 == 9
 201 );
 202
 203 sub checkCollator {
 204     my $self = shift;
 205     _checkLevel($self->{level}, "level");
 206
 207     $self->{derivCode} = $DerivCode{ $self->{UCA_Version} }
 208         or croak "Illegal UCA version (passed $self->{UCA_Version}).";
 209
 210     $self->{variable} ||= $self->{alternate} || $self->{variableTable} ||
 211                                 $self->{alternateTable} || 'shifted';
 212     $self->{variable} = $self->{alternate} = lc($self->{variable});
 213     exists $VariableOK{ $self->{variable} }
 214         or croak "$PACKAGE unknown variable tag name: $self->{variable}";
 215
 216     if (! defined $self->{backwards}) {
 217         $self->{backwardsFlag} = 0;
 218     }
 219     elsif (! ref $self->{backwards}) {
 220         _checkLevel($self->{backwards}, "backwards");
 221         $self->{backwardsFlag} = 1 << $self->{backwards};
 222     }
 223     else {
 224         my %level;
 225         $self->{backwardsFlag} = 0;
 226         for my $b (@{ $self->{backwards} }) {
 227             _checkLevel($b, "backwards");
 228             $level{$b} = 1;
 229         }
 230         for my $v (sort keys %level) {
 231             $self->{backwardsFlag} += 1 << $v;
 232         }
 233     }
 234
 235     defined $self->{rearrange} or $self->{rearrange} = [];
 236     ref $self->{rearrange}
 237         or croak "$PACKAGE: list for rearrangement must be store in ARRAYREF";
 238
 239     # keys of $self->{rearrangeHash} are $self->{rearrange}.
 240     $self->{rearrangeHash} = undef;
 241
 242     if (@{ $self->{rearrange} }) {
 243         @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
 244     }
 245
 246     $self->{normCode} = undef;
 247
 248     if (defined $self->{normalization}) {
 249         eval { require Unicode::Normalize };
 250         $@ and croak "Unicode::Normalize is required to normalize strings";
 251
 252         $CVgetCombinClass ||= \&Unicode::Normalize::getCombinClass;
 253
 254         if ($self->{normalization} =~ /^(?:NF)D\z/) { # tweak for default
 255             $self->{normCode} = \&Unicode::Normalize::NFD;
 256         }
 257         elsif ($self->{normalization} ne 'prenormalized') {
 258             my $norm = $self->{normalization};
 259             $self->{normCode} = sub {
 260                 Unicode::Normalize::normalize($norm, shift);
 261             };
 262             eval { $self->{normCode}->("") }; # try
 263             $@ and croak "$PACKAGE unknown normalization form name: $norm";
 264         }
 265     }
 266     return;
 267 }
 268
 269 sub new
 270 {
 271     my $class = shift;
 272     my $self = bless { @_ }, $class;
 273
 274     # If undef is passed explicitly, no file is read.
 275     $self->{table} = $KeyFile if ! exists $self->{table};
 276     $self->read_table if defined $self->{table};
 277
 278     if ($self->{entry}) {
 279         $self->parseEntry($_) foreach split /\n/, $self->{entry};
 280     }
 281
 282     $self->{level} ||= MaxLevel;
 283     $self->{UCA_Version} ||= UCA_Version();
 284
 285     $self->{overrideHangul} = ''
 286         if ! exists $self->{overrideHangul};
 287     $self->{overrideCJK} = ''
 288         if ! exists $self->{overrideCJK};
 289     $self->{normalization} = 'NFD'
 290         if ! exists $self->{normalization};
 291     $self->{rearrange} = $self->{rearrangeTable} || $DefaultRearrange
 292         if ! exists $self->{rearrange};
 293     $self->{backwards} = $self->{backwardsTable}
 294         if ! exists $self->{backwards};
 295
 296     $self->checkCollator;
 297
 298     return $self;
 299 }
 300
 301 sub read_table {
 302     my $self = shift;
 303     my $file = $self->{table} ne '' ? $self->{table} : $KeyFile;
 304
 305     my $filepath = File::Spec->catfile($Path, $file);
 306     open my $fk, "<$filepath"
 307         or croak "File does not exist at $filepath";
 308
 309     while (<$fk>) {
 310         next if /^\s*#/;
 311         if (/^\s*\@/) {
 312             if    (/^\s*\@version\s*(\S*)/) {
 313                 $self->{versionTable} ||= $1;
 314             }
 315             elsif (/^\s*\@variable\s+(\S*)/) { # since UTS #10-9
 316                 $self->{variableTable} ||= $1;
 317             }
 318             elsif (/^\s*\@alternate\s+(\S*)/) { # till UTS #10-8
 319                 $self->{alternateTable} ||= $1;
 320             }
 321             elsif (/^\s*\@backwards\s+(\S*)/) {
 322                 push @{ $self->{backwardsTable} }, $1;
 323             }
 324             elsif (/^\s*\@forwards\s+(\S*)/) { # parhaps no use
 325                 push @{ $self->{forwardsTable} }, $1;
 326             }
 327             elsif (/^\s*\@rearrange\s+(.*)/) { # (\S*) is NG
 328                 push @{ $self->{rearrangeTable} }, _getHexArray($1);
 329             }
 330             next;
 331         }
 332         $self->parseEntry($_);
 333     }
 334     close $fk;
 335 }
 336
 337
 338 ##
 339 ## get $line, parse it, and write an entry in $self
 340 ##
 341 sub parseEntry
 342 {
 343     my $self = shift;
 344     my $line = shift;
 345     my($name, $entry, @uv, @key);
 346
 347     return if $line !~ /^\s*[0-9A-Fa-f]/;
 348
 349     # removes comment and gets name
 350     $name = $1
 351         if $line =~ s/[#%]\s*(.*)//;
 352     return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
 353
 354     # gets element
 355     my($e, $k) = split /;/, $line;
 356     croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
 357         if ! $k;
 358
 359     @uv = _getHexArray($e);
 360     return if !@uv;
 361
 362     $entry = join(CODE_SEP, @uv); # in JCPS
 363
 364     if (defined $self->{undefChar} || defined $self->{ignoreChar}) {
 365         my $ele = pack_U(@uv);
 366
 367         # regarded as if it were not entried in the table
 368         return
 369             if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
 370
 371         # replaced as completely ignorable
 372         $k = '[.0000.0000.0000.0000]'
 373             if defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/;
 374     }
 375
 376     # replaced as completely ignorable
 377     $k = '[.0000.0000.0000.0000]'
 378         if defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/;
 379
 380     my $is_L3_ignorable = TRUE;
 381
 382     foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
 383         my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
 384         my @wt = _getHexArray($arr);
 385         push @key, pack(VCE_TEMPLATE, $var, @wt);
 386         $is_L3_ignorable = FALSE
 387             if $wt[0] + $wt[1] + $wt[2] != 0;
 388           # if $arr !~ /[1-9A-Fa-f]/; NG
 389           # Conformance Test shows L3-ignorable is completely ignorable.
 390         # For expansion, an entry $is_L3_ignorable
 391         # if and only if "all" CEs are [.0000.0000.0000].
 392     }
 393
 394     $self->{mapping}{$entry} = \@key;
 395
 396     if (@uv > 1) {
 397         (!$self->{maxlength}{$uv[0]} || $self->{maxlength}{$uv[0]} < @uv)
 398             and $self->{maxlength}{$uv[0]} = @uv;
 399     }
 400     else {
 401         $is_L3_ignorable
 402             ? ($self->{L3_ignorable}{$uv[0]} = TRUE)
 403             : ($self->{L3_ignorable}{$uv[0]} and
 404                $self->{L3_ignorable}{$uv[0]} = FALSE); # &&= stores key.
 405     }
 406 }
 407
 408
 409 ##
 410 ## arrayref[weights] = varCE(VCE)
 411 ##
 412 sub varCE
 413 {
 414     my $self = shift;
 415     my($var, @wt) = unpack(VCE_TEMPLATE, shift);
 416
 417     $self->{variable} eq 'blanked' ?
 418         $var ? [Var1Wt, 0, 0, $wt[3]] : \@wt :
 419     $self->{variable} eq 'non-ignorable' ?
 420         \@wt :
 421     $self->{variable} eq 'shifted' ?
 422         $var ? [Var1Wt, 0, 0, $wt[0] ]
 423              : [ @wt[0..2], $wt[0]+$wt[1]+$wt[2] ? Shift4Wt : 0 ] :
 424     $self->{variable} eq 'shift-trimmed' ?
 425         $var ? [Var1Wt, 0, 0, $wt[0] ] : [ @wt[0..2], 0 ] :
 426         croak "$PACKAGE unknown variable name: $self->{variable}";
 427 }
 428
 429 sub viewSortKey
 430 {
 431     my $self = shift;
 432     $self->visualizeSortKey($self->getSortKey(@_));
 433 }
 434
 435 sub visualizeSortKey
 436 {
 437     my $self = shift;
 438     my $view = join " ", map sprintf("%04X", $_), unpack(KEY_TEMPLATE, shift);
 439
 440     if ($self->{UCA_Version} <= 8) {
 441         $view =~ s/ ?0000 ?/|/g;
 442     } else {
 443         $view =~ s/\b0000\b/|/g;
 444     }
 445     return "[$view]";
 446 }
 447
 448
 449 ##
 450 ## arrayref of JCPS   = splitEnt(string to be collated)
 451 ## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitEnt(string, true)
 452 ##
 453 sub splitEnt
 454 {
 455     my $self = shift;
 456     my $wLen = $_[1];
 457
 458     my $code = $self->{preprocess};
 459     my $norm = $self->{normCode};
 460     my $map  = $self->{mapping};
 461     my $max  = $self->{maxlength};
 462     my $reH  = $self->{rearrangeHash};
 463     my $ign  = $self->{L3_ignorable};
 464     my $ver9 = $self->{UCA_Version} >= 9;
 465
 466     my ($str, @buf);
 467
 468     if ($wLen) {
 469         $code and croak "Preprocess breaks character positions. "
 470                         . "Don't use with index(), match(), etc.";
 471         $norm and croak "Normalization breaks character positions. "
 472                         . "Don't use with index(), match(), etc.";
 473         $str = $_[0];
 474     }
 475     else {
 476         $str = $_[0];
 477         $str = &$code($str) if ref $code;
 478         $str = &$norm($str) if ref $norm;
 479     }
 480
 481     # get array of Unicode code point of string.
 482     my @src = unpack_U($str);
 483
 484     # rearrangement:
 485     # Character positions are not kept if rearranged,
 486     # then neglected if $wLen is true.
 487     if ($reH && ! $wLen) {
 488         for (my $i = 0; $i < @src; $i++) {
 489             if (exists $reH->{ $src[$i] } && $i + 1 < @src) {
 490                 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]);
 491                 $i++;
 492             }
 493         }
 494     }
 495
 496     if ($ver9) {
 497         # To remove a character marked as a completely ignorable.
 498         for (my $i = 0; $i < @src; $i++) {
 499             $src[$i] = undef if $ign->{ $src[$i] };
 500         }
 501     }
 502
 503     for (my $i = 0; $i < @src; $i++) {
 504         next if _isIllegal($src[$i]);
 505
 506         my $i_orig = $i;
 507         my $jcps = $src[$i];
 508
 509         if ($max->{$jcps}) { # contract
 510             my $temp_jcps = $jcps;
 511             my $jcpsLen = 1;
 512             my $maxLen = $max->{$jcps};
 513
 514             for (my $p = $i + 1; $jcpsLen < $maxLen && $p < @src; $p++) {
 515                 next if ! defined $src[$p];
 516                 $temp_jcps .= CODE_SEP . $src[$p];
 517                 $jcpsLen++;
 518                 if ($map->{$temp_jcps}) {
 519                     $jcps = $temp_jcps;
 520                     $i = $p;
 521                 }
 522             }
 523
 524         # not-contiguous contraction with Combining Char (cf. UTS#10, S2.1).
 525         # This process requires Unicode::Normalize.
 526         # If "normalization" is undef, here should be skipped *always*
 527         # (in spite of bool value of $CVgetCombinClass),
 528         # since canonical ordering cannot be expected.
 529         # Blocked combining character should not be contracted.
 530
 531             if ($self->{normalization})
 532             # $self->{normCode} is false in the case of "prenormalized".
 533             {
 534                 my $preCC = 0;
 535                 my $curCC = 0;
 536
 537                 for (my $p = $i + 1; $p < @src; $p++) {
 538                     next if ! defined $src[$p];
 539                     $curCC = $CVgetCombinClass->($src[$p]);
 540                     last unless $curCC;
 541                     my $tail = CODE_SEP . $src[$p];
 542                     if ($preCC != $curCC && $map->{$jcps.$tail}) {
 543                         $jcps .= $tail;
 544                         $src[$p] = undef;
 545                     } else {
 546                         $preCC = $curCC;
 547                     }
 548                 }
 549             }
 550         }
 551
 552         if ($wLen) {
 553             for (my $p = $i + 1; $p < @src; $p++) {
 554                 last if defined $src[$p];
 555                 $i = $p;
 556             }
 557         }
 558
 559         push @buf, $wLen ? [$jcps, $i_orig, $i + 1] : $jcps;
 560     }
 561     return \@buf;
 562 }
 563
 564
 565 ##
 566 ## list of arrayrefs of weights = getWt(JCPS)
 567 ##
 568 sub getWt
 569 {
 570     my $self = shift;
 571     my $u    = shift;
 572     my $map  = $self->{mapping};
 573     my $der  = $self->{derivCode};
 574
 575     return if !defined $u;
 576     return map($self->varCE($_), @{ $map->{$u} })
 577         if $map->{$u};
 578
 579     # JCPS must not be a contraction, then it's a code point.
 580     if (Hangul_SIni <= $u && $u <= Hangul_SFin) {
 581         my $hang = $self->{overrideHangul};
 582         my @hangulCE;
 583         if ($hang) {
 584             @hangulCE = map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$hang($u));
 585         }
 586         elsif (!defined $hang) {
 587             @hangulCE = $der->($u);
 588         }
 589         else {
 590             my $max  = $self->{maxlength};
 591             my @decH = _decompHangul($u);
 592
 593             if (@decH == 2) {
 594                 my $contract = join(CODE_SEP, @decH);
 595                 @decH = ($contract) if $map->{$contract};
 596             } else { # must be <@decH == 3>
 597                 if ($max->{$decH[0]}) {
 598                     my $contract = join(CODE_SEP, @decH);
 599                     if ($map->{$contract}) {
 600                         @decH = ($contract);
 601                     } else {
 602                         $contract = join(CODE_SEP, @decH[0,1]);
 603                         $map->{$contract} and @decH = ($contract, $decH[2]);
 604                     }
 605                     # even if V's ignorable, LT contraction is not supported.
 606                     # If such a situatution were required, NFD should be used.
 607                 }
 608                 if (@decH == 3 && $max->{$decH[1]}) {
 609                     my $contract = join(CODE_SEP, @decH[1,2]);
 610                     $map->{$contract} and @decH = ($decH[0], $contract);
 611                 }
 612             }
 613
 614             @hangulCE = map({
 615                     $map->{$_} ? @{ $map->{$_} } : $der->($_);
 616                 } @decH);
 617         }
 618         return map $self->varCE($_), @hangulCE;
 619     }
 620     elsif (CJK_UidIni  <= $u && $u <= CJK_UidFin  ||
 621            CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
 622            CJK_ExtBIni <= $u && $u <= CJK_ExtBFin) {
 623         my $cjk  = $self->{overrideCJK};
 624         return map $self->varCE($_),
 625             $cjk
 626                 ? map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$cjk($u))
 627                 : defined $cjk && $self->{UCA_Version} <= 8 && $u <= BMP_Max
 628                     ? pack(VCE_TEMPLATE, NON_VAR, $u, Min2Wt, Min3Wt, $u)
 629                     : $der->($u);
 630     }
 631     else {
 632         return map $self->varCE($_), $der->($u);
 633     }
 634 }
 635
 636
 637 ##
 638 ## string sortkey = getSortKey(string arg)
 639 ##
 640 sub getSortKey
 641 {
 642     my $self = shift;
 643     my $lev  = $self->{level};
 644     my $rEnt = $self->splitEnt(shift); # get an arrayref of JCPS
 645     my $ver9 = $self->{UCA_Version} >= 9;
 646     my $v2i  = $self->{variable} ne 'non-ignorable';
 647
 648     # weight arrays
 649     my (@wts, @buf, $last_is_variable);
 650
 651     if ($self->{hangul_terminator}) {
 652         my $preHST = '';
 653         foreach my $jcps (@$rEnt) {
 654             # weird things like VL, TL-contraction are not considered!
 655             my $curHST = '';
 656             foreach my $u (split /;/, $jcps) {
 657                 $curHST .= getHST($u);
 658             }
 659             if ($preHST && !$curHST || # hangul before non-hangul
 660                 $preHST =~ /L\z/ && $curHST =~ /^T/ ||
 661                 $preHST =~ /V\z/ && $curHST =~ /^L/ ||
 662                 $preHST =~ /T\z/ && $curHST =~ /^[LV]/) {
 663
 664                 push @wts, $self->varCE_HangulTerm;
 665             }
 666             $preHST = $curHST;
 667
 668             push @wts, $self->getWt($jcps);
 669         }
 670         $preHST # end at hangul
 671             and push @wts, $self->varCE_HangulTerm;
 672     }
 673     else {
 674         foreach my $jcps (@$rEnt) {
 675             push @wts, $self->getWt($jcps);
 676         }
 677     }
 678
 679     foreach my $wt (@wts) {
 680         if ($v2i && $ver9) {
 681             if ($wt->[0] == 0) { # ignorable
 682                 next if $last_is_variable;
 683             } else {
 684                 $last_is_variable = ($wt->[0] == Var1Wt);
 685             }
 686         }
 687         push @buf, $wt;
 688     }
 689
 690     # make sort key
 691     my @ret = ([],[],[],[]);
 692     foreach my $v (0..$lev-1) {
 693         foreach my $b (@buf) {
 694             push @{ $ret[$v] }, $b->[$v]
 695                 if 0 < $b->[$v];
 696         }
 697     }
 698
 699     # modification of tertiary weights
 700     if ($self->{upper_before_lower}) {
 701         foreach (@{ $ret[2] }) {
 702             if    (0x8 <= $_ && $_ <= 0xC) { $_ -= 6 } # lower
 703             elsif (0x2 <= $_ && $_ <= 0x6) { $_ += 6 } # upper
 704             elsif ($_ == 0x1C)             { $_ += 1 } # square upper
 705             elsif ($_ == 0x1D)             { $_ -= 1 } # square lower
 706         }
 707     }
 708     if ($self->{katakana_before_hiragana}) {
 709         foreach (@{ $ret[2] }) {
 710             if    (0x0F <= $_ && $_ <= 0x13) { $_ -= 2 } # katakana
 711             elsif (0x0D <= $_ && $_ <= 0x0E) { $_ += 5 } # hiragana
 712         }
 713     }
 714
 715     if ($self->{backwardsFlag}) {
 716         for (my $v = MinLevel; $v <= MaxLevel; $v++) {
 717             if ($self->{backwardsFlag} & (1 << $v)) {
 718                 @{ $ret[$v-1] } = reverse @{ $ret[$v-1] };
 719             }
 720         }
 721     }
 722
 723     join LEVEL_SEP, map pack(KEY_TEMPLATE, @$_), @ret;
 724 }
 725
 726
 727 ##
 728 ## int compare = cmp(string a, string b)
 729 ##
 730 sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
 731 sub eq  { $_[0]->getSortKey($_[1]) eq  $_[0]->getSortKey($_[2]) }
 732 sub ne  { $_[0]->getSortKey($_[1]) ne  $_[0]->getSortKey($_[2]) }
 733 sub lt  { $_[0]->getSortKey($_[1]) lt  $_[0]->getSortKey($_[2]) }
 734 sub le  { $_[0]->getSortKey($_[1]) le  $_[0]->getSortKey($_[2]) }
 735 sub gt  { $_[0]->getSortKey($_[1]) gt  $_[0]->getSortKey($_[2]) }
 736 sub ge  { $_[0]->getSortKey($_[1]) ge  $_[0]->getSortKey($_[2]) }
 737
 738 ##
 739 ## list[strings] sorted = sort(list[strings] arg)
 740 ##
 741 sub sort {
 742     my $obj = shift;
 743     return
 744         map { $_->[1] }
 745             sort{ $a->[0] cmp $b->[0] }
 746                 map [ $obj->getSortKey($_), $_ ], @_;
 747 }
 748
 749
 750 sub _derivCE_9 {
 751     my $u = shift;
 752     my $base =
 753         (CJK_UidIni  <= $u && $u <= CJK_UidFin)
 754             ? 0xFB40 : # CJK
 755         (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
 756          CJK_ExtBIni <= $u && $u <= CJK_ExtBFin)
 757             ? 0xFB80   # CJK ext.
 758             : 0xFBC0;  # others
 759
 760     my $aaaa = $base + ($u >> 15);
 761     my $bbbb = ($u & 0x7FFF) | 0x8000;
 762     return
 763         pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
 764         pack(VCE_TEMPLATE, NON_VAR, $bbbb,      0,      0, $u);
 765 }
 766
 767 sub _derivCE_8 {
 768     my $code = shift;
 769     my $aaaa =  0xFF80 + ($code >> 15);
 770     my $bbbb = ($code & 0x7FFF) | 0x8000;
 771     return
 772         pack(VCE_TEMPLATE, NON_VAR, $aaaa, 2, 1, $code),
 773         pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $code);
 774 }
 775
 776
 777 sub varCE_HangulTerm {
 778     my $self = shift;
 779     return $self->varCE(pack(VCE_TEMPLATE,
 780         NON_VAR, $self->{hangul_terminator}, 0,0,0));
 781 }
 782
 783
 784 ##
 785 ## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
 786 ##
 787 sub _getHexArray { map hex, $_[0] =~ /([0-9a-fA-F]+)/g }
 788
 789 #
 790 # $code *must* be in Hangul syllable.
 791 # Check it before you enter here.
 792 #
 793 sub _decompHangul {
 794     my $code = shift;
 795     my $SIndex = $code - Hangul_SBase;
 796     my $LIndex = int( $SIndex / Hangul_NCount);
 797     my $VIndex = int(($SIndex % Hangul_NCount) / Hangul_TCount);
 798     my $TIndex =      $SIndex % Hangul_TCount;
 799     return (
 800         Hangul_LBase + $LIndex,
 801         Hangul_VBase + $VIndex,
 802         $TIndex ? (Hangul_TBase + $TIndex) : (),
 803     );
 804 }
 805
 806 sub _isIllegal {
 807     my $code = shift;
 808     return ! defined $code                      # removed
 809         || ($code < 0 || 0x10FFFF < $code)      # out of range
 810         || (($code & 0xFFFE) == 0xFFFE)         # ??FFF[EF] (cf. utf8.c)
 811         || (0xD800 <= $code && $code <= 0xDFFF) # unpaired surrogates
 812         || (0xFDD0 <= $code && $code <= 0xFDEF) # other non-characters
 813     ;
 814 }
 815
 816 # Hangul Syllable Type
 817 sub getHST {
 818     my $u = shift;
 819     return
 820         Hangul_LIni <= $u && $u <= Hangul_LFin || $u == Hangul_LFill ? "L" :
 821         Hangul_VIni <= $u && $u <= Hangul_VFin       ? "V" :
 822         Hangul_TIni <= $u && $u <= Hangul_TFin       ? "T" :
 823         Hangul_SIni <= $u && $u <= Hangul_SFin ?
 824             ($u - Hangul_SBase) % Hangul_TCount ? "LVT" : "LV" : "";
 825 }
 826
 827
 828 ##
 829 ## bool _nonIgnorAtLevel(arrayref weights, int level)
 830 ##
 831 sub _nonIgnorAtLevel($$)
 832 {
 833     my $wt = shift;
 834     return if ! defined $wt;
 835     my $lv = shift;
 836     return grep($wt->[$_-1] != 0, MinLevel..$lv) ? TRUE : FALSE;
 837 }
 838
 839 ##
 840 ## bool _eqArray(
 841 ##    arrayref of arrayref[weights] source,
 842 ##    arrayref of arrayref[weights] substr,
 843 ##    int level)
 844 ## * comparison of graphemes vs graphemes.
 845 ##   @$source >= @$substr must be true (check it before call this);
 846 ##
 847 sub _eqArray($$$)
 848 {
 849     my $source = shift;
 850     my $substr = shift;
 851     my $lev = shift;
 852
 853     for my $g (0..@$substr-1){
 854         # Do the $g'th graphemes have the same number of AV weigths?
 855         return if @{ $source->[$g] } != @{ $substr->[$g] };
 856
 857         for my $w (0..@{ $substr->[$g] }-1) {
 858             for my $v (0..$lev-1) {
 859                 return if $source->[$g][$w][$v] != $substr->[$g][$w][$v];
 860             }
 861         }
 862     }
 863     return 1;
 864 }
 865
 866 ##
 867 ## (int position, int length)
 868 ## int position = index(string, substring, position, [undoc'ed grobal])
 869 ##
 870 ## With "grobal" (only for the list context),
 871 ##  returns list of arrayref[position, length].
 872 ##
 873 sub index
 874 {
 875     my $self = shift;
 876     my $str  = shift;
 877     my $len  = length($str);
 878     my $subE = $self->splitEnt(shift);
 879     my $pos  = @_ ? shift : 0;
 880        $pos  = 0 if $pos < 0;
 881     my $grob = shift;
 882
 883     my $lev  = $self->{level};
 884     my $ver9 = $self->{UCA_Version} >= 9;
 885     my $v2i  = $self->{variable} ne 'non-ignorable';
 886
 887     if (! @$subE) {
 888         my $temp = $pos <= 0 ? 0 : $len <= $pos ? $len : $pos;
 889         return $grob
 890             ? map([$_, 0], $temp..$len)
 891             : wantarray ? ($temp,0) : $temp;
 892     }
 893     if ($len < $pos) {
 894         return wantarray ? () : NOMATCHPOS;
 895     }
 896     my $strE = $self->splitEnt($pos ? substr($str, $pos) : $str, TRUE);
 897     if (! @$strE) {
 898         return wantarray ? () : NOMATCHPOS;
 899     }
 900     my $last_is_variable;
 901     my(@strWt, @iniPos, @finPos, @subWt, @g_ret);
 902
 903     $last_is_variable = FALSE;
 904     for my $wt (map $self->getWt($_), @$subE) {
 905         my $to_be_pushed = _nonIgnorAtLevel($wt,$lev);
 906
 907         if ($v2i && $ver9) {
 908             if ($wt->[0] == 0) {
 909                 $to_be_pushed = FALSE if $last_is_variable;
 910             } else {
 911                 $last_is_variable = ($wt->[0] == Var1Wt);
 912             }
 913         }
 914
 915         if (@subWt && $wt->[0] == 0) {
 916             push @{ $subWt[-1] }, $wt if $to_be_pushed;
 917         } else {
 918             $wt->[0] = 0 if $wt->[0] == Var1Wt;
 919             push @subWt, [ $wt ];
 920         }
 921     }
 922
 923     my $count = 0;
 924     my $end = @$strE - 1;
 925
 926     $last_is_variable = FALSE;
 927
 928     for (my $i = 0; $i <= $end; ) { # no $i++
 929         my $found_base = 0;
 930
 931         # fetch a grapheme
 932         while ($i <= $end && $found_base == 0) {
 933             for my $wt ($self->getWt($strE->[$i][0])) {
 934                 my $to_be_pushed = _nonIgnorAtLevel($wt,$lev);
 935
 936                 if ($v2i && $ver9) {
 937                     if ($wt->[0] == 0) {
 938                         $to_be_pushed = FALSE if $last_is_variable;
 939                     } else {
 940                         $last_is_variable = ($wt->[0] == Var1Wt);
 941                     }
 942                 }
 943
 944                 if (@strWt && $wt->[0] == 0) {
 945                     push @{ $strWt[-1] }, $wt if $to_be_pushed;
 946                     $finPos[-1] = $strE->[$i][2];
 947                 } elsif ($to_be_pushed) {
 948                     $wt->[0] = 0 if $wt->[0] == Var1Wt;
 949                     push @strWt,  [ $wt ];
 950                     push @iniPos, $found_base ? NOMATCHPOS : $strE->[$i][1];
 951                     $finPos[-1] = NOMATCHPOS if $found_base;
 952                     push @finPos, $strE->[$i][2];
 953                     $found_base++;
 954                 }
 955                 # else ===> no-op
 956             }
 957             $i++;
 958         }
 959
 960         # try to match
 961         while ( @strWt > @subWt || (@strWt == @subWt && $i > $end) ) {
 962             if ($iniPos[0] != NOMATCHPOS &&
 963                     $finPos[$#subWt] != NOMATCHPOS &&
 964                         _eqArray(\@strWt, \@subWt, $lev)) {
 965                 my $temp = $iniPos[0] + $pos;
 966
 967                 if ($grob) {
 968                     push @g_ret, [$temp, $finPos[$#subWt] - $iniPos[0]];
 969                     splice @strWt,  0, $#subWt;
 970                     splice @iniPos, 0, $#subWt;
 971                     splice @finPos, 0, $#subWt;
 972                 }
 973                 else {
 974                     return wantarray
 975                         ? ($temp, $finPos[$#subWt] - $iniPos[0])
 976                         :  $temp;
 977                 }
 978             }
 979             shift @strWt;
 980             shift @iniPos;
 981             shift @finPos;
 982         }
 983     }
 984
 985     return $grob
 986         ? @g_ret
 987         : wantarray ? () : NOMATCHPOS;
 988 }
 989
 990 ##
 991 ## scalarref to matching part = match(string, substring)
 992 ##
 993 sub match
 994 {
 995     my $self = shift;
 996     if (my($pos,$len) = $self->index($_[0], $_[1])) {
 997         my $temp = substr($_[0], $pos, $len);
 998         return wantarray ? $temp : \$temp;
 999         # An lvalue ref \substr should be avoided,
1000         # since its value is affected by modification of its referent.
1001     }
1002     else {
1003         return;
1004     }
1005 }
1006
1007 ##
1008 ## arrayref matching parts = gmatch(string, substring)
1009 ##
1010 sub gmatch
1011 {
1012     my $self = shift;
1013     my $str  = shift;
1014     my $sub  = shift;
1015     return map substr($str, $_->[0], $_->[1]),
1016                 $self->index($str, $sub, 0, 'g');
1017 }
1018
1019 ##
1020 ## bool subst'ed = subst(string, substring, replace)
1021 ##
1022 sub subst
1023 {
1024     my $self = shift;
1025     my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
1026
1027     if (my($pos,$len) = $self->index($_[0], $_[1])) {
1028         if ($code) {
1029             my $mat = substr($_[0], $pos, $len);
1030             substr($_[0], $pos, $len, $code->($mat));
1031         } else {
1032             substr($_[0], $pos, $len, $_[2]);
1033         }
1034         return TRUE;
1035     }
1036     else {
1037         return FALSE;
1038     }
1039 }
1040
1041 ##
1042 ## int count = gsubst(string, substring, replace)
1043 ##
1044 sub gsubst
1045 {
1046     my $self = shift;
1047     my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
1048     my $cnt = 0;
1049
1050     # Replacement is carried out from the end, then use reverse.
1051     for my $pos_len (reverse $self->index($_[0], $_[1], 0, 'g')) {
1052         if ($code) {
1053             my $mat = substr($_[0], $pos_len->[0], $pos_len->[1]);
1054             substr($_[0], $pos_len->[0], $pos_len->[1], $code->($mat));
1055         } else {
1056             substr($_[0], $pos_len->[0], $pos_len->[1], $_[2]);
1057         }
1058         $cnt++;
1059     }
1060     return $cnt;
1061 }
1062
1063 1;
1064 __END__
1065
1066 =head1 NAME
1067
1068 Unicode::Collate - Unicode Collation Algorithm
1069
1070 =head1 SYNOPSIS
1071
1072   use Unicode::Collate;
1073
1074   #construct
1075   $Collator = Unicode::Collate->new(%tailoring);
1076
1077   #sort
1078   @sorted = $Collator->sort(@not_sorted);
1079
1080   #compare
1081   $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
1082
1083   # If %tailoring is false (i.e. empty),
1084   # $Collator should do the default collation.
1085
1086 =head1 DESCRIPTION
1087
1088 This module is an implementation
1089 of Unicode Technical Standard #10 (UTS #10)
1090 "Unicode Collation Algorithm."
1091
1092 =head2 Constructor and Tailoring
1093
1094 The C<new> method returns a collator object.
1095
1096    $Collator = Unicode::Collate->new(
1097       UCA_Version => $UCA_Version,
1098       alternate => $alternate, # deprecated: use of 'variable' is recommended.
1099       backwards => $levelNumber, # or \@levelNumbers
1100       entry => $element,
1101       hangul_terminator => $term_primary_weight,
1102       ignoreName => qr/$ignoreName/,
1103       ignoreChar => qr/$ignoreChar/,
1104       katakana_before_hiragana => $bool,
1105       level => $collationLevel,
1106       normalization  => $normalization_form,
1107       overrideCJK => \&overrideCJK,
1108       overrideHangul => \&overrideHangul,
1109       preprocess => \&preprocess,
1110       rearrange => \@charList,
1111       table => $filename,
1112       undefName => qr/$undefName/,
1113       undefChar => qr/$undefChar/,
1114       upper_before_lower => $bool,
1115       variable => $variable,
1116    );
1117
1118 =over 4
1119
1120 =item UCA_Version
1121
1122 If the tracking version number of the older UCA is given,
1123 the older behavior of that tracking version is emulated on collating.
1124 If omitted, the return value of C<UCA_Version()> is used.
1125
1126 The supported tracking version: 8, 9, or 11.
1127
1128 B<This parameter may be removed in the future version,
1129 as switching the algorithm would affect the performance.>
1130
1131 =item backwards
1132
1133 -- see 3.1.2 French Accents, UTS #10.
1134
1135      backwards => $levelNumber or \@levelNumbers
1136
1137 Weights in reverse order; ex. level 2 (diacritic ordering) in French.
1138 If omitted, forwards at all the levels.
1139
1140 =item entry
1141
1142 -- see 3.1 Linguistic Features; 3.2.1 File Format, UTS #10.
1143
1144 If the same character (or a sequence of characters) exists
1145 in the collation element table through C<table>,
1146 mapping to collation elements is overrided.
1147 If it does not exist, the mapping is defined additionally.
1148
1149   entry => <<'ENTRIES', # use the UCA file format
1150 00E6 ; [.0861.0020.0002.00E6] [.08B1.0020.0002.00E6] # ligature <ae> as <a><e>
1151 0063 0068 ; [.0893.0020.0002.0063]      # "ch" in traditional Spanish
1152 0043 0068 ; [.0893.0020.0008.0043]      # "Ch" in traditional Spanish
1153 ENTRIES
1154
1155 B<NOTE:> The code point in the UCA file format (before C<';'>)
1156 B<must> be a Unicode code point, but not a native code point.
1157 So C<0063> must always denote C<U+0063>,
1158 but not a character of C<"\x63">.
1159
1160 =item hangul_terminator
1161
1162 -- see Condition B.2. in 7.1.4 Trailing Weights, UTS #10.
1163
1164 If a true value is given (non-zero but should be positive),
1165 it will be added as a terminator primary weight to the end of
1166 every standard Hangul syllable. Secondary and any higher weights
1167 for terminator are set to zero.
1168 If the value is false or C<hangul_terminator> key does not exist,
1169 insertion of terminator weights will not be performed.
1170
1171 Boundaries of Hangul syllables are determined
1172 according to conjoining Jamo behavior in F<the Unicode Standard>
1173 and F<HangulSyllableType.txt>.
1174
1175 B<Implementation Note:>
1176 (1) For expansion mapping (Unicode character mapped
1177 to a sequence of collation elements), a terminator will not be added
1178 between collation elements, even if Hangul syllable boundary exists there.
1179 Addition of terminator is restricted to the next position
1180 to the last collation element.
1181
1182 (2) Non-conjoining Hangul letters
1183 (Compatibility Jamo, halfwidth Jamo, and enclosed letters) are not
1184 automatically terminated with a terminator primary weight.
1185 These characters may need terminator included in a collation element
1186 table beforehand.
1187
1188 =item ignoreName
1189
1190 =item ignoreChar
1191
1192 -- see Completely Ignorable, 3.2.2 Variable Weighting, UTS #10.
1193
1194 Makes the entry in the table completely ignorable;
1195 i.e. as if the weights were zero at all level.
1196
1197 E.g. when 'a' and 'e' are ignorable,
1198 'element' is equal to 'lament' (or 'lmnt').
1199
1200 =item level
1201
1202 -- see 4.3 Form a sort key for each string, UTS #10.
1203
1204 Set the maximum level.
1205 Any higher levels than the specified one are ignored.
1206
1207   Level 1: alphabetic ordering
1208   Level 2: diacritic ordering
1209   Level 3: case ordering
1210   Level 4: tie-breaking (e.g. in the case when variable is 'shifted')
1211
1212   ex.level => 2,
1213
1214 If omitted, the maximum is the 4th.
1215
1216 =item normalization
1217
1218 -- see 4.1 Normalize each input string, UTS #10.
1219
1220 If specified, strings are normalized before preparation of sort keys
1221 (the normalization is executed after preprocess).
1222
1223 A form name C<Unicode::Normalize::normalize()> accepts will be applied
1224 as C<$normalization_form>.
1225 Acceptable names include C<'NFD'>, C<'NFC'>, C<'NFKD'>, and C<'NFKC'>.
1226 See C<Unicode::Normalize::normalize()> for detail.
1227 If omitted, C<'NFD'> is used.
1228
1229 C<normalization> is performed after C<preprocess> (if defined).
1230
1231 Furthermore, special values, C<undef> and C<"prenormalized">, can be used,
1232 though they are not concerned with C<Unicode::Normalize::normalize()>.
1233
1234 If C<undef> (not a string C<"undef">) is passed explicitly
1235 as the value for this key,
1236 any normalization is not carried out (this may make tailoring easier
1237 if any normalization is not desired).
1238 Under C<(normalization =E<gt> undef)>, only contiguous contractions
1239 are resolved; e.g. C<A-cedilla-ring> would be primary equal to C<A>,
1240 even if C<A-ring> (and C<A-ring-cedilla>) is ordered after C<Z>.
1241 In this point,
1242 C<(normalization =E<gt> undef, preprocess =E<gt> sub { NFD(shift) })>
1243 B<is not> equivalent to C<(normalization =E<gt> 'NFD')>.
1244
1245 In the case of C<(normalization =E<gt> "prenormalized")>,
1246 any normalization is not performed, but
1247 non-contiguous contractions with combining characters are performed.
1248 Therefore
1249 C<(normalization =E<gt> 'prenormalized', preprocess =E<gt> sub { NFD(shift) })>
1250 B<is> equivalent to C<(normalization =E<gt> 'NFD')>.
1251 If source strings are finely prenormalized,
1252 C<(normalization =E<gt> 'prenormalized')> may save time for normalization.
1253
1254 Except C<(normalization =E<gt> undef)>,
1255 B<Unicode::Normalize> is required (see also B<CAVEAT>).
1256
1257 =item overrideCJK
1258
1259 -- see 7.1 Derived Collation Elements, UTS #10.
1260
1261 By default, CJK Unified Ideographs are ordered in Unicode codepoint order
1262 (but C<CJK Unified Ideographs> [C<U+4E00> to C<U+9FA5>]  are lesser than
1263 C<CJK Unified Ideographs Extension> [C<U+3400> to C<U+4DB5> and
1264 C<U+20000> to C<U+2A6D6>].
1265
1266 Through C<overrideCJK>, ordering of CJK Unified Ideographs can be overrided.
1267
1268 ex. CJK Unified Ideographs in the JIS code point order.
1269
1270   overrideCJK => sub {
1271       my $u = shift;             # get a Unicode codepoint
1272       my $b = pack('n', $u);     # to UTF-16BE
1273       my $s = your_unicode_to_sjis_converter($b); # convert
1274       my $n = unpack('n', $s);   # convert sjis to short
1275       [ $n, 0x20, 0x2, $u ];     # return the collation element
1276   },
1277
1278 ex. ignores all CJK Unified Ideographs.
1279
1280   overrideCJK => sub {()}, # CODEREF returning empty list
1281
1282    # where ->eq("Pe\x{4E00}rl", "Perl") is true
1283    # as U+4E00 is a CJK Unified Ideograph and to be ignorable.
1284
1285 If C<undef> is passed explicitly as the value for this key,
1286 weights for CJK Unified Ideographs are treated as undefined.
1287 But assignment of weight for CJK Unified Ideographs
1288 in table or C<entry> is still valid.
1289
1290 =item overrideHangul
1291
1292 -- see 7.1 Derived Collation Elements, UTS #10.
1293
1294 By default, Hangul Syllables are decomposed into Hangul Jamo.
1295 But the mapping of Hangul Syllables may be overrided.
1296
1297 This tag works like C<overrideCJK>, so see there for examples.
1298
1299 If you want to override the mapping of Hangul Syllables,
1300 the Normalization Forms D and KD are not appropriate
1301 (they will be decomposed before overriding).
1302
1303 If C<undef> is passed explicitly as the value for this key,
1304 weight for Hangul Syllables is treated as undefined
1305 without decomposition into Hangul Jamo.
1306 But definition of weight for Hangul Syllables
1307 in table or C<entry> is still valid.
1308
1309 =item preprocess
1310
1311 -- see 5.1 Preprocessing, UTS #10.
1312
1313 If specified, the coderef is used to preprocess
1314 before the formation of sort keys.
1315
1316 ex. dropping English articles, such as "a" or "the".
1317 Then, "the pen" is before "a pencil".
1318
1319      preprocess => sub {
1320            my $str = shift;
1321            $str =~ s/\b(?:an?|the)\s+//gi;
1322            return $str;
1323         },
1324
1325 C<preprocess> is performed before C<normalization> (if defined).
1326
1327 =item rearrange
1328
1329 -- see 3.1.3 Rearrangement, UTS #10.
1330
1331 Characters that are not coded in logical order and to be rearranged.
1332 By default,
1333
1334     rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
1335
1336 If you want to disallow any rearrangement,
1337 pass C<undef> or C<[]> (a reference to an empty list)
1338 as the value for this key.
1339
1340 B<According to the version 9 of UCA, this parameter shall not be used;
1341 but it is not warned at present.>
1342
1343 =item table
1344
1345 -- see 3.2 Default Unicode Collation Element Table, UTS #10.
1346
1347 You can use another collation element table if desired.
1348 The table file must be put into a directory
1349 where F<Unicode/Collate.pm> is installed; e.g. into
1350 F<perl/lib/Unicode/Collate/> if you have F<perl/lib/Unicode/Collate.pm>.
1351
1352 By default, the filename F<allkeys.txt> is used.
1353
1354 If C<undef> is passed explicitly as the value for this key,
1355 no file is read (but you can define collation elements via C<entry>).
1356
1357 A typical way to define a collation element table
1358 without any file of table:
1359
1360    $onlyABC = Unicode::Collate->new(
1361        table => undef,
1362        entry => << 'ENTRIES',
1363 0061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A
1364 0041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A
1365 0062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B
1366 0042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B
1367 0063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C
1368 0043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C
1369 ENTRIES
1370     );
1371
1372 =item undefName
1373
1374 =item undefChar
1375
1376 -- see 6.3.4 Reducing the Repertoire, UTS #10.
1377
1378 Undefines the collation element as if it were unassigned in the table.
1379 This reduces the size of the table.
1380 If an unassigned character appears in the string to be collated,
1381 the sort key is made from its codepoint
1382 as a single-character collation element,
1383 as it is greater than any other assigned collation elements
1384 (in the codepoint order among the unassigned characters).
1385 But, it'd be better to ignore characters
1386 unfamiliar to you and maybe never used.
1387
1388 =item katakana_before_hiragana
1389
1390 =item upper_before_lower
1391
1392 -- see 6.6 Case Comparisons; 7.3.1 Tertiary Weight Table, UTS #10.
1393
1394 By default, lowercase is before uppercase
1395 and hiragana is before katakana.
1396
1397 If the tag is made true, this is reversed.
1398
1399 B<NOTE>: These tags simplemindedly assume
1400 any lowercase/uppercase or hiragana/katakana distinctions
1401 must occur in level 3, and their weights at level 3
1402 must be same as those mentioned in 7.3.1, UTS #10.
1403 If you define your collation elements which violate this requirement,
1404 these tags don't work validly.
1405
1406 =item variable
1407
1408 =item alternate
1409
1410 -- see 3.2.2 Variable Weighting, UTS #10.
1411
1412 (the title in UCA version 8: Alternate Weighting)
1413
1414 This key allows to variable weighting for variable collation elements,
1415 which are marked with an ASTERISK in the table
1416 (NOTE: Many punction marks and symbols are variable in F<allkeys.txt>).
1417
1418    variable => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
1419
1420 These names are case-insensitive.
1421 By default (if specification is omitted), 'shifted' is adopted.
1422
1423    'Blanked'        Variable elements are made ignorable at levels 1 through 3;
1424                     considered at the 4th level.
1425
1426    'Non-ignorable'  Variable elements are not reset to ignorable.
1427
1428    'Shifted'        Variable elements are made ignorable at levels 1 through 3
1429                     their level 4 weight is replaced by the old level 1 weight.
1430                     Level 4 weight for Non-Variable elements is 0xFFFF.
1431
1432    'Shift-Trimmed'  Same as 'shifted', but all FFFF's at the 4th level
1433                     are trimmed.
1434
1435 For backward compatibility, C<alternate> can be used as an alias
1436 for C<variable>.
1437
1438 =back
1439
1440 =head2 Methods for Collation
1441
1442 =over 4
1443
1444 =item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
1445
1446 Sorts a list of strings.
1447
1448 =item C<$result = $Collator-E<gt>cmp($a, $b)>
1449
1450 Returns 1 (when C<$a> is greater than C<$b>)
1451 or 0 (when C<$a> is equal to C<$b>)
1452 or -1 (when C<$a> is lesser than C<$b>).
1453
1454 =item C<$result = $Collator-E<gt>eq($a, $b)>
1455
1456 =item C<$result = $Collator-E<gt>ne($a, $b)>
1457
1458 =item C<$result = $Collator-E<gt>lt($a, $b)>
1459
1460 =item C<$result = $Collator-E<gt>le($a, $b)>
1461
1462 =item C<$result = $Collator-E<gt>gt($a, $b)>
1463
1464 =item C<$result = $Collator-E<gt>ge($a, $b)>
1465
1466 They works like the same name operators as theirs.
1467
1468    eq : whether $a is equal to $b.
1469    ne : whether $a is not equal to $b.
1470    lt : whether $a is lesser than $b.
1471    le : whether $a is lesser than $b or equal to $b.
1472    gt : whether $a is greater than $b.
1473    ge : whether $a is greater than $b or equal to $b.
1474
1475 =item C<$sortKey = $Collator-E<gt>getSortKey($string)>
1476
1477 -- see 4.3 Form a sort key for each string, UTS #10.
1478
1479 Returns a sort key.
1480
1481 You compare the sort keys using a binary comparison
1482 and get the result of the comparison of the strings using UCA.
1483
1484    $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
1485
1486       is equivalent to
1487
1488    $Collator->cmp($a, $b)
1489
1490 =item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)>
1491
1492    use Unicode::Collate;
1493    my $c = Unicode::Collate->new();
1494    print $c->viewSortKey("Perl"),"\n";
1495
1496    # output:
1497    # [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF]
1498    #  Level 1               Level 2               Level 3               Level 4
1499
1500     (If C<UCA_Version> is 8, the output is slightly different.)
1501
1502 =back
1503
1504 =head2 Methods for Searching
1505
1506 B<DISCLAIMER:> If C<preprocess> or C<normalization> tag is true
1507 for C<$Collator>, calling these methods (C<index>, C<match>, C<gmatch>,
1508 C<subst>, C<gsubst>) is croaked,
1509 as the position and the length might differ
1510 from those on the specified string.
1511 (And C<rearrange> and C<hangul_terminator> tags are neglected.)
1512
1513 The C<match>, C<gmatch>, C<subst>, C<gsubst> methods work
1514 like C<m//>, C<m//g>, C<s///>, C<s///g>, respectively,
1515 but they are not aware of any pattern, but only a literal substring.
1516
1517 =over 4
1518
1519 =item C<$position = $Collator-E<gt>index($string, $substring[, $position])>
1520
1521 =item C<($position, $length) = $Collator-E<gt>index($string, $substring[, $position])>
1522
1523 If C<$substring> matches a part of C<$string>, returns
1524 the position of the first occurrence of the matching part in scalar context;
1525 in list context, returns a two-element list of
1526 the position and the length of the matching part.
1527
1528 If C<$substring> does not match any part of C<$string>,
1529 returns C<-1> in scalar context and
1530 an empty list in list context.
1531
1532 e.g. you say
1533
1534   my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1535                                      # (normalization => undef) is REQUIRED.
1536   my $str = "Ich muß studieren Perl.";
1537   my $sub = "MÜSS";
1538   my $match;
1539   if (my($pos,$len) = $Collator->index($str, $sub)) {
1540       $match = substr($str, $pos, $len);
1541   }
1542
1543 and get C<"muß"> in C<$match> since C<"muß">
1544 is primary equal to C<"MÜSS">.
1545
1546 =item C<$match_ref = $Collator-E<gt>match($string, $substring)>
1547
1548 =item C<($match)   = $Collator-E<gt>match($string, $substring)>
1549
1550 If C<$substring> matches a part of C<$string>, in scalar context, returns
1551 B<a reference to> the first occurrence of the matching part
1552 (C<$match_ref> is always true if matches,
1553 since every reference is B<true>);
1554 in list context, returns the first occurrence of the matching part.
1555
1556 If C<$substring> does not match any part of C<$string>,
1557 returns C<undef> in scalar context and
1558 an empty list in list context.
1559
1560 e.g.
1561
1562     if ($match_ref = $Collator->match($str, $sub)) { # scalar context
1563         print "matches [$$match_ref].\n";
1564     } else {
1565         print "doesn't match.\n";
1566     }
1567
1568      or
1569
1570     if (($match) = $Collator->match($str, $sub)) { # list context
1571         print "matches [$match].\n";
1572     } else {
1573         print "doesn't match.\n";
1574     }
1575
1576 =item C<@match = $Collator-E<gt>gmatch($string, $substring)>
1577
1578 If C<$substring> matches a part of C<$string>, returns
1579 all the matching parts (or matching count in scalar context).
1580
1581 If C<$substring> does not match any part of C<$string>,
1582 returns an empty list.
1583
1584 =item C<$count = $Collator-E<gt>subst($string, $substring, $replacement)>
1585
1586 If C<$substring> matches a part of C<$string>,
1587 the first occurrence of the matching part is replaced by C<$replacement>
1588 (C<$string> is modified) and return C<$count> (always equals to C<1>).
1589
1590 C<$replacement> can be a C<CODEREF>,
1591 taking the matching part as an argument,
1592 and returning a string to replace the matching part
1593 (a bit similar to C<s/(..)/$coderef-E<gt>($1)/e>).
1594
1595 =item C<$count = $Collator-E<gt>gsubst($string, $substring, $replacement)>
1596
1597 If C<$substring> matches a part of C<$string>,
1598 all the occurrences of the matching part is replaced by C<$replacement>
1599 (C<$string> is modified) and return C<$count>.
1600
1601 C<$replacement> can be a C<CODEREF>,
1602 taking the matching part as an argument,
1603 and returning a string to replace the matching part
1604 (a bit similar to C<s/(..)/$coderef-E<gt>($1)/eg>).
1605
1606 e.g.
1607
1608   my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1609                                      # (normalization => undef) is REQUIRED.
1610   my $str = "Camel ass came\x{301}l CAMEL horse cAm\0E\0L...";
1611   $Collator->gsubst($str, "camel", sub { "<b>$_[0]</b>" });
1612
1613   # now $str is "<b>Camel</b> ass <b>came\x{301}l</b> <b>CAMEL</b> horse <b>cAm\0E\0L</b>...";
1614   # i.e., all the camels are made bold-faced.
1615
1616 =back
1617
1618 =head2 Other Methods
1619
1620 =over 4
1621
1622 =item C<%old_tailoring = $Collator-E<gt>change(%new_tailoring)>
1623
1624 Change the value of specified keys and returns the changed part.
1625
1626     $Collator = Unicode::Collate->new(level => 4);
1627
1628     $Collator->eq("perl", "PERL"); # false
1629
1630     %old = $Collator->change(level => 2); # returns (level => 4).
1631
1632     $Collator->eq("perl", "PERL"); # true
1633
1634     $Collator->change(%old); # returns (level => 2).
1635
1636     $Collator->eq("perl", "PERL"); # false
1637
1638 Not all C<(key,value)>s are allowed to be changed.
1639 See also C<@Unicode::Collate::ChangeOK> and C<@Unicode::Collate::ChangeNG>.
1640
1641 In the scalar context, returns the modified collator
1642 (but it is B<not> a clone from the original).
1643
1644     $Collator->change(level => 2)->eq("perl", "PERL"); # true
1645
1646     $Collator->eq("perl", "PERL"); # true; now max level is 2nd.
1647
1648     $Collator->change(level => 4)->eq("perl", "PERL"); # false
1649
1650 =item C<$version = $Collator-E<gt>version()>
1651
1652 Returns the version number (a string) of the Unicode Standard
1653 which the C<table> file used by the collator object is based on.
1654 If the table does not include a version line (starting with C<@version>),
1655 returns C<"unknown">.
1656
1657 =item C<UCA_Version()>
1658
1659 Returns the tracking version number of UTS #10 this module consults.
1660
1661 =item C<Base_Unicode_Version()>
1662
1663 Returns the version number of UTS #10 this module consults.
1664
1665 =back
1666
1667 =head2 EXPORT
1668
1669 None by default.
1670
1671 =head2 CAVEAT
1672
1673 Use of the C<normalization> parameter requires
1674 the B<Unicode::Normalize> module.
1675
1676 If you need not it (say, in the case when you need not
1677 handle any combining characters),
1678 assign C<normalization =E<gt> undef> explicitly.
1679
1680 -- see 6.5 Avoiding Normalization, UTS #10.
1681
1682 =head2 Conformance Test
1683
1684 The Conformance Test for the UCA is available
1685 under L<http://www.unicode.org/Public/UCA/>.
1686
1687 For F<CollationTest_SHIFTED.txt>,
1688 a collator via C<Unicode::Collate-E<gt>new( )> should be used;
1689 for F<CollationTest_NON_IGNORABLE.txt>, a collator via
1690 C<Unicode::Collate-E<gt>new(variable =E<gt> "non-ignorable", level =E<gt> 3)>.
1691
1692 B<Unicode::Normalize is required to try The Conformance Test.>
1693
1694 =head1 AUTHOR
1695
1696 SADAHIRO Tomoyuki <SADAHIRO@cpan.org>
1697
1698   http://homepage1.nifty.com/nomenclator/perl/
1699
1700   Copyright(C) 2001-2003, SADAHIRO Tomoyuki. Japan. All rights reserved.
1701
1702   This library is free software; you can redistribute it
1703   and/or modify it under the same terms as Perl itself.
1704
1705 =head1 SEE ALSO
1706
1707 =over 4
1708
1709 =item Unicode Collation Algorithm - UTS #10
1710
1711 L<http://www.unicode.org/reports/tr10/>
1712
1713 =item The Default Unicode Collation Element Table (DUCET)
1714
1715 L<http://www.unicode.org/Public/UCA/latest/allkeys.txt>
1716
1717 =item The conformance test for the UCA
1718
1719 L<http://www.unicode.org/Public/UCA/latest/CollationTest.html>
1720
1721 L<http://www.unicode.org/Public/UCA/latest/CollationTest.zip>
1722
1723 =item Hangul Syllable Type
1724
1725 L<http://www.unicode.org/Public/UNIDATA/HangulSyllableType.txt>
1726
1727 =item Unicode Normalization Forms - UAX #15
1728
1729 L<http://www.unicode.org/reports/tr15/>
1730
1731 =item L<Unicode::Normalize>
1732
1733 =back
1734
1735 =cut