lib/Unicode/Collate.pm

   1 package Unicode::Collate;
   2
   3 BEGIN {
   4     unless ("A" eq pack('U', 0x41)) {
   5         die "Unicode::Collate cannot stringify a Unicode code point\n";
   6     }
   7 }
   8
   9 use 5.006;
  10 use strict;
  11 use warnings;
  12 use Carp;
  13 use File::Spec;
  14
  15 no warnings 'utf8';
  16
  17 our $VERSION = '0.51';
  18 our $PACKAGE = __PACKAGE__;
  19
  20 my @Path = qw(Unicode Collate);
  21 my $KeyFile = "allkeys.txt";
  22
  23 # Perl's boolean
  24 use constant TRUE  => 1;
  25 use constant FALSE => "";
  26 use constant NOMATCHPOS => -1;
  27
  28 # A coderef to get combining class imported from Unicode::Normalize
  29 # (i.e. \&Unicode::Normalize::getCombinClass).
  30 # This is also used as a HAS_UNICODE_NORMALIZE flag.
  31 my $CVgetCombinClass;
  32
  33 # Supported Levels
  34 use constant MinLevel => 1;
  35 use constant MaxLevel => 4;
  36
  37 # Minimum weights at level 2 and 3, respectively
  38 use constant Min2Wt => 0x20;
  39 use constant Min3Wt => 0x02;
  40
  41 # Shifted weight at 4th level
  42 use constant Shift4Wt => 0xFFFF;
  43
  44 # A boolean for Variable and 16-bit weights at 4 levels of Collation Element
  45 # PROBLEM: The Default Unicode Collation Element Table
  46 # has weights over 0xFFFF at the 4th level.
  47 # The tie-breaking in the variable weights
  48 # other than "shift" (as well as "shift-trimmed") is unreliable.
  49 use constant VCE_TEMPLATE => 'Cn4';
  50
  51 # A sort key: 16-bit weights
  52 # See also the PROBLEM on VCE_TEMPLATE above.
  53 use constant KEY_TEMPLATE => 'n*';
  54
  55 # Level separator in a sort key:
  56 # i.e. pack(KEY_TEMPLATE, 0)
  57 use constant LEVEL_SEP => "\0\0";
  58
  59 # As Unicode code point separator for hash keys.
  60 # A joined code point string (denoted by JCPS below)
  61 # like "65;768" is used for internal processing
  62 # instead of Perl's Unicode string like "\x41\x{300}",
  63 # as the native code point is different from the Unicode code point
  64 # on EBCDIC platform.
  65 # This character must not be included in any stringified
  66 # representation of an integer.
  67 use constant CODE_SEP => ';';
  68
  69 # boolean values of variable weights
  70 use constant NON_VAR => 0; # Non-Variable character
  71 use constant VAR     => 1; # Variable character
  72
  73 # specific code points
  74 use constant Hangul_LBase  => 0x1100;
  75 use constant Hangul_LIni   => 0x1100;
  76 use constant Hangul_LFin   => 0x1159;
  77 use constant Hangul_LFill  => 0x115F;
  78 use constant Hangul_VBase  => 0x1161;
  79 use constant Hangul_VIni   => 0x1160; # from Vowel Filler
  80 use constant Hangul_VFin   => 0x11A2;
  81 use constant Hangul_TBase  => 0x11A7; # from "no-final" codepoint
  82 use constant Hangul_TIni   => 0x11A8;
  83 use constant Hangul_TFin   => 0x11F9;
  84 use constant Hangul_TCount => 28;
  85 use constant Hangul_NCount => 588;
  86 use constant Hangul_SBase  => 0xAC00;
  87 use constant Hangul_SIni   => 0xAC00;
  88 use constant Hangul_SFin   => 0xD7A3;
  89 use constant CJK_UidIni    => 0x4E00;
  90 use constant CJK_UidFin    => 0x9FA5;
  91 use constant CJK_UidF41    => 0x9FBB;
  92 use constant CJK_ExtAIni   => 0x3400;
  93 use constant CJK_ExtAFin   => 0x4DB5;
  94 use constant CJK_ExtBIni   => 0x20000;
  95 use constant CJK_ExtBFin   => 0x2A6D6;
  96 use constant BMP_Max       => 0xFFFF;
  97
  98 # Logical_Order_Exception in PropList.txt
  99 my $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
 100
 101 sub UCA_Version { "14" }
 102
 103 sub Base_Unicode_Version { "4.1.0" }
 104
 105 ######
 106
 107 sub pack_U {
 108     return pack('U*', @_);
 109 }
 110
 111 sub unpack_U {
 112     return unpack('U*', shift(@_).pack('U*'));
 113 }
 114
 115 ######
 116
 117 my (%VariableOK);
 118 @VariableOK{ qw/
 119     blanked  non-ignorable  shifted  shift-trimmed
 120   / } = (); # keys lowercased
 121
 122 our @ChangeOK = qw/
 123     alternate backwards level normalization rearrange
 124     katakana_before_hiragana upper_before_lower
 125     overrideHangul overrideCJK preprocess UCA_Version
 126     hangul_terminator variable
 127   /;
 128
 129 our @ChangeNG = qw/
 130     entry mapping table maxlength
 131     ignoreChar ignoreName undefChar undefName variableTable
 132     versionTable alternateTable backwardsTable forwardsTable rearrangeTable
 133     derivCode normCode rearrangeHash
 134     backwardsFlag
 135   /;
 136 # The hash key 'ignored' is deleted at v 0.21.
 137 # The hash key 'isShift' is deleted at v 0.23.
 138 # The hash key 'combining' is deleted at v 0.24.
 139 # The hash key 'entries' is deleted at v 0.30.
 140 # The hash key 'L3_ignorable' is deleted at v 0.40.
 141
 142 sub version {
 143     my $self = shift;
 144     return $self->{versionTable} || 'unknown';
 145 }
 146
 147 my (%ChangeOK, %ChangeNG);
 148 @ChangeOK{ @ChangeOK } = ();
 149 @ChangeNG{ @ChangeNG } = ();
 150
 151 sub change {
 152     my $self = shift;
 153     my %hash = @_;
 154     my %old;
 155     if (exists $hash{variable} && exists $hash{alternate}) {
 156         delete $hash{alternate};
 157     }
 158     elsif (!exists $hash{variable} && exists $hash{alternate}) {
 159         $hash{variable} = $hash{alternate};
 160     }
 161     foreach my $k (keys %hash) {
 162         if (exists $ChangeOK{$k}) {
 163             $old{$k} = $self->{$k};
 164             $self->{$k} = $hash{$k};
 165         }
 166         elsif (exists $ChangeNG{$k}) {
 167             croak "change of $k via change() is not allowed!";
 168         }
 169         # else => ignored
 170     }
 171     $self->checkCollator();
 172     return wantarray ? %old : $self;
 173 }
 174
 175 sub _checkLevel {
 176     my $level = shift;
 177     my $key   = shift; # 'level' or 'backwards'
 178     MinLevel <= $level or croak sprintf
 179         "Illegal level %d (in value for key '%s') lower than %d.",
 180             $level, $key, MinLevel;
 181     $level <= MaxLevel or croak sprintf
 182         "Unsupported level %d (in value for key '%s') higher than %d.",
 183             $level, $key, MaxLevel;
 184 }
 185
 186 my %DerivCode = (
 187     8 => \&_derivCE_8,
 188     9 => \&_derivCE_9,
 189    11 => \&_derivCE_9, # 11 == 9
 190    14 => \&_derivCE_14,
 191 );
 192
 193 sub checkCollator {
 194     my $self = shift;
 195     _checkLevel($self->{level}, "level");
 196
 197     $self->{derivCode} = $DerivCode{ $self->{UCA_Version} }
 198         or croak "Illegal UCA version (passed $self->{UCA_Version}).";
 199
 200     $self->{variable} ||= $self->{alternate} || $self->{variableTable} ||
 201                                 $self->{alternateTable} || 'shifted';
 202     $self->{variable} = $self->{alternate} = lc($self->{variable});
 203     exists $VariableOK{ $self->{variable} }
 204         or croak "$PACKAGE unknown variable parameter name: $self->{variable}";
 205
 206     if (! defined $self->{backwards}) {
 207         $self->{backwardsFlag} = 0;
 208     }
 209     elsif (! ref $self->{backwards}) {
 210         _checkLevel($self->{backwards}, "backwards");
 211         $self->{backwardsFlag} = 1 << $self->{backwards};
 212     }
 213     else {
 214         my %level;
 215         $self->{backwardsFlag} = 0;
 216         for my $b (@{ $self->{backwards} }) {
 217             _checkLevel($b, "backwards");
 218             $level{$b} = 1;
 219         }
 220         for my $v (sort keys %level) {
 221             $self->{backwardsFlag} += 1 << $v;
 222         }
 223     }
 224
 225     defined $self->{rearrange} or $self->{rearrange} = [];
 226     ref $self->{rearrange}
 227         or croak "$PACKAGE: list for rearrangement must be store in ARRAYREF";
 228
 229     # keys of $self->{rearrangeHash} are $self->{rearrange}.
 230     $self->{rearrangeHash} = undef;
 231
 232     if (@{ $self->{rearrange} }) {
 233         @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
 234     }
 235
 236     $self->{normCode} = undef;
 237
 238     if (defined $self->{normalization}) {
 239         eval { require Unicode::Normalize };
 240         $@ and croak "Unicode::Normalize is required to normalize strings";
 241
 242         $CVgetCombinClass ||= \&Unicode::Normalize::getCombinClass;
 243
 244         if ($self->{normalization} =~ /^(?:NF)D\z/) { # tweak for default
 245             $self->{normCode} = \&Unicode::Normalize::NFD;
 246         }
 247         elsif ($self->{normalization} ne 'prenormalized') {
 248             my $norm = $self->{normalization};
 249             $self->{normCode} = sub {
 250                 Unicode::Normalize::normalize($norm, shift);
 251             };
 252             eval { $self->{normCode}->("") }; # try
 253             $@ and croak "$PACKAGE unknown normalization form name: $norm";
 254         }
 255     }
 256     return;
 257 }
 258
 259 sub new
 260 {
 261     my $class = shift;
 262     my $self = bless { @_ }, $class;
 263
 264     # If undef is passed explicitly, no file is read.
 265     $self->{table} = $KeyFile if ! exists $self->{table};
 266     $self->read_table() if defined $self->{table};
 267
 268     if ($self->{entry}) {
 269         while ($self->{entry} =~ /([^\n]+)/g) {
 270             $self->parseEntry($1);
 271         }
 272     }
 273
 274     $self->{level} ||= MaxLevel;
 275     $self->{UCA_Version} ||= UCA_Version();
 276
 277     $self->{overrideHangul} = FALSE
 278         if ! exists $self->{overrideHangul};
 279     $self->{overrideCJK} = FALSE
 280         if ! exists $self->{overrideCJK};
 281     $self->{normalization} = 'NFD'
 282         if ! exists $self->{normalization};
 283     $self->{rearrange} = $self->{rearrangeTable} ||
 284         ($self->{UCA_Version} <= 11 ? $DefaultRearrange : [])
 285         if ! exists $self->{rearrange};
 286     $self->{backwards} = $self->{backwardsTable}
 287         if ! exists $self->{backwards};
 288
 289     $self->checkCollator();
 290
 291     return $self;
 292 }
 293
 294 sub read_table {
 295     my $self = shift;
 296
 297     my($f, $fh);
 298     foreach my $d (@INC) {
 299         $f = File::Spec->catfile($d, @Path, $self->{table});
 300         last if open($fh, $f);
 301         $f = undef;
 302     }
 303     if (!defined $f) {
 304         $f = File::Spec->catfile(@Path, $self->{table});
 305         croak("$PACKAGE: Can't locate $f in \@INC (\@INC contains: @INC)");
 306     }
 307
 308     while (<$fh>) {
 309         next if /^\s*#/;
 310         unless (s/^\s*\@//) {
 311             $self->parseEntry($_);
 312             next;
 313         }
 314
 315         if (/^version\s*(\S*)/) {
 316             $self->{versionTable} ||= $1;
 317         }
 318         elsif (/^variable\s+(\S*)/) { # since UTS #10-9
 319             $self->{variableTable} ||= $1;
 320         }
 321         elsif (/^alternate\s+(\S*)/) { # till UTS #10-8
 322             $self->{alternateTable} ||= $1;
 323         }
 324         elsif (/^backwards\s+(\S*)/) {
 325             push @{ $self->{backwardsTable} }, $1;
 326         }
 327         elsif (/^forwards\s+(\S*)/) { # parhaps no use
 328             push @{ $self->{forwardsTable} }, $1;
 329         }
 330         elsif (/^rearrange\s+(.*)/) { # (\S*) is NG
 331             push @{ $self->{rearrangeTable} }, _getHexArray($1);
 332         }
 333     }
 334     close $fh;
 335 }
 336
 337
 338 ##
 339 ## get $line, parse it, and write an entry in $self
 340 ##
 341 sub parseEntry
 342 {
 343     my $self = shift;
 344     my $line = shift;
 345     my($name, $entry, @uv, @key);
 346
 347     return if $line !~ /^\s*[0-9A-Fa-f]/;
 348
 349     # removes comment and gets name
 350     $name = $1
 351         if $line =~ s/[#%]\s*(.*)//;
 352     return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
 353
 354     # gets element
 355     my($e, $k) = split /;/, $line;
 356     croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
 357         if ! $k;
 358
 359     @uv = _getHexArray($e);
 360     return if !@uv;
 361
 362     $entry = join(CODE_SEP, @uv); # in JCPS
 363
 364     if (defined $self->{undefChar} || defined $self->{ignoreChar}) {
 365         my $ele = pack_U(@uv);
 366
 367         # regarded as if it were not entried in the table
 368         return
 369             if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
 370
 371         # replaced as completely ignorable
 372         $k = '[.0000.0000.0000.0000]'
 373             if defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/;
 374     }
 375
 376     # replaced as completely ignorable
 377     $k = '[.0000.0000.0000.0000]'
 378         if defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/;
 379
 380     my $is_L3_ignorable = TRUE;
 381
 382     foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
 383         my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
 384         my @wt = _getHexArray($arr);
 385         push @key, pack(VCE_TEMPLATE, $var, @wt);
 386         $is_L3_ignorable = FALSE
 387             if $wt[0] || $wt[1] || $wt[2];
 388         # Conformance Test for 3.1.1 and 4.0.0 shows Level 3 ignorable
 389         # is completely ignorable.
 390         # For expansion, an entry $is_L3_ignorable
 391         # if and only if "all" CEs are [.0000.0000.0000].
 392     }
 393
 394     $self->{mapping}{$entry} = $is_L3_ignorable ? [] : \@key;
 395
 396     if (@uv > 1) {
 397         (!$self->{maxlength}{$uv[0]} || $self->{maxlength}{$uv[0]} < @uv)
 398             and $self->{maxlength}{$uv[0]} = @uv;
 399     }
 400 }
 401
 402
 403 ##
 404 ## VCE = _varCE(variable term, VCE)
 405 ##
 406 sub _varCE
 407 {
 408     my $vbl = shift;
 409     my $vce = shift;
 410     if ($vbl eq 'non-ignorable') {
 411         return $vce;
 412     }
 413     my ($var, @wt) = unpack VCE_TEMPLATE, $vce;
 414
 415     if ($var) {
 416         return pack(VCE_TEMPLATE, $var, 0, 0, 0,
 417                 $vbl eq 'blanked' ? $wt[3] : $wt[0]);
 418     }
 419     elsif ($vbl eq 'blanked') {
 420         return $vce;
 421     }
 422     else {
 423         return pack(VCE_TEMPLATE, $var, @wt[0..2],
 424             $vbl eq 'shifted' && $wt[0]+$wt[1]+$wt[2] ? Shift4Wt : 0);
 425     }
 426 }
 427
 428 sub viewSortKey
 429 {
 430     my $self = shift;
 431     $self->visualizeSortKey($self->getSortKey(@_));
 432 }
 433
 434 sub visualizeSortKey
 435 {
 436     my $self = shift;
 437     my $view = join " ", map sprintf("%04X", $_), unpack(KEY_TEMPLATE, shift);
 438
 439     if ($self->{UCA_Version} <= 8) {
 440         $view =~ s/ ?0000 ?/|/g;
 441     } else {
 442         $view =~ s/\b0000\b/|/g;
 443     }
 444     return "[$view]";
 445 }
 446
 447
 448 ##
 449 ## arrayref of JCPS   = splitEnt(string to be collated)
 450 ## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitEnt(string, true)
 451 ##
 452 sub splitEnt
 453 {
 454     my $self = shift;
 455     my $wLen = $_[1];
 456
 457     my $code = $self->{preprocess};
 458     my $norm = $self->{normCode};
 459     my $map  = $self->{mapping};
 460     my $max  = $self->{maxlength};
 461     my $reH  = $self->{rearrangeHash};
 462     my $ver9 = $self->{UCA_Version} >= 9 && $self->{UCA_Version} <= 11;
 463
 464     my ($str, @buf);
 465
 466     if ($wLen) {
 467         $code and croak "Preprocess breaks character positions. "
 468                         . "Don't use with index(), match(), etc.";
 469         $norm and croak "Normalization breaks character positions. "
 470                         . "Don't use with index(), match(), etc.";
 471         $str = $_[0];
 472     }
 473     else {
 474         $str = $_[0];
 475         $str = &$code($str) if ref $code;
 476         $str = &$norm($str) if ref $norm;
 477     }
 478
 479     # get array of Unicode code point of string.
 480     my @src = unpack_U($str);
 481
 482     # rearrangement:
 483     # Character positions are not kept if rearranged,
 484     # then neglected if $wLen is true.
 485     if ($reH && ! $wLen) {
 486         for (my $i = 0; $i < @src; $i++) {
 487             if (exists $reH->{ $src[$i] } && $i + 1 < @src) {
 488                 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]);
 489                 $i++;
 490             }
 491         }
 492     }
 493
 494     # remove a code point marked as a completely ignorable.
 495     for (my $i = 0; $i < @src; $i++) {
 496         $src[$i] = undef
 497             if _isIllegal($src[$i]) || ($ver9 &&
 498                 $map->{ $src[$i] } && @{ $map->{ $src[$i] } } == 0);
 499     }
 500
 501     for (my $i = 0; $i < @src; $i++) {
 502         my $jcps = $src[$i];
 503
 504         # skip removed code point
 505         if (! defined $jcps) {
 506             if ($wLen && @buf) {
 507                 $buf[-1][2] = $i + 1;
 508             }
 509             next;
 510         }
 511
 512         my $i_orig = $i;
 513
 514         # find contraction
 515         if ($max->{$jcps}) {
 516             my $temp_jcps = $jcps;
 517             my $jcpsLen = 1;
 518             my $maxLen = $max->{$jcps};
 519
 520             for (my $p = $i + 1; $jcpsLen < $maxLen && $p < @src; $p++) {
 521                 next if ! defined $src[$p];
 522                 $temp_jcps .= CODE_SEP . $src[$p];
 523                 $jcpsLen++;
 524                 if ($map->{$temp_jcps}) {
 525                     $jcps = $temp_jcps;
 526                     $i = $p;
 527                 }
 528             }
 529
 530         # not-contiguous contraction with Combining Char (cf. UTS#10, S2.1).
 531         # This process requires Unicode::Normalize.
 532         # If "normalization" is undef, here should be skipped *always*
 533         # (in spite of bool value of $CVgetCombinClass),
 534         # since canonical ordering cannot be expected.
 535         # Blocked combining character should not be contracted.
 536
 537             if ($self->{normalization})
 538             # $self->{normCode} is false in the case of "prenormalized".
 539             {
 540                 my $preCC = 0;
 541                 my $curCC = 0;
 542
 543                 for (my $p = $i + 1; $p < @src; $p++) {
 544                     next if ! defined $src[$p];
 545                     $curCC = $CVgetCombinClass->($src[$p]);
 546                     last unless $curCC;
 547                     my $tail = CODE_SEP . $src[$p];
 548                     if ($preCC != $curCC && $map->{$jcps.$tail}) {
 549                         $jcps .= $tail;
 550                         $src[$p] = undef;
 551                     } else {
 552                         $preCC = $curCC;
 553                     }
 554                 }
 555             }
 556         }
 557
 558         # skip completely ignorable
 559         if ($map->{$jcps} && @{ $map->{$jcps} } == 0) {
 560             if ($wLen && @buf) {
 561                 $buf[-1][2] = $i + 1;
 562             }
 563             next;
 564         }
 565
 566         push @buf, $wLen ? [$jcps, $i_orig, $i + 1] : $jcps;
 567     }
 568     return \@buf;
 569 }
 570
 571
 572 ##
 573 ## list of VCE = getWt(JCPS)
 574 ##
 575 sub getWt
 576 {
 577     my $self = shift;
 578     my $u    = shift;
 579     my $vbl  = $self->{variable};
 580     my $map  = $self->{mapping};
 581     my $der  = $self->{derivCode};
 582
 583     return if !defined $u;
 584     return map(_varCE($vbl, $_), @{ $map->{$u} })
 585         if $map->{$u};
 586
 587     # JCPS must not be a contraction, then it's a code point.
 588     if (Hangul_SIni <= $u && $u <= Hangul_SFin) {
 589         my $hang = $self->{overrideHangul};
 590         my @hangulCE;
 591         if ($hang) {
 592             @hangulCE = map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$hang($u));
 593         }
 594         elsif (!defined $hang) {
 595             @hangulCE = $der->($u);
 596         }
 597         else {
 598             my $max  = $self->{maxlength};
 599             my @decH = _decompHangul($u);
 600
 601             if (@decH == 2) {
 602                 my $contract = join(CODE_SEP, @decH);
 603                 @decH = ($contract) if $map->{$contract};
 604             } else { # must be <@decH == 3>
 605                 if ($max->{$decH[0]}) {
 606                     my $contract = join(CODE_SEP, @decH);
 607                     if ($map->{$contract}) {
 608                         @decH = ($contract);
 609                     } else {
 610                         $contract = join(CODE_SEP, @decH[0,1]);
 611                         $map->{$contract} and @decH = ($contract, $decH[2]);
 612                     }
 613                     # even if V's ignorable, LT contraction is not supported.
 614                     # If such a situatution were required, NFD should be used.
 615                 }
 616                 if (@decH == 3 && $max->{$decH[1]}) {
 617                     my $contract = join(CODE_SEP, @decH[1,2]);
 618                     $map->{$contract} and @decH = ($decH[0], $contract);
 619                 }
 620             }
 621
 622             @hangulCE = map({
 623                     $map->{$_} ? @{ $map->{$_} } : $der->($_);
 624                 } @decH);
 625         }
 626         return map _varCE($vbl, $_), @hangulCE;
 627     }
 628     elsif (_isUIdeo($u, $self->{UCA_Version})) {
 629         my $cjk  = $self->{overrideCJK};
 630         return map _varCE($vbl, $_),
 631             $cjk
 632                 ? map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$cjk($u))
 633                 : defined $cjk && $self->{UCA_Version} <= 8 && $u < 0x10000
 634                     ? _uideoCE_8($u)
 635                     : $der->($u);
 636     }
 637     else {
 638         return map _varCE($vbl, $_), $der->($u);
 639     }
 640 }
 641
 642
 643 ##
 644 ## string sortkey = getSortKey(string arg)
 645 ##
 646 sub getSortKey
 647 {
 648     my $self = shift;
 649     my $lev  = $self->{level};
 650     my $rEnt = $self->splitEnt(shift); # get an arrayref of JCPS
 651     my $v2i  = $self->{UCA_Version} >= 9 &&
 652                 $self->{variable} ne 'non-ignorable';
 653
 654     my @buf; # weight arrays
 655     if ($self->{hangul_terminator}) {
 656         my $preHST = '';
 657         foreach my $jcps (@$rEnt) {
 658             # weird things like VL, TL-contraction are not considered!
 659             my $curHST = '';
 660             foreach my $u (split /;/, $jcps) {
 661                 $curHST .= getHST($u);
 662             }
 663             if ($preHST && !$curHST || # hangul before non-hangul
 664                 $preHST =~ /L\z/ && $curHST =~ /^T/ ||
 665                 $preHST =~ /V\z/ && $curHST =~ /^L/ ||
 666                 $preHST =~ /T\z/ && $curHST =~ /^[LV]/) {
 667
 668                 push @buf, $self->getWtHangulTerm();
 669             }
 670             $preHST = $curHST;
 671
 672             push @buf, $self->getWt($jcps);
 673         }
 674         $preHST # end at hangul
 675             and push @buf, $self->getWtHangulTerm();
 676     }
 677     else {
 678         foreach my $jcps (@$rEnt) {
 679             push @buf, $self->getWt($jcps);
 680         }
 681     }
 682
 683     # make sort key
 684     my @ret = ([],[],[],[]);
 685     my $last_is_variable;
 686
 687     foreach my $vwt (@buf) {
 688         my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
 689
 690         # "Ignorable (L1, L2) after Variable" since track. v. 9
 691         if ($v2i) {
 692             if ($var) {
 693                 $last_is_variable = TRUE;
 694             }
 695             elsif (!$wt[0]) { # ignorable
 696                 next if $last_is_variable;
 697             }
 698             else {
 699                 $last_is_variable = FALSE;
 700             }
 701         }
 702         foreach my $v (0..$lev-1) {
 703             0 < $wt[$v] and push @{ $ret[$v] }, $wt[$v];
 704         }
 705     }
 706
 707     # modification of tertiary weights
 708     if ($self->{upper_before_lower}) {
 709         foreach (@{ $ret[2] }) {
 710             if    (0x8 <= $_ && $_ <= 0xC) { $_ -= 6 } # lower
 711             elsif (0x2 <= $_ && $_ <= 0x6) { $_ += 6 } # upper
 712             elsif ($_ == 0x1C)             { $_ += 1 } # square upper
 713             elsif ($_ == 0x1D)             { $_ -= 1 } # square lower
 714         }
 715     }
 716     if ($self->{katakana_before_hiragana}) {
 717         foreach (@{ $ret[2] }) {
 718             if    (0x0F <= $_ && $_ <= 0x13) { $_ -= 2 } # katakana
 719             elsif (0x0D <= $_ && $_ <= 0x0E) { $_ += 5 } # hiragana
 720         }
 721     }
 722
 723     if ($self->{backwardsFlag}) {
 724         for (my $v = MinLevel; $v <= MaxLevel; $v++) {
 725             if ($self->{backwardsFlag} & (1 << $v)) {
 726                 @{ $ret[$v-1] } = reverse @{ $ret[$v-1] };
 727             }
 728         }
 729     }
 730
 731     join LEVEL_SEP, map pack(KEY_TEMPLATE, @$_), @ret;
 732 }
 733
 734
 735 ##
 736 ## int compare = cmp(string a, string b)
 737 ##
 738 sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
 739 sub eq  { $_[0]->getSortKey($_[1]) eq  $_[0]->getSortKey($_[2]) }
 740 sub ne  { $_[0]->getSortKey($_[1]) ne  $_[0]->getSortKey($_[2]) }
 741 sub lt  { $_[0]->getSortKey($_[1]) lt  $_[0]->getSortKey($_[2]) }
 742 sub le  { $_[0]->getSortKey($_[1]) le  $_[0]->getSortKey($_[2]) }
 743 sub gt  { $_[0]->getSortKey($_[1]) gt  $_[0]->getSortKey($_[2]) }
 744 sub ge  { $_[0]->getSortKey($_[1]) ge  $_[0]->getSortKey($_[2]) }
 745
 746 ##
 747 ## list[strings] sorted = sort(list[strings] arg)
 748 ##
 749 sub sort {
 750     my $obj = shift;
 751     return
 752         map { $_->[1] }
 753             sort{ $a->[0] cmp $b->[0] }
 754                 map [ $obj->getSortKey($_), $_ ], @_;
 755 }
 756
 757
 758 sub _derivCE_14 {
 759     my $u = shift;
 760     my $base =
 761         (CJK_UidIni  <= $u && $u <= CJK_UidF41)
 762             ? 0xFB40 : # CJK
 763         (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
 764          CJK_ExtBIni <= $u && $u <= CJK_ExtBFin)
 765             ? 0xFB80   # CJK ext.
 766             : 0xFBC0;  # others
 767
 768     my $aaaa = $base + ($u >> 15);
 769     my $bbbb = ($u & 0x7FFF) | 0x8000;
 770     return
 771         pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
 772         pack(VCE_TEMPLATE, NON_VAR, $bbbb,      0,      0, $u);
 773 }
 774
 775 sub _derivCE_9 {
 776     my $u = shift;
 777     my $base =
 778         (CJK_UidIni  <= $u && $u <= CJK_UidFin)
 779             ? 0xFB40 : # CJK
 780         (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
 781          CJK_ExtBIni <= $u && $u <= CJK_ExtBFin)
 782             ? 0xFB80   # CJK ext.
 783             : 0xFBC0;  # others
 784
 785     my $aaaa = $base + ($u >> 15);
 786     my $bbbb = ($u & 0x7FFF) | 0x8000;
 787     return
 788         pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
 789         pack(VCE_TEMPLATE, NON_VAR, $bbbb,      0,      0, $u);
 790 }
 791
 792 sub _derivCE_8 {
 793     my $code = shift;
 794     my $aaaa =  0xFF80 + ($code >> 15);
 795     my $bbbb = ($code & 0x7FFF) | 0x8000;
 796     return
 797         pack(VCE_TEMPLATE, NON_VAR, $aaaa, 2, 1, $code),
 798         pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $code);
 799 }
 800
 801 sub _uideoCE_8 {
 802     my $u = shift;
 803     return pack(VCE_TEMPLATE, NON_VAR, $u, Min2Wt, Min3Wt, $u);
 804 }
 805
 806 sub _isUIdeo {
 807     my ($u, $uca_vers) = @_;
 808     return(
 809         (CJK_UidIni <= $u &&
 810             ($uca_vers >= 14 ? ( $u <= CJK_UidF41) : ($u <= CJK_UidFin)))
 811                 ||
 812         (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin)
 813                 ||
 814         (CJK_ExtBIni <= $u && $u <= CJK_ExtBFin)
 815     );
 816 }
 817
 818
 819 sub getWtHangulTerm {
 820     my $self = shift;
 821     return _varCE($self->{variable},
 822         pack(VCE_TEMPLATE, NON_VAR, $self->{hangul_terminator}, 0,0,0));
 823 }
 824
 825
 826 ##
 827 ## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
 828 ##
 829 sub _getHexArray { map hex, $_[0] =~ /([0-9a-fA-F]+)/g }
 830
 831 #
 832 # $code *must* be in Hangul syllable.
 833 # Check it before you enter here.
 834 #
 835 sub _decompHangul {
 836     my $code = shift;
 837     my $si = $code - Hangul_SBase;
 838     my $li = int( $si / Hangul_NCount);
 839     my $vi = int(($si % Hangul_NCount) / Hangul_TCount);
 840     my $ti =      $si % Hangul_TCount;
 841     return (
 842         Hangul_LBase + $li,
 843         Hangul_VBase + $vi,
 844         $ti ? (Hangul_TBase + $ti) : (),
 845     );
 846 }
 847
 848 sub _isIllegal {
 849     my $code = shift;
 850     return ! defined $code                      # removed
 851         || ($code < 0 || 0x10FFFF < $code)      # out of range
 852         || (($code & 0xFFFE) == 0xFFFE)         # ??FFF[EF] (cf. utf8.c)
 853         || (0xD800 <= $code && $code <= 0xDFFF) # unpaired surrogates
 854         || (0xFDD0 <= $code && $code <= 0xFDEF) # other non-characters
 855     ;
 856 }
 857
 858 # Hangul Syllable Type
 859 sub getHST {
 860     my $u = shift;
 861     return
 862         Hangul_LIni <= $u && $u <= Hangul_LFin || $u == Hangul_LFill ? "L" :
 863         Hangul_VIni <= $u && $u <= Hangul_VFin       ? "V" :
 864         Hangul_TIni <= $u && $u <= Hangul_TFin       ? "T" :
 865         Hangul_SIni <= $u && $u <= Hangul_SFin ?
 866             ($u - Hangul_SBase) % Hangul_TCount ? "LVT" : "LV" : "";
 867 }
 868
 869
 870 ##
 871 ## bool _nonIgnorAtLevel(arrayref weights, int level)
 872 ##
 873 sub _nonIgnorAtLevel($$)
 874 {
 875     my $wt = shift;
 876     return if ! defined $wt;
 877     my $lv = shift;
 878     return grep($wt->[$_-1] != 0, MinLevel..$lv) ? TRUE : FALSE;
 879 }
 880
 881 ##
 882 ## bool _eqArray(
 883 ##    arrayref of arrayref[weights] source,
 884 ##    arrayref of arrayref[weights] substr,
 885 ##    int level)
 886 ## * comparison of graphemes vs graphemes.
 887 ##   @$source >= @$substr must be true (check it before call this);
 888 ##
 889 sub _eqArray($$$)
 890 {
 891     my $source = shift;
 892     my $substr = shift;
 893     my $lev = shift;
 894
 895     for my $g (0..@$substr-1){
 896         # Do the $g'th graphemes have the same number of AV weigths?
 897         return if @{ $source->[$g] } != @{ $substr->[$g] };
 898
 899         for my $w (0..@{ $substr->[$g] }-1) {
 900             for my $v (0..$lev-1) {
 901                 return if $source->[$g][$w][$v] != $substr->[$g][$w][$v];
 902             }
 903         }
 904     }
 905     return 1;
 906 }
 907
 908 ##
 909 ## (int position, int length)
 910 ## int position = index(string, substring, position, [undoc'ed grobal])
 911 ##
 912 ## With "grobal" (only for the list context),
 913 ##  returns list of arrayref[position, length].
 914 ##
 915 sub index
 916 {
 917     my $self = shift;
 918     my $str  = shift;
 919     my $len  = length($str);
 920     my $subE = $self->splitEnt(shift);
 921     my $pos  = @_ ? shift : 0;
 922        $pos  = 0 if $pos < 0;
 923     my $grob = shift;
 924
 925     my $lev  = $self->{level};
 926     my $v2i  = $self->{UCA_Version} >= 9 &&
 927                 $self->{variable} ne 'non-ignorable';
 928
 929     if (! @$subE) {
 930         my $temp = $pos <= 0 ? 0 : $len <= $pos ? $len : $pos;
 931         return $grob
 932             ? map([$_, 0], $temp..$len)
 933             : wantarray ? ($temp,0) : $temp;
 934     }
 935     $len < $pos
 936         and return wantarray ? () : NOMATCHPOS;
 937     my $strE = $self->splitEnt($pos ? substr($str, $pos) : $str, TRUE);
 938     @$strE
 939         or return wantarray ? () : NOMATCHPOS;
 940
 941     my(@strWt, @iniPos, @finPos, @subWt, @g_ret);
 942
 943     my $last_is_variable;
 944     for my $vwt (map $self->getWt($_), @$subE) {
 945         my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
 946         my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);
 947
 948         # "Ignorable (L1, L2) after Variable" since track. v. 9
 949         if ($v2i) {
 950             if ($var) {
 951                 $last_is_variable = TRUE;
 952             }
 953             elsif (!$wt[0]) { # ignorable
 954                 $to_be_pushed = FALSE if $last_is_variable;
 955             }
 956             else {
 957                 $last_is_variable = FALSE;
 958             }
 959         }
 960
 961         if (@subWt && !$var && !$wt[0]) {
 962             push @{ $subWt[-1] }, \@wt if $to_be_pushed;
 963         } else {
 964             push @subWt, [ \@wt ];
 965         }
 966     }
 967
 968     my $count = 0;
 969     my $end = @$strE - 1;
 970
 971     $last_is_variable = FALSE; # reuse
 972     for (my $i = 0; $i <= $end; ) { # no $i++
 973         my $found_base = 0;
 974
 975         # fetch a grapheme
 976         while ($i <= $end && $found_base == 0) {
 977             for my $vwt ($self->getWt($strE->[$i][0])) {
 978                 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
 979                 my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);
 980
 981                 # "Ignorable (L1, L2) after Variable" since track. v. 9
 982                 if ($v2i) {
 983                     if ($var) {
 984                         $last_is_variable = TRUE;
 985                     }
 986                     elsif (!$wt[0]) { # ignorable
 987                         $to_be_pushed = FALSE if $last_is_variable;
 988                     }
 989                     else {
 990                         $last_is_variable = FALSE;
 991                     }
 992                 }
 993
 994                 if (@strWt && !$var && !$wt[0]) {
 995                     push @{ $strWt[-1] }, \@wt if $to_be_pushed;
 996                     $finPos[-1] = $strE->[$i][2];
 997                 } elsif ($to_be_pushed) {
 998                     push @strWt, [ \@wt ];
 999                     push @iniPos, $found_base ? NOMATCHPOS : $strE->[$i][1];
1000                     $finPos[-1] = NOMATCHPOS if $found_base;
1001                     push @finPos, $strE->[$i][2];
1002                     $found_base++;
1003                 }
1004                 # else ===> no-op
1005             }
1006             $i++;
1007         }
1008
1009         # try to match
1010         while ( @strWt > @subWt || (@strWt == @subWt && $i > $end) ) {
1011             if ($iniPos[0] != NOMATCHPOS &&
1012                     $finPos[$#subWt] != NOMATCHPOS &&
1013                         _eqArray(\@strWt, \@subWt, $lev)) {
1014                 my $temp = $iniPos[0] + $pos;
1015
1016                 if ($grob) {
1017                     push @g_ret, [$temp, $finPos[$#subWt] - $iniPos[0]];
1018                     splice @strWt,  0, $#subWt;
1019                     splice @iniPos, 0, $#subWt;
1020                     splice @finPos, 0, $#subWt;
1021                 }
1022                 else {
1023                     return wantarray
1024                         ? ($temp, $finPos[$#subWt] - $iniPos[0])
1025                         :  $temp;
1026                 }
1027             }
1028             shift @strWt;
1029             shift @iniPos;
1030             shift @finPos;
1031         }
1032     }
1033
1034     return $grob
1035         ? @g_ret
1036         : wantarray ? () : NOMATCHPOS;
1037 }
1038
1039 ##
1040 ## scalarref to matching part = match(string, substring)
1041 ##
1042 sub match
1043 {
1044     my $self = shift;
1045     if (my($pos,$len) = $self->index($_[0], $_[1])) {
1046         my $temp = substr($_[0], $pos, $len);
1047         return wantarray ? $temp : \$temp;
1048         # An lvalue ref \substr should be avoided,
1049         # since its value is affected by modification of its referent.
1050     }
1051     else {
1052         return;
1053     }
1054 }
1055
1056 ##
1057 ## arrayref matching parts = gmatch(string, substring)
1058 ##
1059 sub gmatch
1060 {
1061     my $self = shift;
1062     my $str  = shift;
1063     my $sub  = shift;
1064     return map substr($str, $_->[0], $_->[1]),
1065                 $self->index($str, $sub, 0, 'g');
1066 }
1067
1068 ##
1069 ## bool subst'ed = subst(string, substring, replace)
1070 ##
1071 sub subst
1072 {
1073     my $self = shift;
1074     my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
1075
1076     if (my($pos,$len) = $self->index($_[0], $_[1])) {
1077         if ($code) {
1078             my $mat = substr($_[0], $pos, $len);
1079             substr($_[0], $pos, $len, $code->($mat));
1080         } else {
1081             substr($_[0], $pos, $len, $_[2]);
1082         }
1083         return TRUE;
1084     }
1085     else {
1086         return FALSE;
1087     }
1088 }
1089
1090 ##
1091 ## int count = gsubst(string, substring, replace)
1092 ##
1093 sub gsubst
1094 {
1095     my $self = shift;
1096     my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
1097     my $cnt = 0;
1098
1099     # Replacement is carried out from the end, then use reverse.
1100     for my $pos_len (reverse $self->index($_[0], $_[1], 0, 'g')) {
1101         if ($code) {
1102             my $mat = substr($_[0], $pos_len->[0], $pos_len->[1]);
1103             substr($_[0], $pos_len->[0], $pos_len->[1], $code->($mat));
1104         } else {
1105             substr($_[0], $pos_len->[0], $pos_len->[1], $_[2]);
1106         }
1107         $cnt++;
1108     }
1109     return $cnt;
1110 }
1111
1112 1;
1113 __END__
1114
1115 =head1 NAME
1116
1117 Unicode::Collate - Unicode Collation Algorithm
1118
1119 =head1 SYNOPSIS
1120
1121   use Unicode::Collate;
1122
1123   #construct
1124   $Collator = Unicode::Collate->new(%tailoring);
1125
1126   #sort
1127   @sorted = $Collator->sort(@not_sorted);
1128
1129   #compare
1130   $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
1131
1132   # If %tailoring is false (i.e. empty),
1133   # $Collator should do the default collation.
1134
1135 =head1 DESCRIPTION
1136
1137 This module is an implementation of Unicode Technical Standard #10
1138 (a.k.a. UTS #10) - Unicode Collation Algorithm (a.k.a. UCA).
1139
1140 =head2 Constructor and Tailoring
1141
1142 The C<new> method returns a collator object.
1143
1144    $Collator = Unicode::Collate->new(
1145       UCA_Version => $UCA_Version,
1146       alternate => $alternate, # deprecated: use of 'variable' is recommended.
1147       backwards => $levelNumber, # or \@levelNumbers
1148       entry => $element,
1149       hangul_terminator => $term_primary_weight,
1150       ignoreName => qr/$ignoreName/,
1151       ignoreChar => qr/$ignoreChar/,
1152       katakana_before_hiragana => $bool,
1153       level => $collationLevel,
1154       normalization  => $normalization_form,
1155       overrideCJK => \&overrideCJK,
1156       overrideHangul => \&overrideHangul,
1157       preprocess => \&preprocess,
1158       rearrange => \@charList,
1159       table => $filename,
1160       undefName => qr/$undefName/,
1161       undefChar => qr/$undefChar/,
1162       upper_before_lower => $bool,
1163       variable => $variable,
1164    );
1165
1166 =over 4
1167
1168 =item UCA_Version
1169
1170 If the tracking version number of UCA is given,
1171 behavior of that tracking version is emulated on collating.
1172 If omitted, the return value of C<UCA_Version()> is used.
1173 C<UCA_Version()> should return the latest tracking version supported.
1174
1175 The supported tracking version: 8, 9, 11, or 14.
1176
1177      UCA       Unicode Standard         DUCET (@version)
1178      ---------------------------------------------------
1179       8              3.1                3.0.1 (3.0.1d9)
1180       9     3.1 with Corrigendum 3      3.1.1 (3.1.1)
1181      11              4.0                4.0.0 (4.0.0)
1182      14             4.1.0               4.1.0 (4.1.0)
1183
1184 Note: Recent UTS #10 renames "Tracking Version" to "Revision."
1185
1186 =item alternate
1187
1188 -- see 3.2.2 Alternate Weighting, version 8 of UTS #10
1189
1190 For backward compatibility, C<alternate> (old name) can be used
1191 as an alias for C<variable>.
1192
1193 =item backwards
1194
1195 -- see 3.1.2 French Accents, UTS #10.
1196
1197      backwards => $levelNumber or \@levelNumbers
1198
1199 Weights in reverse order; ex. level 2 (diacritic ordering) in French.
1200 If omitted, forwards at all the levels.
1201
1202 =item entry
1203
1204 -- see 3.1 Linguistic Features; 3.2.1 File Format, UTS #10.
1205
1206 If the same character (or a sequence of characters) exists
1207 in the collation element table through C<table>,
1208 mapping to collation elements is overrided.
1209 If it does not exist, the mapping is defined additionally.
1210
1211     entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
1212 0063 0068 ; [.0E6A.0020.0002.0063] # ch
1213 0043 0068 ; [.0E6A.0020.0007.0043] # Ch
1214 0043 0048 ; [.0E6A.0020.0008.0043] # CH
1215 006C 006C ; [.0F4C.0020.0002.006C] # ll
1216 004C 006C ; [.0F4C.0020.0007.004C] # Ll
1217 004C 004C ; [.0F4C.0020.0008.004C] # LL
1218 00F1      ; [.0F7B.0020.0002.00F1] # n-tilde
1219 006E 0303 ; [.0F7B.0020.0002.00F1] # n-tilde
1220 00D1      ; [.0F7B.0020.0008.00D1] # N-tilde
1221 004E 0303 ; [.0F7B.0020.0008.00D1] # N-tilde
1222 ENTRY
1223
1224     entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
1225 00E6 ; [.0E33.0020.0002.00E6][.0E8B.0020.0002.00E6] # ae ligature as <a><e>
1226 00C6 ; [.0E33.0020.0008.00C6][.0E8B.0020.0008.00C6] # AE ligature as <A><E>
1227 ENTRY
1228
1229 B<NOTE:> The code point in the UCA file format (before C<';'>)
1230 B<must> be a Unicode code point (defined as hexadecimal),
1231 but not a native code point.
1232 So C<0063> must always denote C<U+0063>,
1233 but not a character of C<"\x63">.
1234
1235 Weighting may vary depending on collation element table.
1236 So ensure the weights defined in C<entry> will be consistent with
1237 those in the collation element table loaded via C<table>.
1238
1239 In DUCET v4.0.0, primary weight of C<C> is C<0E60>
1240 and that of C<D> is C<0E6D>. So setting primary weight of C<CH> to C<0E6A>
1241 (as a value between C<0E60> and C<0E6D>)
1242 makes ordering as C<C E<lt> CH E<lt> D>.
1243 Exactly speaking DUCET already has some characters between C<C> and C<D>:
1244 C<small capital C> (C<U+1D04>) with primary weight C<0E64>,
1245 C<c-hook/C-hook> (C<U+0188/U+0187>) with C<0E65>,
1246 and C<c-curl> (C<U+0255>) with C<0E69>.
1247 Then primary weight C<0E6A> for C<CH> makes C<CH>
1248 ordered between C<c-curl> and C<D>.
1249
1250 =item hangul_terminator
1251
1252 -- see 7.1.4 Trailing Weights, UTS #10.
1253
1254 If a true value is given (non-zero but should be positive),
1255 it will be added as a terminator primary weight to the end of
1256 every standard Hangul syllable. Secondary and any higher weights
1257 for terminator are set to zero.
1258 If the value is false or C<hangul_terminator> key does not exist,
1259 insertion of terminator weights will not be performed.
1260
1261 Boundaries of Hangul syllables are determined
1262 according to conjoining Jamo behavior in F<the Unicode Standard>
1263 and F<HangulSyllableType.txt>.
1264
1265 B<Implementation Note:>
1266 (1) For expansion mapping (Unicode character mapped
1267 to a sequence of collation elements), a terminator will not be added
1268 between collation elements, even if Hangul syllable boundary exists there.
1269 Addition of terminator is restricted to the next position
1270 to the last collation element.
1271
1272 (2) Non-conjoining Hangul letters
1273 (Compatibility Jamo, halfwidth Jamo, and enclosed letters) are not
1274 automatically terminated with a terminator primary weight.
1275 These characters may need terminator included in a collation element
1276 table beforehand.
1277
1278 =item ignoreChar
1279
1280 =item ignoreName
1281
1282 -- see 3.2.2 Variable Weighting, UTS #10.
1283
1284 Makes the entry in the table completely ignorable;
1285 i.e. as if the weights were zero at all level.
1286
1287 Through C<ignoreChar>, any character matching C<qr/$ignoreChar/>
1288 will be ignored. Through C<ignoreName>, any character whose name
1289 (given in the C<table> file as a comment) matches C<qr/$ignoreName/>
1290 will be ignored.
1291
1292 E.g. when 'a' and 'e' are ignorable,
1293 'element' is equal to 'lament' (or 'lmnt').
1294
1295 =item katakana_before_hiragana
1296
1297 -- see 7.3.1 Tertiary Weight Table, UTS #10.
1298
1299 By default, hiragana is before katakana.
1300 If the parameter is made true, this is reversed.
1301
1302 B<NOTE>: This parameter simplemindedly assumes that any hiragana/katakana
1303 distinctions must occur in level 3, and their weights at level 3 must be
1304 same as those mentioned in 7.3.1, UTS #10.
1305 If you define your collation elements which violate this requirement,
1306 this parameter does not work validly.
1307
1308 =item level
1309
1310 -- see 4.3 Form Sort Key, UTS #10.
1311
1312 Set the maximum level.
1313 Any higher levels than the specified one are ignored.
1314
1315   Level 1: alphabetic ordering
1316   Level 2: diacritic ordering
1317   Level 3: case ordering
1318   Level 4: tie-breaking (e.g. in the case when variable is 'shifted')
1319
1320   ex.level => 2,
1321
1322 If omitted, the maximum is the 4th.
1323
1324 =item normalization
1325
1326 -- see 4.1 Normalize, UTS #10.
1327
1328 If specified, strings are normalized before preparation of sort keys
1329 (the normalization is executed after preprocess).
1330
1331 A form name C<Unicode::Normalize::normalize()> accepts will be applied
1332 as C<$normalization_form>.
1333 Acceptable names include C<'NFD'>, C<'NFC'>, C<'NFKD'>, and C<'NFKC'>.
1334 See C<Unicode::Normalize::normalize()> for detail.
1335 If omitted, C<'NFD'> is used.
1336
1337 C<normalization> is performed after C<preprocess> (if defined).
1338
1339 Furthermore, special values, C<undef> and C<"prenormalized">, can be used,
1340 though they are not concerned with C<Unicode::Normalize::normalize()>.
1341
1342 If C<undef> (not a string C<"undef">) is passed explicitly
1343 as the value for this key,
1344 any normalization is not carried out (this may make tailoring easier
1345 if any normalization is not desired). Under C<(normalization =E<gt> undef)>,
1346 only contiguous contractions are resolved;
1347 e.g. even if C<A-ring> (and C<A-ring-cedilla>) is ordered after C<Z>,
1348 C<A-cedilla-ring> would be primary equal to C<A>.
1349 In this point,
1350 C<(normalization =E<gt> undef, preprocess =E<gt> sub { NFD(shift) })>
1351 B<is not> equivalent to C<(normalization =E<gt> 'NFD')>.
1352
1353 In the case of C<(normalization =E<gt> "prenormalized")>,
1354 any normalization is not performed, but
1355 non-contiguous contractions with combining characters are performed.
1356 Therefore
1357 C<(normalization =E<gt> 'prenormalized', preprocess =E<gt> sub { NFD(shift) })>
1358 B<is> equivalent to C<(normalization =E<gt> 'NFD')>.
1359 If source strings are finely prenormalized,
1360 C<(normalization =E<gt> 'prenormalized')> may save time for normalization.
1361
1362 Except C<(normalization =E<gt> undef)>,
1363 B<Unicode::Normalize> is required (see also B<CAVEAT>).
1364
1365 =item overrideCJK
1366
1367 -- see 7.1 Derived Collation Elements, UTS #10.
1368
1369 By default, CJK Unified Ideographs are ordered in Unicode codepoint order
1370 but C<CJK Unified Ideographs> (if C<UCA_Version> is 8 to 11, its range is
1371 C<U+4E00..U+9FA5>; if C<UCA_Version> is 14, its range is C<U+4E00..U+9FBB>)
1372 are lesser than C<CJK Unified Ideographs Extension> (its range is
1373 C<U+3400..U+4DB5> and C<U+20000..U+2A6D6>).
1374
1375 Through C<overrideCJK>, ordering of CJK Unified Ideographs can be overrided.
1376
1377 ex. CJK Unified Ideographs in the JIS code point order.
1378
1379   overrideCJK => sub {
1380       my $u = shift;             # get a Unicode codepoint
1381       my $b = pack('n', $u);     # to UTF-16BE
1382       my $s = your_unicode_to_sjis_converter($b); # convert
1383       my $n = unpack('n', $s);   # convert sjis to short
1384       [ $n, 0x20, 0x2, $u ];     # return the collation element
1385   },
1386
1387 ex. ignores all CJK Unified Ideographs.
1388
1389   overrideCJK => sub {()}, # CODEREF returning empty list
1390
1391    # where ->eq("Pe\x{4E00}rl", "Perl") is true
1392    # as U+4E00 is a CJK Unified Ideograph and to be ignorable.
1393
1394 If C<undef> is passed explicitly as the value for this key,
1395 weights for CJK Unified Ideographs are treated as undefined.
1396 But assignment of weight for CJK Unified Ideographs
1397 in table or C<entry> is still valid.
1398
1399 =item overrideHangul
1400
1401 -- see 7.1 Derived Collation Elements, UTS #10.
1402
1403 By default, Hangul Syllables are decomposed into Hangul Jamo,
1404 even if C<(normalization =E<gt> undef)>.
1405 But the mapping of Hangul Syllables may be overrided.
1406
1407 This parameter works like C<overrideCJK>, so see there for examples.
1408
1409 If you want to override the mapping of Hangul Syllables,
1410 NFD, NFKD, and FCD are not appropriate,
1411 since they will decompose Hangul Syllables before overriding.
1412
1413 If C<undef> is passed explicitly as the value for this key,
1414 weight for Hangul Syllables is treated as undefined
1415 without decomposition into Hangul Jamo.
1416 But definition of weight for Hangul Syllables
1417 in table or C<entry> is still valid.
1418
1419 =item preprocess
1420
1421 -- see 5.1 Preprocessing, UTS #10.
1422
1423 If specified, the coderef is used to preprocess
1424 before the formation of sort keys.
1425
1426 ex. dropping English articles, such as "a" or "the".
1427 Then, "the pen" is before "a pencil".
1428
1429      preprocess => sub {
1430            my $str = shift;
1431            $str =~ s/\b(?:an?|the)\s+//gi;
1432            return $str;
1433         },
1434
1435 C<preprocess> is performed before C<normalization> (if defined).
1436
1437 =item rearrange
1438
1439 -- see 3.1.3 Rearrangement, UTS #10.
1440
1441 Characters that are not coded in logical order and to be rearranged.
1442 If C<UCA_Version> is equal to or lesser than 11, default is:
1443
1444     rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
1445
1446 If you want to disallow any rearrangement, pass C<undef> or C<[]>
1447 (a reference to empty list) as the value for this key.
1448
1449 If C<UCA_Version> is equal to 14, default is C<[]> (i.e. no rearrangement).
1450
1451 B<According to the version 9 of UCA, this parameter shall not be used;
1452 but it is not warned at present.>
1453
1454 =item table
1455
1456 -- see 3.2 Default Unicode Collation Element Table, UTS #10.
1457
1458 You can use another collation element table if desired.
1459
1460 The table file should locate in the F<Unicode/Collate> directory
1461 on C<@INC>. Say, if the filename is F<Foo.txt>,
1462 the table file is searched as F<Unicode/Collate/Foo.txt> in C<@INC>.
1463
1464 By default, F<allkeys.txt> (as the filename of DUCET) is used.
1465 If you will prepare your own table file, any name other than F<allkeys.txt>
1466 may be better to avoid namespace conflict.
1467
1468 If C<undef> is passed explicitly as the value for this key,
1469 no file is read (but you can define collation elements via C<entry>).
1470
1471 A typical way to define a collation element table
1472 without any file of table:
1473
1474    $onlyABC = Unicode::Collate->new(
1475        table => undef,
1476        entry => << 'ENTRIES',
1477 0061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A
1478 0041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A
1479 0062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B
1480 0042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B
1481 0063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C
1482 0043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C
1483 ENTRIES
1484     );
1485
1486 If C<ignoreName> or C<undefName> is used, character names should be
1487 specified as a comment (following C<#>) on each line.
1488
1489 =item undefChar
1490
1491 =item undefName
1492
1493 -- see 6.3.4 Reducing the Repertoire, UTS #10.
1494
1495 Undefines the collation element as if it were unassigned in the table.
1496 This reduces the size of the table.
1497 If an unassigned character appears in the string to be collated,
1498 the sort key is made from its codepoint
1499 as a single-character collation element,
1500 as it is greater than any other assigned collation elements
1501 (in the codepoint order among the unassigned characters).
1502 But, it'd be better to ignore characters
1503 unfamiliar to you and maybe never used.
1504
1505 Through C<undefChar>, any character matching C<qr/$undefChar/>
1506 will be undefined. Through C<undefName>, any character whose name
1507 (given in the C<table> file as a comment) matches C<qr/$undefName/>
1508 will be undefined.
1509
1510 ex. Collation weights for beyond-BMP characters are not stored in object:
1511
1512     undefChar => qr/[^\0-\x{fffd}]/,
1513
1514 =item upper_before_lower
1515
1516 -- see 6.6 Case Comparisons, UTS #10.
1517
1518 By default, lowercase is before uppercase.
1519 If the parameter is made true, this is reversed.
1520
1521 B<NOTE>: This parameter simplemindedly assumes that any lowercase/uppercase
1522 distinctions must occur in level 3, and their weights at level 3 must be
1523 same as those mentioned in 7.3.1, UTS #10.
1524 If you define your collation elements which differs from this requirement,
1525 this parameter doesn't work validly.
1526
1527 =item variable
1528
1529 -- see 3.2.2 Variable Weighting, UTS #10.
1530
1531 This key allows to variable weighting for variable collation elements,
1532 which are marked with an ASTERISK in the table
1533 (NOTE: Many punction marks and symbols are variable in F<allkeys.txt>).
1534
1535    variable => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
1536
1537 These names are case-insensitive.
1538 By default (if specification is omitted), 'shifted' is adopted.
1539
1540    'Blanked'        Variable elements are made ignorable at levels 1 through 3;
1541                     considered at the 4th level.
1542
1543    'Non-Ignorable'  Variable elements are not reset to ignorable.
1544
1545    'Shifted'        Variable elements are made ignorable at levels 1 through 3
1546                     their level 4 weight is replaced by the old level 1 weight.
1547                     Level 4 weight for Non-Variable elements is 0xFFFF.
1548
1549    'Shift-Trimmed'  Same as 'shifted', but all FFFF's at the 4th level
1550                     are trimmed.
1551
1552 =back
1553
1554 =head2 Methods for Collation
1555
1556 =over 4
1557
1558 =item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
1559
1560 Sorts a list of strings.
1561
1562 =item C<$result = $Collator-E<gt>cmp($a, $b)>
1563
1564 Returns 1 (when C<$a> is greater than C<$b>)
1565 or 0 (when C<$a> is equal to C<$b>)
1566 or -1 (when C<$a> is lesser than C<$b>).
1567
1568 =item C<$result = $Collator-E<gt>eq($a, $b)>
1569
1570 =item C<$result = $Collator-E<gt>ne($a, $b)>
1571
1572 =item C<$result = $Collator-E<gt>lt($a, $b)>
1573
1574 =item C<$result = $Collator-E<gt>le($a, $b)>
1575
1576 =item C<$result = $Collator-E<gt>gt($a, $b)>
1577
1578 =item C<$result = $Collator-E<gt>ge($a, $b)>
1579
1580 They works like the same name operators as theirs.
1581
1582    eq : whether $a is equal to $b.
1583    ne : whether $a is not equal to $b.
1584    lt : whether $a is lesser than $b.
1585    le : whether $a is lesser than $b or equal to $b.
1586    gt : whether $a is greater than $b.
1587    ge : whether $a is greater than $b or equal to $b.
1588
1589 =item C<$sortKey = $Collator-E<gt>getSortKey($string)>
1590
1591 -- see 4.3 Form Sort Key, UTS #10.
1592
1593 Returns a sort key.
1594
1595 You compare the sort keys using a binary comparison
1596 and get the result of the comparison of the strings using UCA.
1597
1598    $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
1599
1600       is equivalent to
1601
1602    $Collator->cmp($a, $b)
1603
1604 =item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)>
1605
1606 Converts a sorting key into its representation form.
1607 If C<UCA_Version> is 8, the output is slightly different.
1608
1609    use Unicode::Collate;
1610    my $c = Unicode::Collate->new();
1611    print $c->viewSortKey("Perl"),"\n";
1612
1613    # output:
1614    # [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF]
1615    #  Level 1               Level 2               Level 3               Level 4
1616
1617 =back
1618
1619 =head2 Methods for Searching
1620
1621 B<DISCLAIMER:> If C<preprocess> or C<normalization> parameter is true
1622 for C<$Collator>, calling these methods (C<index>, C<match>, C<gmatch>,
1623 C<subst>, C<gsubst>) is croaked,
1624 as the position and the length might differ
1625 from those on the specified string.
1626 (And C<rearrange> and C<hangul_terminator> parameters are neglected.)
1627
1628 The C<match>, C<gmatch>, C<subst>, C<gsubst> methods work
1629 like C<m//>, C<m//g>, C<s///>, C<s///g>, respectively,
1630 but they are not aware of any pattern, but only a literal substring.
1631
1632 =over 4
1633
1634 =item C<$position = $Collator-E<gt>index($string, $substring[, $position])>
1635
1636 =item C<($position, $length) = $Collator-E<gt>index($string, $substring[, $position])>
1637
1638 If C<$substring> matches a part of C<$string>, returns
1639 the position of the first occurrence of the matching part in scalar context;
1640 in list context, returns a two-element list of
1641 the position and the length of the matching part.
1642
1643 If C<$substring> does not match any part of C<$string>,
1644 returns C<-1> in scalar context and
1645 an empty list in list context.
1646
1647 e.g. you say
1648
1649   my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1650                                      # (normalization => undef) is REQUIRED.
1651   my $str = "Ich muß studieren Perl.";
1652   my $sub = "MÜSS";
1653   my $match;
1654   if (my($pos,$len) = $Collator->index($str, $sub)) {
1655       $match = substr($str, $pos, $len);
1656   }
1657
1658 and get C<"muß"> in C<$match> since C<"muß">
1659 is primary equal to C<"MÜSS">.
1660
1661 =item C<$match_ref = $Collator-E<gt>match($string, $substring)>
1662
1663 =item C<($match)   = $Collator-E<gt>match($string, $substring)>
1664
1665 If C<$substring> matches a part of C<$string>, in scalar context, returns
1666 B<a reference to> the first occurrence of the matching part
1667 (C<$match_ref> is always true if matches,
1668 since every reference is B<true>);
1669 in list context, returns the first occurrence of the matching part.
1670
1671 If C<$substring> does not match any part of C<$string>,
1672 returns C<undef> in scalar context and
1673 an empty list in list context.
1674
1675 e.g.
1676
1677     if ($match_ref = $Collator->match($str, $sub)) { # scalar context
1678         print "matches [$$match_ref].\n";
1679     } else {
1680         print "doesn't match.\n";
1681     }
1682
1683      or
1684
1685     if (($match) = $Collator->match($str, $sub)) { # list context
1686         print "matches [$match].\n";
1687     } else {
1688         print "doesn't match.\n";
1689     }
1690
1691 =item C<@match = $Collator-E<gt>gmatch($string, $substring)>
1692
1693 If C<$substring> matches a part of C<$string>, returns
1694 all the matching parts (or matching count in scalar context).
1695
1696 If C<$substring> does not match any part of C<$string>,
1697 returns an empty list.
1698
1699 =item C<$count = $Collator-E<gt>subst($string, $substring, $replacement)>
1700
1701 If C<$substring> matches a part of C<$string>,
1702 the first occurrence of the matching part is replaced by C<$replacement>
1703 (C<$string> is modified) and return C<$count> (always equals to C<1>).
1704
1705 C<$replacement> can be a C<CODEREF>,
1706 taking the matching part as an argument,
1707 and returning a string to replace the matching part
1708 (a bit similar to C<s/(..)/$coderef-E<gt>($1)/e>).
1709
1710 =item C<$count = $Collator-E<gt>gsubst($string, $substring, $replacement)>
1711
1712 If C<$substring> matches a part of C<$string>,
1713 all the occurrences of the matching part is replaced by C<$replacement>
1714 (C<$string> is modified) and return C<$count>.
1715
1716 C<$replacement> can be a C<CODEREF>,
1717 taking the matching part as an argument,
1718 and returning a string to replace the matching part
1719 (a bit similar to C<s/(..)/$coderef-E<gt>($1)/eg>).
1720
1721 e.g.
1722
1723   my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1724                                      # (normalization => undef) is REQUIRED.
1725   my $str = "Camel donkey zebra came\x{301}l CAMEL horse cAm\0E\0L...";
1726   $Collator->gsubst($str, "camel", sub { "<b>$_[0]</b>" });
1727
1728   # now $str is "<b>Camel</b> donkey zebra <b>came\x{301}l</b> <b>CAMEL</b> horse <b>cAm\0E\0L</b>...";
1729   # i.e., all the camels are made bold-faced.
1730
1731 =back
1732
1733 =head2 Other Methods
1734
1735 =over 4
1736
1737 =item C<%old_tailoring = $Collator-E<gt>change(%new_tailoring)>
1738
1739 Change the value of specified keys and returns the changed part.
1740
1741     $Collator = Unicode::Collate->new(level => 4);
1742
1743     $Collator->eq("perl", "PERL"); # false
1744
1745     %old = $Collator->change(level => 2); # returns (level => 4).
1746
1747     $Collator->eq("perl", "PERL"); # true
1748
1749     $Collator->change(%old); # returns (level => 2).
1750
1751     $Collator->eq("perl", "PERL"); # false
1752
1753 Not all C<(key,value)>s are allowed to be changed.
1754 See also C<@Unicode::Collate::ChangeOK> and C<@Unicode::Collate::ChangeNG>.
1755
1756 In the scalar context, returns the modified collator
1757 (but it is B<not> a clone from the original).
1758
1759     $Collator->change(level => 2)->eq("perl", "PERL"); # true
1760
1761     $Collator->eq("perl", "PERL"); # true; now max level is 2nd.
1762
1763     $Collator->change(level => 4)->eq("perl", "PERL"); # false
1764
1765 =item C<$version = $Collator-E<gt>version()>
1766
1767 Returns the version number (a string) of the Unicode Standard
1768 which the C<table> file used by the collator object is based on.
1769 If the table does not include a version line (starting with C<@version>),
1770 returns C<"unknown">.
1771
1772 =item C<UCA_Version()>
1773
1774 Returns the tracking version number of UTS #10 this module consults.
1775
1776 =item C<Base_Unicode_Version()>
1777
1778 Returns the version number of UTS #10 this module consults.
1779
1780 =back
1781
1782 =head1 EXPORT
1783
1784 No method will be exported.
1785
1786 =head1 INSTALL
1787
1788 Though this module can be used without any C<table> file,
1789 to use this module easily, it is recommended to install a table file
1790 in the UCA format, by copying it under the directory
1791 <a place in @INC>/Unicode/Collate.
1792
1793 The most preferable one is "The Default Unicode Collation Element Table",
1794 available from the Unicode Consortium's website:
1795
1796    http://www.unicode.org/Public/UCA/
1797
1798    http://www.unicode.org/Public/UCA/latest/allkeys.txt (latest version)
1799
1800 If DUCET is not installed, it is recommended to copy the file
1801 from http://www.unicode.org/Public/UCA/latest/allkeys.txt
1802 to <a place in @INC>/Unicode/Collate/allkeys.txt
1803 manually.
1804
1805 =head1 CAVEATS
1806
1807 =over 4
1808
1809 =item Normalization
1810
1811 Use of the C<normalization> parameter requires the B<Unicode::Normalize>
1812 module (see L<Unicode::Normalize>).
1813
1814 If you need not it (say, in the case when you need not
1815 handle any combining characters),
1816 assign C<normalization =E<gt> undef> explicitly.
1817
1818 -- see 6.5 Avoiding Normalization, UTS #10.
1819
1820 =item Conformance Test
1821
1822 The Conformance Test for the UCA is available
1823 under L<http://www.unicode.org/Public/UCA/>.
1824
1825 For F<CollationTest_SHIFTED.txt>,
1826 a collator via C<Unicode::Collate-E<gt>new( )> should be used;
1827 for F<CollationTest_NON_IGNORABLE.txt>, a collator via
1828 C<Unicode::Collate-E<gt>new(variable =E<gt> "non-ignorable", level =E<gt> 3)>.
1829
1830 B<Unicode::Normalize is required to try The Conformance Test.>
1831
1832 =back
1833
1834 =head1 AUTHOR, COPYRIGHT AND LICENSE
1835
1836 The Unicode::Collate module for perl was written by SADAHIRO Tomoyuki,
1837 <SADAHIRO@cpan.org>. This module is Copyright(C) 2001-2005,
1838 SADAHIRO Tomoyuki. Japan. All rights reserved.
1839
1840 This module is free software; you can redistribute it and/or
1841 modify it under the same terms as Perl itself.
1842
1843 The file Unicode/Collate/allkeys.txt was copied directly
1844 from http://www.unicode.org/Public/UCA/4.1.0/allkeys.txt (aka DUCET).
1845 This file is Copyright (c) 1991-2005 Unicode, Inc. All rights reserved.
1846 Distributed under the Terms of Use in http://www.unicode.org/copyright.html
1847
1848 =head1 SEE ALSO
1849
1850 =over 4
1851
1852 =item Unicode Collation Algorithm - UTS #10
1853
1854 L<http://www.unicode.org/reports/tr10/>
1855
1856 =item The Default Unicode Collation Element Table (DUCET)
1857
1858 L<http://www.unicode.org/Public/UCA/latest/allkeys.txt>
1859
1860 =item The conformance test for the UCA
1861
1862 L<http://www.unicode.org/Public/UCA/latest/CollationTest.html>
1863
1864 L<http://www.unicode.org/Public/UCA/latest/CollationTest.zip>
1865
1866 =item Hangul Syllable Type
1867
1868 L<http://www.unicode.org/Public/UNIDATA/HangulSyllableType.txt>
1869
1870 =item Unicode Normalization Forms - UAX #15
1871
1872 L<http://www.unicode.org/reports/tr15/>
1873
1874 =back
1875
1876 =cut