lib/Unicode/Collate.pm

   1 package Unicode::Collate;
   2
   3 BEGIN {
   4     unless ("A" eq pack('U', 0x41)) {
   5         die "Unicode::Collate cannot stringify a Unicode code point\n";
   6     }
   7 }
   8
   9 use 5.006;
  10 use strict;
  11 use warnings;
  12 use Carp;
  13 use File::Spec;
  14
  15 no warnings 'utf8';
  16
  17 our $VERSION = '0.50';
  18 our $PACKAGE = __PACKAGE__;
  19
  20 my @Path = qw(Unicode Collate);
  21 my $KeyFile = "allkeys.txt";
  22
  23 # Perl's boolean
  24 use constant TRUE  => 1;
  25 use constant FALSE => "";
  26 use constant NOMATCHPOS => -1;
  27
  28 # A coderef to get combining class imported from Unicode::Normalize
  29 # (i.e. \&Unicode::Normalize::getCombinClass).
  30 # This is also used as a HAS_UNICODE_NORMALIZE flag.
  31 my $CVgetCombinClass;
  32
  33 # Supported Levels
  34 use constant MinLevel => 1;
  35 use constant MaxLevel => 4;
  36
  37 # Minimum weights at level 2 and 3, respectively
  38 use constant Min2Wt => 0x20;
  39 use constant Min3Wt => 0x02;
  40
  41 # Shifted weight at 4th level
  42 use constant Shift4Wt => 0xFFFF;
  43
  44 # A boolean for Variable and 16-bit weights at 4 levels of Collation Element
  45 # PROBLEM: The Default Unicode Collation Element Table
  46 # has weights over 0xFFFF at the 4th level.
  47 # The tie-breaking in the variable weights
  48 # other than "shift" (as well as "shift-trimmed") is unreliable.
  49 use constant VCE_TEMPLATE => 'Cn4';
  50
  51 # A sort key: 16-bit weights
  52 # See also the PROBLEM on VCE_TEMPLATE above.
  53 use constant KEY_TEMPLATE => 'n*';
  54
  55 # Level separator in a sort key:
  56 # i.e. pack(KEY_TEMPLATE, 0)
  57 use constant LEVEL_SEP => "\0\0";
  58
  59 # As Unicode code point separator for hash keys.
  60 # A joined code point string (denoted by JCPS below)
  61 # like "65;768" is used for internal processing
  62 # instead of Perl's Unicode string like "\x41\x{300}",
  63 # as the native code point is different from the Unicode code point
  64 # on EBCDIC platform.
  65 # This character must not be included in any stringified
  66 # representation of an integer.
  67 use constant CODE_SEP => ';';
  68
  69 # boolean values of variable weights
  70 use constant NON_VAR => 0; # Non-Variable character
  71 use constant VAR     => 1; # Variable character
  72
  73 # specific code points
  74 use constant Hangul_LBase  => 0x1100;
  75 use constant Hangul_LIni   => 0x1100;
  76 use constant Hangul_LFin   => 0x1159;
  77 use constant Hangul_LFill  => 0x115F;
  78 use constant Hangul_VBase  => 0x1161;
  79 use constant Hangul_VIni   => 0x1160; # from Vowel Filler
  80 use constant Hangul_VFin   => 0x11A2;
  81 use constant Hangul_TBase  => 0x11A7; # from "no-final" codepoint
  82 use constant Hangul_TIni   => 0x11A8;
  83 use constant Hangul_TFin   => 0x11F9;
  84 use constant Hangul_TCount => 28;
  85 use constant Hangul_NCount => 588;
  86 use constant Hangul_SBase  => 0xAC00;
  87 use constant Hangul_SIni   => 0xAC00;
  88 use constant Hangul_SFin   => 0xD7A3;
  89 use constant CJK_UidIni    => 0x4E00;
  90 use constant CJK_UidFin    => 0x9FA5;
  91 use constant CJK_UidF41    => 0x9FBB;
  92 use constant CJK_ExtAIni   => 0x3400;
  93 use constant CJK_ExtAFin   => 0x4DB5;
  94 use constant CJK_ExtBIni   => 0x20000;
  95 use constant CJK_ExtBFin   => 0x2A6D6;
  96 use constant BMP_Max       => 0xFFFF;
  97
  98 # Logical_Order_Exception in PropList.txt
  99 my $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
 100
 101 sub UCA_Version { "14" }
 102
 103 sub Base_Unicode_Version { "4.1.0" }
 104
 105 ######
 106
 107 sub pack_U {
 108     return pack('U*', @_);
 109 }
 110
 111 sub unpack_U {
 112     return unpack('U*', shift(@_).pack('U*'));
 113 }
 114
 115 ######
 116
 117 my (%VariableOK);
 118 @VariableOK{ qw/
 119     blanked  non-ignorable  shifted  shift-trimmed
 120   / } = (); # keys lowercased
 121
 122 our @ChangeOK = qw/
 123     alternate backwards level normalization rearrange
 124     katakana_before_hiragana upper_before_lower
 125     overrideHangul overrideCJK preprocess UCA_Version
 126     hangul_terminator variable
 127   /;
 128
 129 our @ChangeNG = qw/
 130     entry mapping table maxlength
 131     ignoreChar ignoreName undefChar undefName variableTable
 132     versionTable alternateTable backwardsTable forwardsTable rearrangeTable
 133     derivCode normCode rearrangeHash
 134     backwardsFlag
 135   /;
 136 # The hash key 'ignored' is deleted at v 0.21.
 137 # The hash key 'isShift' is deleted at v 0.23.
 138 # The hash key 'combining' is deleted at v 0.24.
 139 # The hash key 'entries' is deleted at v 0.30.
 140 # The hash key 'L3_ignorable' is deleted at v 0.40.
 141
 142 sub version {
 143     my $self = shift;
 144     return $self->{versionTable} || 'unknown';
 145 }
 146
 147 my (%ChangeOK, %ChangeNG);
 148 @ChangeOK{ @ChangeOK } = ();
 149 @ChangeNG{ @ChangeNG } = ();
 150
 151 sub change {
 152     my $self = shift;
 153     my %hash = @_;
 154     my %old;
 155     if (exists $hash{variable} && exists $hash{alternate}) {
 156         delete $hash{alternate};
 157     }
 158     elsif (!exists $hash{variable} && exists $hash{alternate}) {
 159         $hash{variable} = $hash{alternate};
 160     }
 161     foreach my $k (keys %hash) {
 162         if (exists $ChangeOK{$k}) {
 163             $old{$k} = $self->{$k};
 164             $self->{$k} = $hash{$k};
 165         }
 166         elsif (exists $ChangeNG{$k}) {
 167             croak "change of $k via change() is not allowed!";
 168         }
 169         # else => ignored
 170     }
 171     $self->checkCollator();
 172     return wantarray ? %old : $self;
 173 }
 174
 175 sub _checkLevel {
 176     my $level = shift;
 177     my $key   = shift; # 'level' or 'backwards'
 178     MinLevel <= $level or croak sprintf
 179         "Illegal level %d (in value for key '%s') lower than %d.",
 180             $level, $key, MinLevel;
 181     $level <= MaxLevel or croak sprintf
 182         "Unsupported level %d (in value for key '%s') higher than %d.",
 183             $level, $key, MaxLevel;
 184 }
 185
 186 my %DerivCode = (
 187     8 => \&_derivCE_8,
 188     9 => \&_derivCE_9,
 189    11 => \&_derivCE_9, # 11 == 9
 190    14 => \&_derivCE_14,
 191 );
 192
 193 sub checkCollator {
 194     my $self = shift;
 195     _checkLevel($self->{level}, "level");
 196
 197     $self->{derivCode} = $DerivCode{ $self->{UCA_Version} }
 198         or croak "Illegal UCA version (passed $self->{UCA_Version}).";
 199
 200     $self->{variable} ||= $self->{alternate} || $self->{variableTable} ||
 201                                 $self->{alternateTable} || 'shifted';
 202     $self->{variable} = $self->{alternate} = lc($self->{variable});
 203     exists $VariableOK{ $self->{variable} }
 204         or croak "$PACKAGE unknown variable parameter name: $self->{variable}";
 205
 206     if (! defined $self->{backwards}) {
 207         $self->{backwardsFlag} = 0;
 208     }
 209     elsif (! ref $self->{backwards}) {
 210         _checkLevel($self->{backwards}, "backwards");
 211         $self->{backwardsFlag} = 1 << $self->{backwards};
 212     }
 213     else {
 214         my %level;
 215         $self->{backwardsFlag} = 0;
 216         for my $b (@{ $self->{backwards} }) {
 217             _checkLevel($b, "backwards");
 218             $level{$b} = 1;
 219         }
 220         for my $v (sort keys %level) {
 221             $self->{backwardsFlag} += 1 << $v;
 222         }
 223     }
 224
 225     defined $self->{rearrange} or $self->{rearrange} = [];
 226     ref $self->{rearrange}
 227         or croak "$PACKAGE: list for rearrangement must be store in ARRAYREF";
 228
 229     # keys of $self->{rearrangeHash} are $self->{rearrange}.
 230     $self->{rearrangeHash} = undef;
 231
 232     if (@{ $self->{rearrange} }) {
 233         @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
 234     }
 235
 236     $self->{normCode} = undef;
 237
 238     if (defined $self->{normalization}) {
 239         eval { require Unicode::Normalize };
 240         $@ and croak "Unicode::Normalize is required to normalize strings";
 241
 242         $CVgetCombinClass ||= \&Unicode::Normalize::getCombinClass;
 243
 244         if ($self->{normalization} =~ /^(?:NF)D\z/) { # tweak for default
 245             $self->{normCode} = \&Unicode::Normalize::NFD;
 246         }
 247         elsif ($self->{normalization} ne 'prenormalized') {
 248             my $norm = $self->{normalization};
 249             $self->{normCode} = sub {
 250                 Unicode::Normalize::normalize($norm, shift);
 251             };
 252             eval { $self->{normCode}->("") }; # try
 253             $@ and croak "$PACKAGE unknown normalization form name: $norm";
 254         }
 255     }
 256     return;
 257 }
 258
 259 sub new
 260 {
 261     my $class = shift;
 262     my $self = bless { @_ }, $class;
 263
 264     # If undef is passed explicitly, no file is read.
 265     $self->{table} = $KeyFile if ! exists $self->{table};
 266     $self->read_table() if defined $self->{table};
 267
 268     if ($self->{entry}) {
 269         while ($self->{entry} =~ /([^\n]+)/g) {
 270             $self->parseEntry($1);
 271         }
 272     }
 273
 274     $self->{level} ||= MaxLevel;
 275     $self->{UCA_Version} ||= UCA_Version();
 276
 277     $self->{overrideHangul} = FALSE
 278         if ! exists $self->{overrideHangul};
 279     $self->{overrideCJK} = FALSE
 280         if ! exists $self->{overrideCJK};
 281     $self->{normalization} = 'NFD'
 282         if ! exists $self->{normalization};
 283     $self->{rearrange} = $self->{rearrangeTable} ||
 284         ($self->{UCA_Version} <= 11 ? $DefaultRearrange : [])
 285         if ! exists $self->{rearrange};
 286     $self->{backwards} = $self->{backwardsTable}
 287         if ! exists $self->{backwards};
 288
 289     $self->checkCollator();
 290
 291     return $self;
 292 }
 293
 294 sub read_table {
 295     my $self = shift;
 296
 297     my($f, $fh);
 298     foreach my $d (@INC) {
 299         $f = File::Spec->catfile($d, @Path, $self->{table});
 300         last if open($fh, $f);
 301         $f = undef;
 302     }
 303     defined $f
 304         or croak "$PACKAGE: $self->{table} is not found in @INC";
 305
 306     while (<$fh>) {
 307         next if /^\s*#/;
 308         unless (s/^\s*\@//) {
 309             $self->parseEntry($_);
 310             next;
 311         }
 312
 313         if (/^version\s*(\S*)/) {
 314             $self->{versionTable} ||= $1;
 315         }
 316         elsif (/^variable\s+(\S*)/) { # since UTS #10-9
 317             $self->{variableTable} ||= $1;
 318         }
 319         elsif (/^alternate\s+(\S*)/) { # till UTS #10-8
 320             $self->{alternateTable} ||= $1;
 321         }
 322         elsif (/^backwards\s+(\S*)/) {
 323             push @{ $self->{backwardsTable} }, $1;
 324         }
 325         elsif (/^forwards\s+(\S*)/) { # parhaps no use
 326             push @{ $self->{forwardsTable} }, $1;
 327         }
 328         elsif (/^rearrange\s+(.*)/) { # (\S*) is NG
 329             push @{ $self->{rearrangeTable} }, _getHexArray($1);
 330         }
 331     }
 332     close $fh;
 333 }
 334
 335
 336 ##
 337 ## get $line, parse it, and write an entry in $self
 338 ##
 339 sub parseEntry
 340 {
 341     my $self = shift;
 342     my $line = shift;
 343     my($name, $entry, @uv, @key);
 344
 345     return if $line !~ /^\s*[0-9A-Fa-f]/;
 346
 347     # removes comment and gets name
 348     $name = $1
 349         if $line =~ s/[#%]\s*(.*)//;
 350     return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
 351
 352     # gets element
 353     my($e, $k) = split /;/, $line;
 354     croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
 355         if ! $k;
 356
 357     @uv = _getHexArray($e);
 358     return if !@uv;
 359
 360     $entry = join(CODE_SEP, @uv); # in JCPS
 361
 362     if (defined $self->{undefChar} || defined $self->{ignoreChar}) {
 363         my $ele = pack_U(@uv);
 364
 365         # regarded as if it were not entried in the table
 366         return
 367             if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
 368
 369         # replaced as completely ignorable
 370         $k = '[.0000.0000.0000.0000]'
 371             if defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/;
 372     }
 373
 374     # replaced as completely ignorable
 375     $k = '[.0000.0000.0000.0000]'
 376         if defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/;
 377
 378     my $is_L3_ignorable = TRUE;
 379
 380     foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
 381         my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
 382         my @wt = _getHexArray($arr);
 383         push @key, pack(VCE_TEMPLATE, $var, @wt);
 384         $is_L3_ignorable = FALSE
 385             if $wt[0] || $wt[1] || $wt[2];
 386         # Conformance Test for 3.1.1 and 4.0.0 shows Level 3 ignorable
 387         # is completely ignorable.
 388         # For expansion, an entry $is_L3_ignorable
 389         # if and only if "all" CEs are [.0000.0000.0000].
 390     }
 391
 392     $self->{mapping}{$entry} = $is_L3_ignorable ? [] : \@key;
 393
 394     if (@uv > 1) {
 395         (!$self->{maxlength}{$uv[0]} || $self->{maxlength}{$uv[0]} < @uv)
 396             and $self->{maxlength}{$uv[0]} = @uv;
 397     }
 398 }
 399
 400
 401 ##
 402 ## VCE = _varCE(variable term, VCE)
 403 ##
 404 sub _varCE
 405 {
 406     my $vbl = shift;
 407     my $vce = shift;
 408     if ($vbl eq 'non-ignorable') {
 409         return $vce;
 410     }
 411     my ($var, @wt) = unpack VCE_TEMPLATE, $vce;
 412
 413     if ($var) {
 414         return pack(VCE_TEMPLATE, $var, 0, 0, 0,
 415                 $vbl eq 'blanked' ? $wt[3] : $wt[0]);
 416     }
 417     elsif ($vbl eq 'blanked') {
 418         return $vce;
 419     }
 420     else {
 421         return pack(VCE_TEMPLATE, $var, @wt[0..2],
 422             $vbl eq 'shifted' && $wt[0]+$wt[1]+$wt[2] ? Shift4Wt : 0);
 423     }
 424 }
 425
 426 sub viewSortKey
 427 {
 428     my $self = shift;
 429     $self->visualizeSortKey($self->getSortKey(@_));
 430 }
 431
 432 sub visualizeSortKey
 433 {
 434     my $self = shift;
 435     my $view = join " ", map sprintf("%04X", $_), unpack(KEY_TEMPLATE, shift);
 436
 437     if ($self->{UCA_Version} <= 8) {
 438         $view =~ s/ ?0000 ?/|/g;
 439     } else {
 440         $view =~ s/\b0000\b/|/g;
 441     }
 442     return "[$view]";
 443 }
 444
 445
 446 ##
 447 ## arrayref of JCPS   = splitEnt(string to be collated)
 448 ## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitEnt(string, true)
 449 ##
 450 sub splitEnt
 451 {
 452     my $self = shift;
 453     my $wLen = $_[1];
 454
 455     my $code = $self->{preprocess};
 456     my $norm = $self->{normCode};
 457     my $map  = $self->{mapping};
 458     my $max  = $self->{maxlength};
 459     my $reH  = $self->{rearrangeHash};
 460     my $ver9 = $self->{UCA_Version} >= 9 && $self->{UCA_Version} <= 11;
 461
 462     my ($str, @buf);
 463
 464     if ($wLen) {
 465         $code and croak "Preprocess breaks character positions. "
 466                         . "Don't use with index(), match(), etc.";
 467         $norm and croak "Normalization breaks character positions. "
 468                         . "Don't use with index(), match(), etc.";
 469         $str = $_[0];
 470     }
 471     else {
 472         $str = $_[0];
 473         $str = &$code($str) if ref $code;
 474         $str = &$norm($str) if ref $norm;
 475     }
 476
 477     # get array of Unicode code point of string.
 478     my @src = unpack_U($str);
 479
 480     # rearrangement:
 481     # Character positions are not kept if rearranged,
 482     # then neglected if $wLen is true.
 483     if ($reH && ! $wLen) {
 484         for (my $i = 0; $i < @src; $i++) {
 485             if (exists $reH->{ $src[$i] } && $i + 1 < @src) {
 486                 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]);
 487                 $i++;
 488             }
 489         }
 490     }
 491
 492     # remove a code point marked as a completely ignorable.
 493     for (my $i = 0; $i < @src; $i++) {
 494         $src[$i] = undef
 495             if _isIllegal($src[$i]) || ($ver9 &&
 496                 $map->{ $src[$i] } && @{ $map->{ $src[$i] } } == 0);
 497     }
 498
 499     for (my $i = 0; $i < @src; $i++) {
 500         my $jcps = $src[$i];
 501
 502         # skip removed code point
 503         if (! defined $jcps) {
 504             if ($wLen && @buf) {
 505                 $buf[-1][2] = $i + 1;
 506             }
 507             next;
 508         }
 509
 510         my $i_orig = $i;
 511
 512         # find contraction
 513         if ($max->{$jcps}) {
 514             my $temp_jcps = $jcps;
 515             my $jcpsLen = 1;
 516             my $maxLen = $max->{$jcps};
 517
 518             for (my $p = $i + 1; $jcpsLen < $maxLen && $p < @src; $p++) {
 519                 next if ! defined $src[$p];
 520                 $temp_jcps .= CODE_SEP . $src[$p];
 521                 $jcpsLen++;
 522                 if ($map->{$temp_jcps}) {
 523                     $jcps = $temp_jcps;
 524                     $i = $p;
 525                 }
 526             }
 527
 528         # not-contiguous contraction with Combining Char (cf. UTS#10, S2.1).
 529         # This process requires Unicode::Normalize.
 530         # If "normalization" is undef, here should be skipped *always*
 531         # (in spite of bool value of $CVgetCombinClass),
 532         # since canonical ordering cannot be expected.
 533         # Blocked combining character should not be contracted.
 534
 535             if ($self->{normalization})
 536             # $self->{normCode} is false in the case of "prenormalized".
 537             {
 538                 my $preCC = 0;
 539                 my $curCC = 0;
 540
 541                 for (my $p = $i + 1; $p < @src; $p++) {
 542                     next if ! defined $src[$p];
 543                     $curCC = $CVgetCombinClass->($src[$p]);
 544                     last unless $curCC;
 545                     my $tail = CODE_SEP . $src[$p];
 546                     if ($preCC != $curCC && $map->{$jcps.$tail}) {
 547                         $jcps .= $tail;
 548                         $src[$p] = undef;
 549                     } else {
 550                         $preCC = $curCC;
 551                     }
 552                 }
 553             }
 554         }
 555
 556         # skip completely ignorable
 557         if ($map->{$jcps} && @{ $map->{$jcps} } == 0) {
 558             if ($wLen && @buf) {
 559                 $buf[-1][2] = $i + 1;
 560             }
 561             next;
 562         }
 563
 564         push @buf, $wLen ? [$jcps, $i_orig, $i + 1] : $jcps;
 565     }
 566     return \@buf;
 567 }
 568
 569
 570 ##
 571 ## list of VCE = getWt(JCPS)
 572 ##
 573 sub getWt
 574 {
 575     my $self = shift;
 576     my $u    = shift;
 577     my $vbl  = $self->{variable};
 578     my $map  = $self->{mapping};
 579     my $der  = $self->{derivCode};
 580
 581     return if !defined $u;
 582     return map(_varCE($vbl, $_), @{ $map->{$u} })
 583         if $map->{$u};
 584
 585     # JCPS must not be a contraction, then it's a code point.
 586     if (Hangul_SIni <= $u && $u <= Hangul_SFin) {
 587         my $hang = $self->{overrideHangul};
 588         my @hangulCE;
 589         if ($hang) {
 590             @hangulCE = map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$hang($u));
 591         }
 592         elsif (!defined $hang) {
 593             @hangulCE = $der->($u);
 594         }
 595         else {
 596             my $max  = $self->{maxlength};
 597             my @decH = _decompHangul($u);
 598
 599             if (@decH == 2) {
 600                 my $contract = join(CODE_SEP, @decH);
 601                 @decH = ($contract) if $map->{$contract};
 602             } else { # must be <@decH == 3>
 603                 if ($max->{$decH[0]}) {
 604                     my $contract = join(CODE_SEP, @decH);
 605                     if ($map->{$contract}) {
 606                         @decH = ($contract);
 607                     } else {
 608                         $contract = join(CODE_SEP, @decH[0,1]);
 609                         $map->{$contract} and @decH = ($contract, $decH[2]);
 610                     }
 611                     # even if V's ignorable, LT contraction is not supported.
 612                     # If such a situatution were required, NFD should be used.
 613                 }
 614                 if (@decH == 3 && $max->{$decH[1]}) {
 615                     my $contract = join(CODE_SEP, @decH[1,2]);
 616                     $map->{$contract} and @decH = ($decH[0], $contract);
 617                 }
 618             }
 619
 620             @hangulCE = map({
 621                     $map->{$_} ? @{ $map->{$_} } : $der->($_);
 622                 } @decH);
 623         }
 624         return map _varCE($vbl, $_), @hangulCE;
 625     }
 626     elsif (_isUIdeo($u, $self->{UCA_Version})) {
 627         my $cjk  = $self->{overrideCJK};
 628         return map _varCE($vbl, $_),
 629             $cjk
 630                 ? map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$cjk($u))
 631                 : defined $cjk && $self->{UCA_Version} <= 8 && $u < 0x10000
 632                     ? _uideoCE_8($u)
 633                     : $der->($u);
 634     }
 635     else {
 636         return map _varCE($vbl, $_), $der->($u);
 637     }
 638 }
 639
 640
 641 ##
 642 ## string sortkey = getSortKey(string arg)
 643 ##
 644 sub getSortKey
 645 {
 646     my $self = shift;
 647     my $lev  = $self->{level};
 648     my $rEnt = $self->splitEnt(shift); # get an arrayref of JCPS
 649     my $v2i  = $self->{UCA_Version} >= 9 &&
 650                 $self->{variable} ne 'non-ignorable';
 651
 652     my @buf; # weight arrays
 653     if ($self->{hangul_terminator}) {
 654         my $preHST = '';
 655         foreach my $jcps (@$rEnt) {
 656             # weird things like VL, TL-contraction are not considered!
 657             my $curHST = '';
 658             foreach my $u (split /;/, $jcps) {
 659                 $curHST .= getHST($u);
 660             }
 661             if ($preHST && !$curHST || # hangul before non-hangul
 662                 $preHST =~ /L\z/ && $curHST =~ /^T/ ||
 663                 $preHST =~ /V\z/ && $curHST =~ /^L/ ||
 664                 $preHST =~ /T\z/ && $curHST =~ /^[LV]/) {
 665
 666                 push @buf, $self->getWtHangulTerm();
 667             }
 668             $preHST = $curHST;
 669
 670             push @buf, $self->getWt($jcps);
 671         }
 672         $preHST # end at hangul
 673             and push @buf, $self->getWtHangulTerm();
 674     }
 675     else {
 676         foreach my $jcps (@$rEnt) {
 677             push @buf, $self->getWt($jcps);
 678         }
 679     }
 680
 681     # make sort key
 682     my @ret = ([],[],[],[]);
 683     my $last_is_variable;
 684
 685     foreach my $vwt (@buf) {
 686         my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
 687
 688         # "Ignorable (L1, L2) after Variable" since track. v. 9
 689         if ($v2i) {
 690             if ($var) {
 691                 $last_is_variable = TRUE;
 692             }
 693             elsif (!$wt[0]) { # ignorable
 694                 next if $last_is_variable;
 695             }
 696             else {
 697                 $last_is_variable = FALSE;
 698             }
 699         }
 700         foreach my $v (0..$lev-1) {
 701             0 < $wt[$v] and push @{ $ret[$v] }, $wt[$v];
 702         }
 703     }
 704
 705     # modification of tertiary weights
 706     if ($self->{upper_before_lower}) {
 707         foreach (@{ $ret[2] }) {
 708             if    (0x8 <= $_ && $_ <= 0xC) { $_ -= 6 } # lower
 709             elsif (0x2 <= $_ && $_ <= 0x6) { $_ += 6 } # upper
 710             elsif ($_ == 0x1C)             { $_ += 1 } # square upper
 711             elsif ($_ == 0x1D)             { $_ -= 1 } # square lower
 712         }
 713     }
 714     if ($self->{katakana_before_hiragana}) {
 715         foreach (@{ $ret[2] }) {
 716             if    (0x0F <= $_ && $_ <= 0x13) { $_ -= 2 } # katakana
 717             elsif (0x0D <= $_ && $_ <= 0x0E) { $_ += 5 } # hiragana
 718         }
 719     }
 720
 721     if ($self->{backwardsFlag}) {
 722         for (my $v = MinLevel; $v <= MaxLevel; $v++) {
 723             if ($self->{backwardsFlag} & (1 << $v)) {
 724                 @{ $ret[$v-1] } = reverse @{ $ret[$v-1] };
 725             }
 726         }
 727     }
 728
 729     join LEVEL_SEP, map pack(KEY_TEMPLATE, @$_), @ret;
 730 }
 731
 732
 733 ##
 734 ## int compare = cmp(string a, string b)
 735 ##
 736 sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
 737 sub eq  { $_[0]->getSortKey($_[1]) eq  $_[0]->getSortKey($_[2]) }
 738 sub ne  { $_[0]->getSortKey($_[1]) ne  $_[0]->getSortKey($_[2]) }
 739 sub lt  { $_[0]->getSortKey($_[1]) lt  $_[0]->getSortKey($_[2]) }
 740 sub le  { $_[0]->getSortKey($_[1]) le  $_[0]->getSortKey($_[2]) }
 741 sub gt  { $_[0]->getSortKey($_[1]) gt  $_[0]->getSortKey($_[2]) }
 742 sub ge  { $_[0]->getSortKey($_[1]) ge  $_[0]->getSortKey($_[2]) }
 743
 744 ##
 745 ## list[strings] sorted = sort(list[strings] arg)
 746 ##
 747 sub sort {
 748     my $obj = shift;
 749     return
 750         map { $_->[1] }
 751             sort{ $a->[0] cmp $b->[0] }
 752                 map [ $obj->getSortKey($_), $_ ], @_;
 753 }
 754
 755
 756 sub _derivCE_14 {
 757     my $u = shift;
 758     my $base =
 759         (CJK_UidIni  <= $u && $u <= CJK_UidF41)
 760             ? 0xFB40 : # CJK
 761         (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
 762          CJK_ExtBIni <= $u && $u <= CJK_ExtBFin)
 763             ? 0xFB80   # CJK ext.
 764             : 0xFBC0;  # others
 765
 766     my $aaaa = $base + ($u >> 15);
 767     my $bbbb = ($u & 0x7FFF) | 0x8000;
 768     return
 769         pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
 770         pack(VCE_TEMPLATE, NON_VAR, $bbbb,      0,      0, $u);
 771 }
 772
 773 sub _derivCE_9 {
 774     my $u = shift;
 775     my $base =
 776         (CJK_UidIni  <= $u && $u <= CJK_UidFin)
 777             ? 0xFB40 : # CJK
 778         (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
 779          CJK_ExtBIni <= $u && $u <= CJK_ExtBFin)
 780             ? 0xFB80   # CJK ext.
 781             : 0xFBC0;  # others
 782
 783     my $aaaa = $base + ($u >> 15);
 784     my $bbbb = ($u & 0x7FFF) | 0x8000;
 785     return
 786         pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
 787         pack(VCE_TEMPLATE, NON_VAR, $bbbb,      0,      0, $u);
 788 }
 789
 790 sub _derivCE_8 {
 791     my $code = shift;
 792     my $aaaa =  0xFF80 + ($code >> 15);
 793     my $bbbb = ($code & 0x7FFF) | 0x8000;
 794     return
 795         pack(VCE_TEMPLATE, NON_VAR, $aaaa, 2, 1, $code),
 796         pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $code);
 797 }
 798
 799 sub _uideoCE_8 {
 800     my $u = shift;
 801     return pack(VCE_TEMPLATE, NON_VAR, $u, Min2Wt, Min3Wt, $u);
 802 }
 803
 804 sub _isUIdeo {
 805     my ($u, $uca_vers) = @_;
 806     return(
 807         (CJK_UidIni <= $u &&
 808             ($uca_vers >= 14 ? ( $u <= CJK_UidF41) : ($u <= CJK_UidFin)))
 809                 ||
 810         (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin)
 811                 ||
 812         (CJK_ExtBIni <= $u && $u <= CJK_ExtBFin)
 813     );
 814 }
 815
 816
 817 sub getWtHangulTerm {
 818     my $self = shift;
 819     return _varCE($self->{variable},
 820         pack(VCE_TEMPLATE, NON_VAR, $self->{hangul_terminator}, 0,0,0));
 821 }
 822
 823
 824 ##
 825 ## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
 826 ##
 827 sub _getHexArray { map hex, $_[0] =~ /([0-9a-fA-F]+)/g }
 828
 829 #
 830 # $code *must* be in Hangul syllable.
 831 # Check it before you enter here.
 832 #
 833 sub _decompHangul {
 834     my $code = shift;
 835     my $si = $code - Hangul_SBase;
 836     my $li = int( $si / Hangul_NCount);
 837     my $vi = int(($si % Hangul_NCount) / Hangul_TCount);
 838     my $ti =      $si % Hangul_TCount;
 839     return (
 840         Hangul_LBase + $li,
 841         Hangul_VBase + $vi,
 842         $ti ? (Hangul_TBase + $ti) : (),
 843     );
 844 }
 845
 846 sub _isIllegal {
 847     my $code = shift;
 848     return ! defined $code                      # removed
 849         || ($code < 0 || 0x10FFFF < $code)      # out of range
 850         || (($code & 0xFFFE) == 0xFFFE)         # ??FFF[EF] (cf. utf8.c)
 851         || (0xD800 <= $code && $code <= 0xDFFF) # unpaired surrogates
 852         || (0xFDD0 <= $code && $code <= 0xFDEF) # other non-characters
 853     ;
 854 }
 855
 856 # Hangul Syllable Type
 857 sub getHST {
 858     my $u = shift;
 859     return
 860         Hangul_LIni <= $u && $u <= Hangul_LFin || $u == Hangul_LFill ? "L" :
 861         Hangul_VIni <= $u && $u <= Hangul_VFin       ? "V" :
 862         Hangul_TIni <= $u && $u <= Hangul_TFin       ? "T" :
 863         Hangul_SIni <= $u && $u <= Hangul_SFin ?
 864             ($u - Hangul_SBase) % Hangul_TCount ? "LVT" : "LV" : "";
 865 }
 866
 867
 868 ##
 869 ## bool _nonIgnorAtLevel(arrayref weights, int level)
 870 ##
 871 sub _nonIgnorAtLevel($$)
 872 {
 873     my $wt = shift;
 874     return if ! defined $wt;
 875     my $lv = shift;
 876     return grep($wt->[$_-1] != 0, MinLevel..$lv) ? TRUE : FALSE;
 877 }
 878
 879 ##
 880 ## bool _eqArray(
 881 ##    arrayref of arrayref[weights] source,
 882 ##    arrayref of arrayref[weights] substr,
 883 ##    int level)
 884 ## * comparison of graphemes vs graphemes.
 885 ##   @$source >= @$substr must be true (check it before call this);
 886 ##
 887 sub _eqArray($$$)
 888 {
 889     my $source = shift;
 890     my $substr = shift;
 891     my $lev = shift;
 892
 893     for my $g (0..@$substr-1){
 894         # Do the $g'th graphemes have the same number of AV weigths?
 895         return if @{ $source->[$g] } != @{ $substr->[$g] };
 896
 897         for my $w (0..@{ $substr->[$g] }-1) {
 898             for my $v (0..$lev-1) {
 899                 return if $source->[$g][$w][$v] != $substr->[$g][$w][$v];
 900             }
 901         }
 902     }
 903     return 1;
 904 }
 905
 906 ##
 907 ## (int position, int length)
 908 ## int position = index(string, substring, position, [undoc'ed grobal])
 909 ##
 910 ## With "grobal" (only for the list context),
 911 ##  returns list of arrayref[position, length].
 912 ##
 913 sub index
 914 {
 915     my $self = shift;
 916     my $str  = shift;
 917     my $len  = length($str);
 918     my $subE = $self->splitEnt(shift);
 919     my $pos  = @_ ? shift : 0;
 920        $pos  = 0 if $pos < 0;
 921     my $grob = shift;
 922
 923     my $lev  = $self->{level};
 924     my $v2i  = $self->{UCA_Version} >= 9 &&
 925                 $self->{variable} ne 'non-ignorable';
 926
 927     if (! @$subE) {
 928         my $temp = $pos <= 0 ? 0 : $len <= $pos ? $len : $pos;
 929         return $grob
 930             ? map([$_, 0], $temp..$len)
 931             : wantarray ? ($temp,0) : $temp;
 932     }
 933     $len < $pos
 934         and return wantarray ? () : NOMATCHPOS;
 935     my $strE = $self->splitEnt($pos ? substr($str, $pos) : $str, TRUE);
 936     @$strE
 937         or return wantarray ? () : NOMATCHPOS;
 938
 939     my(@strWt, @iniPos, @finPos, @subWt, @g_ret);
 940
 941     my $last_is_variable;
 942     for my $vwt (map $self->getWt($_), @$subE) {
 943         my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
 944         my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);
 945
 946         # "Ignorable (L1, L2) after Variable" since track. v. 9
 947         if ($v2i) {
 948             if ($var) {
 949                 $last_is_variable = TRUE;
 950             }
 951             elsif (!$wt[0]) { # ignorable
 952                 $to_be_pushed = FALSE if $last_is_variable;
 953             }
 954             else {
 955                 $last_is_variable = FALSE;
 956             }
 957         }
 958
 959         if (@subWt && !$var && !$wt[0]) {
 960             push @{ $subWt[-1] }, \@wt if $to_be_pushed;
 961         } else {
 962             push @subWt, [ \@wt ];
 963         }
 964     }
 965
 966     my $count = 0;
 967     my $end = @$strE - 1;
 968
 969     $last_is_variable = FALSE; # reuse
 970     for (my $i = 0; $i <= $end; ) { # no $i++
 971         my $found_base = 0;
 972
 973         # fetch a grapheme
 974         while ($i <= $end && $found_base == 0) {
 975             for my $vwt ($self->getWt($strE->[$i][0])) {
 976                 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
 977                 my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);
 978
 979                 # "Ignorable (L1, L2) after Variable" since track. v. 9
 980                 if ($v2i) {
 981                     if ($var) {
 982                         $last_is_variable = TRUE;
 983                     }
 984                     elsif (!$wt[0]) { # ignorable
 985                         $to_be_pushed = FALSE if $last_is_variable;
 986                     }
 987                     else {
 988                         $last_is_variable = FALSE;
 989                     }
 990                 }
 991
 992                 if (@strWt && !$var && !$wt[0]) {
 993                     push @{ $strWt[-1] }, \@wt if $to_be_pushed;
 994                     $finPos[-1] = $strE->[$i][2];
 995                 } elsif ($to_be_pushed) {
 996                     push @strWt, [ \@wt ];
 997                     push @iniPos, $found_base ? NOMATCHPOS : $strE->[$i][1];
 998                     $finPos[-1] = NOMATCHPOS if $found_base;
 999                     push @finPos, $strE->[$i][2];
1000                     $found_base++;
1001                 }
1002                 # else ===> no-op
1003             }
1004             $i++;
1005         }
1006
1007         # try to match
1008         while ( @strWt > @subWt || (@strWt == @subWt && $i > $end) ) {
1009             if ($iniPos[0] != NOMATCHPOS &&
1010                     $finPos[$#subWt] != NOMATCHPOS &&
1011                         _eqArray(\@strWt, \@subWt, $lev)) {
1012                 my $temp = $iniPos[0] + $pos;
1013
1014                 if ($grob) {
1015                     push @g_ret, [$temp, $finPos[$#subWt] - $iniPos[0]];
1016                     splice @strWt,  0, $#subWt;
1017                     splice @iniPos, 0, $#subWt;
1018                     splice @finPos, 0, $#subWt;
1019                 }
1020                 else {
1021                     return wantarray
1022                         ? ($temp, $finPos[$#subWt] - $iniPos[0])
1023                         :  $temp;
1024                 }
1025             }
1026             shift @strWt;
1027             shift @iniPos;
1028             shift @finPos;
1029         }
1030     }
1031
1032     return $grob
1033         ? @g_ret
1034         : wantarray ? () : NOMATCHPOS;
1035 }
1036
1037 ##
1038 ## scalarref to matching part = match(string, substring)
1039 ##
1040 sub match
1041 {
1042     my $self = shift;
1043     if (my($pos,$len) = $self->index($_[0], $_[1])) {
1044         my $temp = substr($_[0], $pos, $len);
1045         return wantarray ? $temp : \$temp;
1046         # An lvalue ref \substr should be avoided,
1047         # since its value is affected by modification of its referent.
1048     }
1049     else {
1050         return;
1051     }
1052 }
1053
1054 ##
1055 ## arrayref matching parts = gmatch(string, substring)
1056 ##
1057 sub gmatch
1058 {
1059     my $self = shift;
1060     my $str  = shift;
1061     my $sub  = shift;
1062     return map substr($str, $_->[0], $_->[1]),
1063                 $self->index($str, $sub, 0, 'g');
1064 }
1065
1066 ##
1067 ## bool subst'ed = subst(string, substring, replace)
1068 ##
1069 sub subst
1070 {
1071     my $self = shift;
1072     my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
1073
1074     if (my($pos,$len) = $self->index($_[0], $_[1])) {
1075         if ($code) {
1076             my $mat = substr($_[0], $pos, $len);
1077             substr($_[0], $pos, $len, $code->($mat));
1078         } else {
1079             substr($_[0], $pos, $len, $_[2]);
1080         }
1081         return TRUE;
1082     }
1083     else {
1084         return FALSE;
1085     }
1086 }
1087
1088 ##
1089 ## int count = gsubst(string, substring, replace)
1090 ##
1091 sub gsubst
1092 {
1093     my $self = shift;
1094     my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
1095     my $cnt = 0;
1096
1097     # Replacement is carried out from the end, then use reverse.
1098     for my $pos_len (reverse $self->index($_[0], $_[1], 0, 'g')) {
1099         if ($code) {
1100             my $mat = substr($_[0], $pos_len->[0], $pos_len->[1]);
1101             substr($_[0], $pos_len->[0], $pos_len->[1], $code->($mat));
1102         } else {
1103             substr($_[0], $pos_len->[0], $pos_len->[1], $_[2]);
1104         }
1105         $cnt++;
1106     }
1107     return $cnt;
1108 }
1109
1110 1;
1111 __END__
1112
1113 =head1 NAME
1114
1115 Unicode::Collate - Unicode Collation Algorithm
1116
1117 =head1 SYNOPSIS
1118
1119   use Unicode::Collate;
1120
1121   #construct
1122   $Collator = Unicode::Collate->new(%tailoring);
1123
1124   #sort
1125   @sorted = $Collator->sort(@not_sorted);
1126
1127   #compare
1128   $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
1129
1130   # If %tailoring is false (i.e. empty),
1131   # $Collator should do the default collation.
1132
1133 =head1 DESCRIPTION
1134
1135 This module is an implementation of Unicode Technical Standard #10
1136 (a.k.a. UTS #10) - Unicode Collation Algorithm (a.k.a. UCA).
1137
1138 =head2 Constructor and Tailoring
1139
1140 The C<new> method returns a collator object.
1141
1142    $Collator = Unicode::Collate->new(
1143       UCA_Version => $UCA_Version,
1144       alternate => $alternate, # deprecated: use of 'variable' is recommended.
1145       backwards => $levelNumber, # or \@levelNumbers
1146       entry => $element,
1147       hangul_terminator => $term_primary_weight,
1148       ignoreName => qr/$ignoreName/,
1149       ignoreChar => qr/$ignoreChar/,
1150       katakana_before_hiragana => $bool,
1151       level => $collationLevel,
1152       normalization  => $normalization_form,
1153       overrideCJK => \&overrideCJK,
1154       overrideHangul => \&overrideHangul,
1155       preprocess => \&preprocess,
1156       rearrange => \@charList,
1157       table => $filename,
1158       undefName => qr/$undefName/,
1159       undefChar => qr/$undefChar/,
1160       upper_before_lower => $bool,
1161       variable => $variable,
1162    );
1163
1164 =over 4
1165
1166 =item UCA_Version
1167
1168 If the tracking version number of UCA is given,
1169 behavior of that tracking version is emulated on collating.
1170 If omitted, the return value of C<UCA_Version()> is used.
1171 C<UCA_Version()> should return the latest tracking version supported.
1172
1173 The supported tracking version: 8, 9, 11, or 14.
1174
1175      UCA tracking version          Unicode version
1176              8                          3.1
1177              9                 3.1 with Corrigendum 3
1178             11                          4.0
1179             14                         4.1.0
1180
1181 Note: Recent UTS #10 renames "Tracking Version" to "Revision."
1182
1183 =item alternate
1184
1185 -- see 3.2.2 Alternate Weighting, version 8 of UTS #10
1186
1187 For backward compatibility, C<alternate> (old name) can be used
1188 as an alias for C<variable>.
1189
1190 =item backwards
1191
1192 -- see 3.1.2 French Accents, UTS #10.
1193
1194      backwards => $levelNumber or \@levelNumbers
1195
1196 Weights in reverse order; ex. level 2 (diacritic ordering) in French.
1197 If omitted, forwards at all the levels.
1198
1199 =item entry
1200
1201 -- see 3.1 Linguistic Features; 3.2.1 File Format, UTS #10.
1202
1203 If the same character (or a sequence of characters) exists
1204 in the collation element table through C<table>,
1205 mapping to collation elements is overrided.
1206 If it does not exist, the mapping is defined additionally.
1207
1208     entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
1209 0063 0068 ; [.0E6A.0020.0002.0063] # ch
1210 0043 0068 ; [.0E6A.0020.0007.0043] # Ch
1211 0043 0048 ; [.0E6A.0020.0008.0043] # CH
1212 006C 006C ; [.0F4C.0020.0002.006C] # ll
1213 004C 006C ; [.0F4C.0020.0007.004C] # Ll
1214 004C 004C ; [.0F4C.0020.0008.004C] # LL
1215 00F1      ; [.0F7B.0020.0002.00F1] # n-tilde
1216 006E 0303 ; [.0F7B.0020.0002.00F1] # n-tilde
1217 00D1      ; [.0F7B.0020.0008.00D1] # N-tilde
1218 004E 0303 ; [.0F7B.0020.0008.00D1] # N-tilde
1219 ENTRY
1220
1221     entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
1222 00E6 ; [.0E33.0020.0002.00E6][.0E8B.0020.0002.00E6] # ae ligature as <a><e>
1223 00C6 ; [.0E33.0020.0008.00C6][.0E8B.0020.0008.00C6] # AE ligature as <A><E>
1224 ENTRY
1225
1226 B<NOTE:> The code point in the UCA file format (before C<';'>)
1227 B<must> be a Unicode code point (defined as hexadecimal),
1228 but not a native code point.
1229 So C<0063> must always denote C<U+0063>,
1230 but not a character of C<"\x63">.
1231
1232 Weighting may vary depending on collation element table.
1233 So ensure the weights defined in C<entry> will be consistent with
1234 those in the collation element table loaded via C<table>.
1235
1236 In DUCET v4.0.0, primary weight of C<C> is C<0E60>
1237 and that of C<D> is C<0E6D>. So setting primary weight of C<CH> to C<0E6A>
1238 (as a value between C<0E60> and C<0E6D>)
1239 makes ordering as C<C E<lt> CH E<lt> D>.
1240 Exactly speaking DUCET already has some characters between C<C> and C<D>:
1241 C<small capital C> (C<U+1D04>) with primary weight C<0E64>,
1242 C<c-hook/C-hook> (C<U+0188/U+0187>) with C<0E65>,
1243 and C<c-curl> (C<U+0255>) with C<0E69>.
1244 Then primary weight C<0E6A> for C<CH> makes C<CH>
1245 ordered between C<c-curl> and C<D>.
1246
1247 =item hangul_terminator
1248
1249 -- see 7.1.4 Trailing Weights, UTS #10.
1250
1251 If a true value is given (non-zero but should be positive),
1252 it will be added as a terminator primary weight to the end of
1253 every standard Hangul syllable. Secondary and any higher weights
1254 for terminator are set to zero.
1255 If the value is false or C<hangul_terminator> key does not exist,
1256 insertion of terminator weights will not be performed.
1257
1258 Boundaries of Hangul syllables are determined
1259 according to conjoining Jamo behavior in F<the Unicode Standard>
1260 and F<HangulSyllableType.txt>.
1261
1262 B<Implementation Note:>
1263 (1) For expansion mapping (Unicode character mapped
1264 to a sequence of collation elements), a terminator will not be added
1265 between collation elements, even if Hangul syllable boundary exists there.
1266 Addition of terminator is restricted to the next position
1267 to the last collation element.
1268
1269 (2) Non-conjoining Hangul letters
1270 (Compatibility Jamo, halfwidth Jamo, and enclosed letters) are not
1271 automatically terminated with a terminator primary weight.
1272 These characters may need terminator included in a collation element
1273 table beforehand.
1274
1275 =item ignoreChar
1276
1277 =item ignoreName
1278
1279 -- see 3.2.2 Variable Weighting, UTS #10.
1280
1281 Makes the entry in the table completely ignorable;
1282 i.e. as if the weights were zero at all level.
1283
1284 Through C<ignoreChar>, any character matching C<qr/$ignoreChar/>
1285 will be ignored. Through C<ignoreName>, any character whose name
1286 (given in the C<table> file as a comment) matches C<qr/$ignoreName/>
1287 will be ignored.
1288
1289 E.g. when 'a' and 'e' are ignorable,
1290 'element' is equal to 'lament' (or 'lmnt').
1291
1292 =item katakana_before_hiragana
1293
1294 -- see 7.3.1 Tertiary Weight Table, UTS #10.
1295
1296 By default, hiragana is before katakana.
1297 If the parameter is made true, this is reversed.
1298
1299 B<NOTE>: This parameter simplemindedly assumes that any hiragana/katakana
1300 distinctions must occur in level 3, and their weights at level 3 must be
1301 same as those mentioned in 7.3.1, UTS #10.
1302 If you define your collation elements which violate this requirement,
1303 this parameter does not work validly.
1304
1305 =item level
1306
1307 -- see 4.3 Form Sort Key, UTS #10.
1308
1309 Set the maximum level.
1310 Any higher levels than the specified one are ignored.
1311
1312   Level 1: alphabetic ordering
1313   Level 2: diacritic ordering
1314   Level 3: case ordering
1315   Level 4: tie-breaking (e.g. in the case when variable is 'shifted')
1316
1317   ex.level => 2,
1318
1319 If omitted, the maximum is the 4th.
1320
1321 =item normalization
1322
1323 -- see 4.1 Normalize, UTS #10.
1324
1325 If specified, strings are normalized before preparation of sort keys
1326 (the normalization is executed after preprocess).
1327
1328 A form name C<Unicode::Normalize::normalize()> accepts will be applied
1329 as C<$normalization_form>.
1330 Acceptable names include C<'NFD'>, C<'NFC'>, C<'NFKD'>, and C<'NFKC'>.
1331 See C<Unicode::Normalize::normalize()> for detail.
1332 If omitted, C<'NFD'> is used.
1333
1334 C<normalization> is performed after C<preprocess> (if defined).
1335
1336 Furthermore, special values, C<undef> and C<"prenormalized">, can be used,
1337 though they are not concerned with C<Unicode::Normalize::normalize()>.
1338
1339 If C<undef> (not a string C<"undef">) is passed explicitly
1340 as the value for this key,
1341 any normalization is not carried out (this may make tailoring easier
1342 if any normalization is not desired). Under C<(normalization =E<gt> undef)>,
1343 only contiguous contractions are resolved;
1344 e.g. even if C<A-ring> (and C<A-ring-cedilla>) is ordered after C<Z>,
1345 C<A-cedilla-ring> would be primary equal to C<A>.
1346 In this point,
1347 C<(normalization =E<gt> undef, preprocess =E<gt> sub { NFD(shift) })>
1348 B<is not> equivalent to C<(normalization =E<gt> 'NFD')>.
1349
1350 In the case of C<(normalization =E<gt> "prenormalized")>,
1351 any normalization is not performed, but
1352 non-contiguous contractions with combining characters are performed.
1353 Therefore
1354 C<(normalization =E<gt> 'prenormalized', preprocess =E<gt> sub { NFD(shift) })>
1355 B<is> equivalent to C<(normalization =E<gt> 'NFD')>.
1356 If source strings are finely prenormalized,
1357 C<(normalization =E<gt> 'prenormalized')> may save time for normalization.
1358
1359 Except C<(normalization =E<gt> undef)>,
1360 B<Unicode::Normalize> is required (see also B<CAVEAT>).
1361
1362 =item overrideCJK
1363
1364 -- see 7.1 Derived Collation Elements, UTS #10.
1365
1366 By default, CJK Unified Ideographs are ordered in Unicode codepoint order
1367 but C<CJK Unified Ideographs> (if C<UCA_Version> is 8 to 11, its range is
1368 C<U+4E00..U+9FA5>; if C<UCA_Version> is 14, its range is C<U+4E00..U+9FBB>)
1369 are lesser than C<CJK Unified Ideographs Extension> (its range is
1370 C<U+3400..U+4DB5> and C<U+20000..U+2A6D6>).
1371
1372 Through C<overrideCJK>, ordering of CJK Unified Ideographs can be overrided.
1373
1374 ex. CJK Unified Ideographs in the JIS code point order.
1375
1376   overrideCJK => sub {
1377       my $u = shift;             # get a Unicode codepoint
1378       my $b = pack('n', $u);     # to UTF-16BE
1379       my $s = your_unicode_to_sjis_converter($b); # convert
1380       my $n = unpack('n', $s);   # convert sjis to short
1381       [ $n, 0x20, 0x2, $u ];     # return the collation element
1382   },
1383
1384 ex. ignores all CJK Unified Ideographs.
1385
1386   overrideCJK => sub {()}, # CODEREF returning empty list
1387
1388    # where ->eq("Pe\x{4E00}rl", "Perl") is true
1389    # as U+4E00 is a CJK Unified Ideograph and to be ignorable.
1390
1391 If C<undef> is passed explicitly as the value for this key,
1392 weights for CJK Unified Ideographs are treated as undefined.
1393 But assignment of weight for CJK Unified Ideographs
1394 in table or C<entry> is still valid.
1395
1396 =item overrideHangul
1397
1398 -- see 7.1 Derived Collation Elements, UTS #10.
1399
1400 By default, Hangul Syllables are decomposed into Hangul Jamo,
1401 even if C<(normalization =E<gt> undef)>.
1402 But the mapping of Hangul Syllables may be overrided.
1403
1404 This parameter works like C<overrideCJK>, so see there for examples.
1405
1406 If you want to override the mapping of Hangul Syllables,
1407 NFD, NFKD, and FCD are not appropriate,
1408 since they will decompose Hangul Syllables before overriding.
1409
1410 If C<undef> is passed explicitly as the value for this key,
1411 weight for Hangul Syllables is treated as undefined
1412 without decomposition into Hangul Jamo.
1413 But definition of weight for Hangul Syllables
1414 in table or C<entry> is still valid.
1415
1416 =item preprocess
1417
1418 -- see 5.1 Preprocessing, UTS #10.
1419
1420 If specified, the coderef is used to preprocess
1421 before the formation of sort keys.
1422
1423 ex. dropping English articles, such as "a" or "the".
1424 Then, "the pen" is before "a pencil".
1425
1426      preprocess => sub {
1427            my $str = shift;
1428            $str =~ s/\b(?:an?|the)\s+//gi;
1429            return $str;
1430         },
1431
1432 C<preprocess> is performed before C<normalization> (if defined).
1433
1434 =item rearrange
1435
1436 -- see 3.1.3 Rearrangement, UTS #10.
1437
1438 Characters that are not coded in logical order and to be rearranged.
1439 If C<UCA_Version> is equal to or lesser than 11, default is:
1440
1441     rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
1442
1443 If you want to disallow any rearrangement, pass C<undef> or C<[]>
1444 (a reference to empty list) as the value for this key.
1445
1446 If C<UCA_Version> is equal to 14, default is C<[]> (i.e. no rearrangement).
1447
1448 B<According to the version 9 of UCA, this parameter shall not be used;
1449 but it is not warned at present.>
1450
1451 =item table
1452
1453 -- see 3.2 Default Unicode Collation Element Table, UTS #10.
1454
1455 You can use another collation element table if desired.
1456
1457 The table file should locate in the F<Unicode/Collate> directory
1458 on C<@INC>. Say, if the filename is F<Foo.txt>,
1459 the table file is searched as F<Unicode/Collate/Foo.txt> in C<@INC>.
1460
1461 By default, F<allkeys.txt> (as the filename of DUCET) is used.
1462
1463 If C<undef> is passed explicitly as the value for this key,
1464 no file is read (but you can define collation elements via C<entry>).
1465
1466 A typical way to define a collation element table
1467 without any file of table:
1468
1469    $onlyABC = Unicode::Collate->new(
1470        table => undef,
1471        entry => << 'ENTRIES',
1472 0061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A
1473 0041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A
1474 0062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B
1475 0042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B
1476 0063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C
1477 0043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C
1478 ENTRIES
1479     );
1480
1481 If C<ignoreName> or C<undefName> is used, character names should be
1482 specified as a comment (following C<#>) on each line.
1483
1484 =item undefChar
1485
1486 =item undefName
1487
1488 -- see 6.3.4 Reducing the Repertoire, UTS #10.
1489
1490 Undefines the collation element as if it were unassigned in the table.
1491 This reduces the size of the table.
1492 If an unassigned character appears in the string to be collated,
1493 the sort key is made from its codepoint
1494 as a single-character collation element,
1495 as it is greater than any other assigned collation elements
1496 (in the codepoint order among the unassigned characters).
1497 But, it'd be better to ignore characters
1498 unfamiliar to you and maybe never used.
1499
1500 Through C<undefChar>, any character matching C<qr/$undefChar/>
1501 will be undefined. Through C<undefName>, any character whose name
1502 (given in the C<table> file as a comment) matches C<qr/$undefName/>
1503 will be undefined.
1504
1505 ex. Collation weights for beyond-BMP characters are not stored in object:
1506
1507     undefChar => qr/[^\0-\x{fffd}]/,
1508
1509 =item upper_before_lower
1510
1511 -- see 6.6 Case Comparisons, UTS #10.
1512
1513 By default, lowercase is before uppercase.
1514 If the parameter is made true, this is reversed.
1515
1516 B<NOTE>: This parameter simplemindedly assumes that any lowercase/uppercase
1517 distinctions must occur in level 3, and their weights at level 3 must be
1518 same as those mentioned in 7.3.1, UTS #10.
1519 If you define your collation elements which differs from this requirement,
1520 this parameter doesn't work validly.
1521
1522 =item variable
1523
1524 -- see 3.2.2 Variable Weighting, UTS #10.
1525
1526 This key allows to variable weighting for variable collation elements,
1527 which are marked with an ASTERISK in the table
1528 (NOTE: Many punction marks and symbols are variable in F<allkeys.txt>).
1529
1530    variable => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
1531
1532 These names are case-insensitive.
1533 By default (if specification is omitted), 'shifted' is adopted.
1534
1535    'Blanked'        Variable elements are made ignorable at levels 1 through 3;
1536                     considered at the 4th level.
1537
1538    'Non-Ignorable'  Variable elements are not reset to ignorable.
1539
1540    'Shifted'        Variable elements are made ignorable at levels 1 through 3
1541                     their level 4 weight is replaced by the old level 1 weight.
1542                     Level 4 weight for Non-Variable elements is 0xFFFF.
1543
1544    'Shift-Trimmed'  Same as 'shifted', but all FFFF's at the 4th level
1545                     are trimmed.
1546
1547 =back
1548
1549 =head2 Methods for Collation
1550
1551 =over 4
1552
1553 =item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
1554
1555 Sorts a list of strings.
1556
1557 =item C<$result = $Collator-E<gt>cmp($a, $b)>
1558
1559 Returns 1 (when C<$a> is greater than C<$b>)
1560 or 0 (when C<$a> is equal to C<$b>)
1561 or -1 (when C<$a> is lesser than C<$b>).
1562
1563 =item C<$result = $Collator-E<gt>eq($a, $b)>
1564
1565 =item C<$result = $Collator-E<gt>ne($a, $b)>
1566
1567 =item C<$result = $Collator-E<gt>lt($a, $b)>
1568
1569 =item C<$result = $Collator-E<gt>le($a, $b)>
1570
1571 =item C<$result = $Collator-E<gt>gt($a, $b)>
1572
1573 =item C<$result = $Collator-E<gt>ge($a, $b)>
1574
1575 They works like the same name operators as theirs.
1576
1577    eq : whether $a is equal to $b.
1578    ne : whether $a is not equal to $b.
1579    lt : whether $a is lesser than $b.
1580    le : whether $a is lesser than $b or equal to $b.
1581    gt : whether $a is greater than $b.
1582    ge : whether $a is greater than $b or equal to $b.
1583
1584 =item C<$sortKey = $Collator-E<gt>getSortKey($string)>
1585
1586 -- see 4.3 Form Sort Key, UTS #10.
1587
1588 Returns a sort key.
1589
1590 You compare the sort keys using a binary comparison
1591 and get the result of the comparison of the strings using UCA.
1592
1593    $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
1594
1595       is equivalent to
1596
1597    $Collator->cmp($a, $b)
1598
1599 =item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)>
1600
1601 Converts a sorting key into its representation form.
1602 If C<UCA_Version> is 8, the output is slightly different.
1603
1604    use Unicode::Collate;
1605    my $c = Unicode::Collate->new();
1606    print $c->viewSortKey("Perl"),"\n";
1607
1608    # output:
1609    # [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF]
1610    #  Level 1               Level 2               Level 3               Level 4
1611
1612 =back
1613
1614 =head2 Methods for Searching
1615
1616 B<DISCLAIMER:> If C<preprocess> or C<normalization> parameter is true
1617 for C<$Collator>, calling these methods (C<index>, C<match>, C<gmatch>,
1618 C<subst>, C<gsubst>) is croaked,
1619 as the position and the length might differ
1620 from those on the specified string.
1621 (And C<rearrange> and C<hangul_terminator> parameters are neglected.)
1622
1623 The C<match>, C<gmatch>, C<subst>, C<gsubst> methods work
1624 like C<m//>, C<m//g>, C<s///>, C<s///g>, respectively,
1625 but they are not aware of any pattern, but only a literal substring.
1626
1627 =over 4
1628
1629 =item C<$position = $Collator-E<gt>index($string, $substring[, $position])>
1630
1631 =item C<($position, $length) = $Collator-E<gt>index($string, $substring[, $position])>
1632
1633 If C<$substring> matches a part of C<$string>, returns
1634 the position of the first occurrence of the matching part in scalar context;
1635 in list context, returns a two-element list of
1636 the position and the length of the matching part.
1637
1638 If C<$substring> does not match any part of C<$string>,
1639 returns C<-1> in scalar context and
1640 an empty list in list context.
1641
1642 e.g. you say
1643
1644   my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1645                                      # (normalization => undef) is REQUIRED.
1646   my $str = "Ich muß studieren Perl.";
1647   my $sub = "MÜSS";
1648   my $match;
1649   if (my($pos,$len) = $Collator->index($str, $sub)) {
1650       $match = substr($str, $pos, $len);
1651   }
1652
1653 and get C<"muß"> in C<$match> since C<"muß">
1654 is primary equal to C<"MÜSS">.
1655
1656 =item C<$match_ref = $Collator-E<gt>match($string, $substring)>
1657
1658 =item C<($match)   = $Collator-E<gt>match($string, $substring)>
1659
1660 If C<$substring> matches a part of C<$string>, in scalar context, returns
1661 B<a reference to> the first occurrence of the matching part
1662 (C<$match_ref> is always true if matches,
1663 since every reference is B<true>);
1664 in list context, returns the first occurrence of the matching part.
1665
1666 If C<$substring> does not match any part of C<$string>,
1667 returns C<undef> in scalar context and
1668 an empty list in list context.
1669
1670 e.g.
1671
1672     if ($match_ref = $Collator->match($str, $sub)) { # scalar context
1673         print "matches [$$match_ref].\n";
1674     } else {
1675         print "doesn't match.\n";
1676     }
1677
1678      or
1679
1680     if (($match) = $Collator->match($str, $sub)) { # list context
1681         print "matches [$match].\n";
1682     } else {
1683         print "doesn't match.\n";
1684     }
1685
1686 =item C<@match = $Collator-E<gt>gmatch($string, $substring)>
1687
1688 If C<$substring> matches a part of C<$string>, returns
1689 all the matching parts (or matching count in scalar context).
1690
1691 If C<$substring> does not match any part of C<$string>,
1692 returns an empty list.
1693
1694 =item C<$count = $Collator-E<gt>subst($string, $substring, $replacement)>
1695
1696 If C<$substring> matches a part of C<$string>,
1697 the first occurrence of the matching part is replaced by C<$replacement>
1698 (C<$string> is modified) and return C<$count> (always equals to C<1>).
1699
1700 C<$replacement> can be a C<CODEREF>,
1701 taking the matching part as an argument,
1702 and returning a string to replace the matching part
1703 (a bit similar to C<s/(..)/$coderef-E<gt>($1)/e>).
1704
1705 =item C<$count = $Collator-E<gt>gsubst($string, $substring, $replacement)>
1706
1707 If C<$substring> matches a part of C<$string>,
1708 all the occurrences of the matching part is replaced by C<$replacement>
1709 (C<$string> is modified) and return C<$count>.
1710
1711 C<$replacement> can be a C<CODEREF>,
1712 taking the matching part as an argument,
1713 and returning a string to replace the matching part
1714 (a bit similar to C<s/(..)/$coderef-E<gt>($1)/eg>).
1715
1716 e.g.
1717
1718   my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1719                                      # (normalization => undef) is REQUIRED.
1720   my $str = "Camel donkey zebra came\x{301}l CAMEL horse cAm\0E\0L...";
1721   $Collator->gsubst($str, "camel", sub { "<b>$_[0]</b>" });
1722
1723   # now $str is "<b>Camel</b> donkey zebra <b>came\x{301}l</b> <b>CAMEL</b> horse <b>cAm\0E\0L</b>...";
1724   # i.e., all the camels are made bold-faced.
1725
1726 =back
1727
1728 =head2 Other Methods
1729
1730 =over 4
1731
1732 =item C<%old_tailoring = $Collator-E<gt>change(%new_tailoring)>
1733
1734 Change the value of specified keys and returns the changed part.
1735
1736     $Collator = Unicode::Collate->new(level => 4);
1737
1738     $Collator->eq("perl", "PERL"); # false
1739
1740     %old = $Collator->change(level => 2); # returns (level => 4).
1741
1742     $Collator->eq("perl", "PERL"); # true
1743
1744     $Collator->change(%old); # returns (level => 2).
1745
1746     $Collator->eq("perl", "PERL"); # false
1747
1748 Not all C<(key,value)>s are allowed to be changed.
1749 See also C<@Unicode::Collate::ChangeOK> and C<@Unicode::Collate::ChangeNG>.
1750
1751 In the scalar context, returns the modified collator
1752 (but it is B<not> a clone from the original).
1753
1754     $Collator->change(level => 2)->eq("perl", "PERL"); # true
1755
1756     $Collator->eq("perl", "PERL"); # true; now max level is 2nd.
1757
1758     $Collator->change(level => 4)->eq("perl", "PERL"); # false
1759
1760 =item C<$version = $Collator-E<gt>version()>
1761
1762 Returns the version number (a string) of the Unicode Standard
1763 which the C<table> file used by the collator object is based on.
1764 If the table does not include a version line (starting with C<@version>),
1765 returns C<"unknown">.
1766
1767 =item C<UCA_Version()>
1768
1769 Returns the tracking version number of UTS #10 this module consults.
1770
1771 =item C<Base_Unicode_Version()>
1772
1773 Returns the version number of UTS #10 this module consults.
1774
1775 =back
1776
1777 =head1 EXPORT
1778
1779 No method will be exported.
1780
1781 =head1 CAVEATS
1782
1783 =over 4
1784
1785 =item Normalization
1786
1787 Use of the C<normalization> parameter requires the B<Unicode::Normalize>
1788 module (see L<Unicode::Normalize>).
1789
1790 If you need not it (say, in the case when you need not
1791 handle any combining characters),
1792 assign C<normalization =E<gt> undef> explicitly.
1793
1794 -- see 6.5 Avoiding Normalization, UTS #10.
1795
1796 =item Conformance Test
1797
1798 The Conformance Test for the UCA is available
1799 under L<http://www.unicode.org/Public/UCA/>.
1800
1801 For F<CollationTest_SHIFTED.txt>,
1802 a collator via C<Unicode::Collate-E<gt>new( )> should be used;
1803 for F<CollationTest_NON_IGNORABLE.txt>, a collator via
1804 C<Unicode::Collate-E<gt>new(variable =E<gt> "non-ignorable", level =E<gt> 3)>.
1805
1806 B<Unicode::Normalize is required to try The Conformance Test.>
1807
1808 =back
1809
1810 =head1 AUTHOR
1811
1812 SADAHIRO Tomoyuki <SADAHIRO@cpan.org>
1813
1814 Copyright(C) 2001-2005, SADAHIRO Tomoyuki. Japan. All rights reserved.
1815
1816 This module is free software; you can redistribute it
1817 and/or modify it under the same terms as Perl itself.
1818
1819 =head1 SEE ALSO
1820
1821 =over 4
1822
1823 =item Unicode Collation Algorithm - UTS #10
1824
1825 L<http://www.unicode.org/reports/tr10/>
1826
1827 =item The Default Unicode Collation Element Table (DUCET)
1828
1829 L<http://www.unicode.org/Public/UCA/latest/allkeys.txt>
1830
1831 =item The conformance test for the UCA
1832
1833 L<http://www.unicode.org/Public/UCA/latest/CollationTest.html>
1834
1835 L<http://www.unicode.org/Public/UCA/latest/CollationTest.zip>
1836
1837 =item Hangul Syllable Type
1838
1839 L<http://www.unicode.org/Public/UNIDATA/HangulSyllableType.txt>
1840
1841 =item Unicode Normalization Forms - UAX #15
1842
1843 L<http://www.unicode.org/reports/tr15/>
1844
1845 =back
1846
1847 =cut