lib/Unicode/Collate.pm

   1 package Unicode::Collate;
   2
   3 BEGIN {
   4     if (ord("A") == 193) {
   5         die "Unicode::Collate not ported to EBCDIC\n";
   6     }
   7 }
   8
   9 use 5.006;
  10 use strict;
  11 use warnings;
  12 use Carp;
  13 use File::Spec;
  14
  15 require Exporter;
  16
  17 our $VERSION = '0.20';
  18 our $PACKAGE = __PACKAGE__;
  19
  20 our @ISA = qw(Exporter);
  21
  22 our %EXPORT_TAGS = ();
  23 our @EXPORT_OK = ();
  24 our @EXPORT = ();
  25
  26 (our $Path = $INC{'Unicode/Collate.pm'}) =~ s/\.pm$//;
  27 our $KeyFile = "allkeys.txt";
  28
  29 our $UNICODE_VERSION;
  30
  31 eval { require Unicode::UCD };
  32
  33 unless ($@) {
  34     $UNICODE_VERSION = Unicode::UCD::UnicodeVersion();
  35 }
  36 else { # XXX, Perl 5.6.1
  37     my($f, $fh);
  38     foreach my $d (@INC) {
  39         $f = File::Spec->catfile($d, "unicode", "Unicode.301");
  40         if (open($fh, $f)) {
  41             $UNICODE_VERSION = '3.0.1';
  42             close $fh;
  43             last;
  44         }
  45     }
  46 }
  47
  48 our $getCombinClass; # coderef for combining class from Unicode::Normalize
  49
  50 use constant Min2   => 0x20;    # minimum weight at level 2
  51 use constant Min3   => 0x02;    # minimum weight at level 3
  52
  53 # format for pack
  54 use constant VCE_FORMAT => 'Cn4'; # for variable + CE with 4 levels
  55
  56 # values of variable
  57 use constant NON_VAR => 0; # Non-Variable character
  58 use constant VAR     => 1; # Variable character
  59
  60 our $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
  61
  62 sub UCA_Version { "9" }
  63
  64 sub Base_Unicode_Version { $UNICODE_VERSION || 'unknown' }
  65
  66 my (%AlternateOK);
  67 @AlternateOK{ qw/
  68     blanked  non-ignorable  shifted  shift-trimmed
  69   / } = ();
  70
  71 our @ChangeOK = qw/
  72     alternate backwards level normalization rearrange
  73     katakana_before_hiragana upper_before_lower
  74     overrideHangul overrideCJK preprocess UCA_Version
  75   /;
  76
  77 our @ChangeNG = qw/
  78     entry entries table ignored combining maxlength
  79     ignoreChar ignoreName undefChar undefName
  80     versionTable alternateTable backwardsTable forwardsTable rearrangeTable
  81     derivCode normCode rearrangeHash isShift L3ignorable
  82   /;
  83
  84 my (%ChangeOK, %ChangeNG);
  85 @ChangeOK{ @ChangeOK } = ();
  86 @ChangeNG{ @ChangeNG } = ();
  87
  88 sub change {
  89     my $self = shift;
  90     my %hash = @_;
  91     my %old;
  92     foreach my $k (keys %hash) {
  93         if (exists $ChangeOK{$k}) {
  94             $old{$k} = $self->{$k};
  95             $self->{$k} = $hash{$k};
  96         }
  97         elsif (exists $ChangeNG{$k}) {
  98             croak "change of $k via change() is not allowed!";
  99         }
 100         # else => ignored
 101     }
 102     $self->checkCollator;
 103     return wantarray ? %old : $self;
 104 }
 105
 106 sub checkCollator {
 107     my $self = shift;
 108     croak "Illegal level lower than 1 (passed $self->{level})."
 109         if $self->{level} < 1;
 110     croak "A level higher than 4 (passed $self->{level}) is not supported."
 111         if 4 < $self->{level};
 112
 113     $self->{derivCode} =
 114         $self->{UCA_Version} == -1 ? \&broken_derivCE :
 115         $self->{UCA_Version} ==  8 ? \&derivCE_8 :
 116         $self->{UCA_Version} ==  9 ? \&derivCE_9 :
 117       croak "Illegal UCA version (passed $self->{UCA_Version}).";
 118
 119     $self->{alternate} = lc($self->{alternate});
 120     croak "$PACKAGE unknown alternate tag name: $self->{alternate}"
 121         unless exists $AlternateOK{ $self->{alternate} };
 122
 123     $self->{isShift} = $self->{alternate} eq 'shifted' ||
 124                 $self->{alternate} eq 'shift-trimmed';
 125
 126     $self->{backwards} = []
 127         if ! defined $self->{backwards};
 128     $self->{backwards} = [ $self->{backwards} ]
 129         if ! ref $self->{backwards};
 130
 131     $self->{rearrange} = []
 132         if ! defined $self->{rearrange};
 133     croak "$PACKAGE: A list for rearrangement must be store in an ARRAYREF"
 134         if ! ref $self->{rearrange};
 135
 136     # keys of $self->{rearrangeHash} are $self->{rearrange}.
 137     $self->{rearrangeHash} = undef;
 138
 139     if (@{ $self->{rearrange} }) {
 140         @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
 141     }
 142
 143     $self->{normCode} = undef;
 144
 145     if (defined $self->{normalization}) {
 146         eval { require Unicode::Normalize };
 147         croak "Unicode/Normalize.pm is required to normalize strings: $@"
 148             if $@;
 149
 150         Unicode::Normalize->import();
 151         $getCombinClass = \&Unicode::Normalize::getCombinClass
 152             if ! $getCombinClass;
 153
 154         $self->{normCode} =
 155             $self->{normalization} =~ /^(?:NF)?C$/  ? \&NFC :
 156             $self->{normalization} =~ /^(?:NF)?D$/  ? \&NFD :
 157             $self->{normalization} =~ /^(?:NF)?KC$/ ? \&NFKC :
 158             $self->{normalization} =~ /^(?:NF)?KD$/ ? \&NFKD :
 159           croak "$PACKAGE unknown normalization form name: "
 160                 . $self->{normalization};
 161     }
 162     return;
 163 }
 164
 165 sub new
 166 {
 167     my $class = shift;
 168     my $self = bless { @_ }, $class;
 169
 170     # If undef is passed explicitly, no file is read.
 171     $self->{table} = $KeyFile if ! exists $self->{table};
 172     $self->read_table if defined $self->{table};
 173
 174     if ($self->{entry}) {
 175         $self->parseEntry($_) foreach split /\n/, $self->{entry};
 176     }
 177
 178     $self->{level} ||= 4;
 179     $self->{UCA_Version} ||= UCA_Version();
 180
 181     $self->{overrideHangul} = ''
 182         if ! exists $self->{overrideHangul};
 183     $self->{overrideCJK} = ''
 184         if ! exists $self->{overrideCJK};
 185     $self->{normalization} = 'D'
 186         if ! exists $self->{normalization};
 187     $self->{alternate} = $self->{alternateTable} || 'shifted'
 188         if ! exists $self->{alternate};
 189     $self->{rearrange} = $self->{rearrangeTable} || $DefaultRearrange
 190         if ! exists $self->{rearrange};
 191     $self->{backwards} = $self->{backwardsTable}
 192         if ! exists $self->{backwards};
 193
 194     $self->checkCollator;
 195
 196     return $self;
 197 }
 198
 199 sub read_table {
 200     my $self = shift;
 201     my $file = $self->{table} ne '' ? $self->{table} : $KeyFile;
 202
 203     my $filepath = File::Spec->catfile($Path, $file);
 204     open my $fk, "<$filepath"
 205         or croak "File does not exist at $filepath";
 206
 207     while (<$fk>) {
 208         next if /^\s*#/;
 209         if (/^\s*\@/) {
 210             if    (/^\s*\@version\s*(\S*)/) {
 211                 $self->{versionTable} ||= $1;
 212             }
 213             elsif (/^\s*\@alternate\s+(\S*)/) {
 214                 $self->{alternateTable} ||= $1;
 215             }
 216             elsif (/^\s*\@backwards\s+(\S*)/) {
 217                 push @{ $self->{backwardsTable} }, $1;
 218             }
 219             elsif (/^\s*\@forwards\s+(\S*)/) { # parhaps no use
 220                 push @{ $self->{forwardsTable} }, $1;
 221             }
 222             elsif (/^\s*\@rearrange\s+(.*)/) { # (\S*) is NG
 223                 push @{ $self->{rearrangeTable} }, _getHexArray($1);
 224             }
 225             next;
 226         }
 227         $self->parseEntry($_);
 228     }
 229     close $fk;
 230 }
 231
 232
 233 ##
 234 ## get $line, parse it, and write an entry in $self
 235 ##
 236 sub parseEntry
 237 {
 238     my $self = shift;
 239     my $line = shift;
 240     my($name, $ele, @key);
 241
 242     return if $line !~ /^\s*[0-9A-Fa-f]/;
 243
 244     # removes comment and gets name
 245     $name = $1
 246         if $line =~ s/[#%]\s*(.*)//;
 247     return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
 248
 249     # gets element
 250     my($e, $k) = split /;/, $line;
 251     croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
 252         if ! $k;
 253
 254     my @e = _getHexArray($e);
 255     return if !@e;
 256
 257     $ele = pack('U*', @e);
 258     return if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
 259
 260     # get sort key
 261     if (defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/ ||
 262         defined $self->{ignoreChar} && $ele  =~ /$self->{ignoreChar}/)
 263     {
 264         $self->{entries}{$ele} = $self->{ignored}{$ele} = 1;
 265     }
 266     else {
 267         my $combining = 1; # primary = 0, secondary != 0;
 268         my $level3ingore;
 269
 270         foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
 271             my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
 272             my @arr = _getHexArray($arr);
 273             push @key, pack(VCE_FORMAT, $var, @arr);
 274             $combining = 0 unless $arr[0] == 0 && $arr[1] != 0;
 275             $level3ingore = 1 if $arr[0] == 0 && $arr[1] == 0 && $arr[2] == 0;
 276         }
 277         $self->{entries}{$ele} = \@key;
 278
 279         $self->{combining}{$ele} = 1
 280             if $combining;
 281
 282         $self->{L3ignorable}{$e[0]} = 1
 283             if @e == 1 && $level3ingore;
 284     }
 285     $self->{maxlength}{ord $ele} = scalar @e if @e > 1;
 286 }
 287
 288 ##
 289 ## arrayref CE = altCE(bool variable?, list[num] weights)
 290 ##
 291 sub altCE
 292 {
 293     my $self = shift;
 294     my($var, @c) = unpack(VCE_FORMAT, shift);
 295
 296     $self->{alternate} eq 'blanked' ?
 297         $var ? [0,0,0,$c[3]] : \@c :
 298     $self->{alternate} eq 'non-ignorable' ?
 299         \@c :
 300     $self->{alternate} eq 'shifted' ?
 301         $var ? [0,0,0,$c[0] ] : [ @c[0..2], $c[0]+$c[1]+$c[2] ? 0xFFFF : 0 ] :
 302     $self->{alternate} eq 'shift-trimmed' ?
 303         $var ? [0,0,0,$c[0] ] : [ @c[0..2], 0 ] :
 304         croak "$PACKAGE unknown alternate name: $self->{alternate}";
 305 }
 306
 307 sub viewSortKey
 308 {
 309     my $self = shift;
 310     my $ver = $self->{UCA_Version};
 311
 312     my $key  = $self->getSortKey(@_);
 313     my $view = join " ", map sprintf("%04X", $_), unpack 'n*', $key;
 314     if ($ver <= 8) {
 315         $view =~ s/ ?0000 ?/|/g;
 316     } else {
 317         $view =~ s/\b0000\b/|/g;
 318     }
 319     return "[$view]";
 320 }
 321
 322
 323 ##
 324 ## list[strings] elements = splitCE(string arg)
 325 ##
 326 sub splitCE
 327 {
 328     my $self = shift;
 329     my $code = $self->{preprocess};
 330     my $norm = $self->{normCode};
 331     my $ent  = $self->{entries};
 332     my $max  = $self->{maxlength};
 333     my $reH  = $self->{rearrangeHash};
 334     my $L3i  = $self->{L3ignorable};
 335     my $ver9 = $self->{UCA_Version} > 8;
 336
 337     my $str = ref $code ? &$code(shift) : shift;
 338     $str = &$norm($str) if ref $norm;
 339
 340     my @src = unpack('U*', $str);
 341     my @buf;
 342
 343     # rearrangement
 344     if ($reH) {
 345         for (my $i = 0; $i < @src; $i++) {
 346             if (exists $reH->{ $src[$i] } && $i + 1 < @src) {
 347                 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]);
 348                 $i++;
 349             }
 350         }
 351     }
 352
 353     if ($ver9) {
 354         @src = grep ! $L3i->{$_}, @src;
 355     }
 356
 357     for (my $i = 0; $i < @src; $i++) {
 358         my $ch;
 359         my $u = $src[$i];
 360
 361         # non-characters
 362         next unless defined $u;
 363         next if $u < 0 || 0x10FFFF < $u    # out of range
 364             || (0xD800 <= $u && $u <= 0xDFFF) # unpaired surrogates
 365             || (0xFDD0 <= $u && $u <= 0xFDEF) # non-character
 366         ;
 367
 368         my $four = $u & 0xFFFF;
 369         next if $four == 0xFFFE || $four == 0xFFFF;
 370
 371         if ($max->{$u}) { # contract
 372             for (my $j = $max->{$u}; $j >= 1; $j--) {
 373                 next unless $i+$j-1 < @src;
 374                 $ch = pack 'U*', @src[$i .. $i+$j-1];
 375                 $i += $j-1, last if $ent->{$ch};
 376             }
 377         } else {
 378             $ch = pack('U', $u);
 379         }
 380
 381         # with Combining Char (UTS#10, 4.2.1), here requires Unicode::Normalize.
 382         if ($getCombinClass && defined $ch) {
 383             for (my $j = $i+1; $j < @src; $j++) {
 384                 next unless defined $src[$j];
 385                 last unless $getCombinClass->( $src[$j] );
 386                 my $comb = pack 'U', $src[$j];
 387                 next if ! $ent->{ $ch.$comb };
 388                 $ch .= $comb;
 389                 $src[$j] = undef;
 390             }
 391         }
 392         push @buf, $ch;
 393     }
 394     wantarray ? @buf : \@buf;
 395 }
 396
 397
 398 ##
 399 ## list[arrayrefs] weight = getWt(string element)
 400 ##
 401 sub getWt
 402 {
 403     my $self = shift;
 404     my $ch   = shift;
 405     my $ent  = $self->{entries};
 406     my $ign  = $self->{ignored};
 407     my $cjk  = $self->{overrideCJK};
 408     my $hang = $self->{overrideHangul};
 409     my $der  = $self->{derivCode};
 410
 411     return if !defined $ch || $ign->{$ch}; # ignored
 412     return map($self->altCE($_), @{ $ent->{$ch} })
 413         if $ent->{$ch};
 414
 415     my $u = unpack('U', $ch);
 416
 417     if (0xAC00 <= $u && $u <= 0xD7A3) { # is_Hangul
 418         return map $self->altCE($_),
 419             $hang
 420                 ? map(pack(VCE_FORMAT, NON_VAR, @$_), &$hang($u))
 421                 : defined $hang
 422                     ? map({
 423                             my $v = $_;
 424                             my $vCE = $ent->{pack('U', $v)};
 425                             $vCE ? @$vCE : $der->($v);
 426                         } _decompHangul($u))
 427                     : $der->($u);
 428     }
 429     elsif (0x3400 <= $u && $u <= 0x4DB5 ||
 430            0x4E00 <= $u && $u <= 0x9FA5 ||
 431            0x20000 <= $u && $u <= 0x2A6D6) { # CJK Ideograph
 432         return map $self->altCE($_),
 433             $cjk
 434                 ? map(pack(VCE_FORMAT, NON_VAR, @$_), &$cjk($u))
 435                 : defined $cjk && $self->{UCA_Version} <= 8 && $u < 0x10000
 436                     ? pack(VCE_FORMAT, NON_VAR, $u, 0x20, 0x02, $u)
 437                     : $der->($u);
 438     }
 439     else {
 440         return map $self->altCE($_), $der->($u);
 441     }
 442 }
 443
 444 ##
 445 ## int = index(string, substring)
 446 ##
 447 sub index
 448 {
 449     my $self = shift;
 450     my $lev  = $self->{level};
 451     my $comb = $self->{combining};
 452     my $str  = $self->splitCE(shift);
 453     my $sub  = $self->splitCE(shift);
 454
 455     return wantarray ? (0,0) : 0 if ! @$sub;
 456     return wantarray ?  ()  : -1 if ! @$str;
 457
 458     my @subWt = grep _ignorableAtLevel($_,$lev),
 459                 map $self->getWt($_), @$sub;
 460
 461     my(@strWt,@strPt);
 462     my $count = 0;
 463     for (my $i = 0; $i < @$str; $i++) {
 464         my $go_ahead = 0;
 465
 466         my @tmp = grep _ignorableAtLevel($_,$lev), $self->getWt($str->[$i]);
 467         $go_ahead += length $str->[$i];
 468
 469         # /*XXX*/ still broken.
 470         # index("e\x{300}", "e") should be 'no match' at level 2 or higher
 471         # as "e\x{300}" is a *single* grapheme cluster and not equal to "e".
 472
 473         # go ahead as far as we find a combining character;
 474         while ($i + 1 < @$str &&
 475               (! defined $str->[$i+1] || $comb->{ $str->[$i+1] }) ) {
 476             $i++;
 477             next if ! defined $str->[$i];
 478             $go_ahead += length $str->[$i];
 479             push @tmp,
 480                 grep _ignorableAtLevel($_,$lev), $self->getWt($str->[$i]);
 481         }
 482
 483         push @strWt, @tmp;
 484         push @strPt, ($count) x @tmp;
 485         $count += $go_ahead;
 486
 487         while (@strWt >= @subWt) {
 488             if (_eqArray(\@strWt, \@subWt, $lev)) {
 489                 my $pos = $strPt[0];
 490                 return wantarray ? ($pos, $count-$pos) : $pos;
 491             }
 492             shift @strWt;
 493             shift @strPt;
 494         }
 495     }
 496     return wantarray ? () : -1;
 497 }
 498
 499 ##
 500 ## bool _eqArray(arrayref, arrayref, level)
 501 ##
 502 sub _eqArray($$$)
 503 {
 504     my $a   = shift; # length $a >= length $b;
 505     my $b   = shift;
 506     my $lev = shift;
 507     for my $v (0..$lev-1) {
 508         for my $c (0..@$b-1){
 509             return if $a->[$c][$v] != $b->[$c][$v];
 510         }
 511     }
 512     return 1;
 513 }
 514
 515
 516 ##
 517 ## bool _ignorableAtLevel(CE, level)
 518 ##
 519 sub _ignorableAtLevel($$)
 520 {
 521     my $ce = shift;
 522     return unless defined $ce;
 523     my $lv = shift;
 524     return ! grep { ! $ce->[$_] } 0..$lv-1;
 525 }
 526
 527
 528 ##
 529 ## string sortkey = getSortKey(string arg)
 530 ##
 531 sub getSortKey
 532 {
 533     my $self = shift;
 534     my $lev  = $self->{level};
 535     my $rCE  = $self->splitCE(shift); # get an arrayref
 536     my $ver9 = $self->{UCA_Version} > 8;
 537     my $sht  = $self->{isShift};
 538
 539     # weight arrays
 540     my (@buf, $last_is_variable);
 541
 542     foreach my $ce (@$rCE) {
 543         my @t = $self->getWt($ce);
 544         if ($sht && $ver9) {
 545             if (@t == 1 && $t[0][0] == 0) {
 546                 if ($t[0][1] == 0 && $t[0][2] == 0) {
 547                     $last_is_variable = 1;
 548                 } else {
 549                     next if $last_is_variable;
 550                 }
 551             } else {
 552                 $last_is_variable = 0;
 553             }
 554         }
 555         push @buf, @t;
 556     }
 557
 558     # make sort key
 559     my @ret = ([],[],[],[]);
 560     foreach my $v (0..$lev-1) {
 561         foreach my $b (@buf) {
 562             push @{ $ret[$v] }, $b->[$v] if $b->[$v];
 563         }
 564     }
 565     foreach (@{ $self->{backwards} }) {
 566         my $v = $_ - 1;
 567         @{ $ret[$v] } = reverse @{ $ret[$v] };
 568     }
 569
 570     # modification of tertiary weights
 571     if ($self->{upper_before_lower}) {
 572         foreach (@{ $ret[2] }) {
 573             if    (0x8 <= $_ && $_ <= 0xC) { $_ -= 6 } # lower
 574             elsif (0x2 <= $_ && $_ <= 0x6) { $_ += 6 } # upper
 575             elsif ($_ == 0x1C)             { $_ += 1 } # square upper
 576             elsif ($_ == 0x1D)             { $_ -= 1 } # square lower
 577         }
 578     }
 579     if ($self->{katakana_before_hiragana}) {
 580         foreach (@{ $ret[2] }) {
 581             if    (0x0F <= $_ && $_ <= 0x13) { $_ -= 2 } # katakana
 582             elsif (0x0D <= $_ && $_ <= 0x0E) { $_ += 5 } # hiragana
 583         }
 584     }
 585     join "\0\0", map pack('n*', @$_), @ret;
 586 }
 587
 588
 589 ##
 590 ## int compare = cmp(string a, string b)
 591 ##
 592 sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
 593 sub eq  { $_[0]->getSortKey($_[1]) eq  $_[0]->getSortKey($_[2]) }
 594 sub ne  { $_[0]->getSortKey($_[1]) ne  $_[0]->getSortKey($_[2]) }
 595 sub lt  { $_[0]->getSortKey($_[1]) lt  $_[0]->getSortKey($_[2]) }
 596 sub le  { $_[0]->getSortKey($_[1]) le  $_[0]->getSortKey($_[2]) }
 597 sub gt  { $_[0]->getSortKey($_[1]) gt  $_[0]->getSortKey($_[2]) }
 598 sub ge  { $_[0]->getSortKey($_[1]) ge  $_[0]->getSortKey($_[2]) }
 599
 600 ##
 601 ## list[strings] sorted = sort(list[strings] arg)
 602 ##
 603 sub sort {
 604     my $obj = shift;
 605     return
 606         map { $_->[1] }
 607             sort{ $a->[0] cmp $b->[0] }
 608                 map [ $obj->getSortKey($_), $_ ], @_;
 609 }
 610
 611
 612 sub derivCE_9 {
 613     my $u = shift;
 614     my $base =
 615         (0x4E00 <= $u && $u <= 0x9FA5) # CJK
 616             ? 0xFB40 :
 617         (0x3400 <= $u && $u <= 0x4DB5 || 0x20000 <= $u && $u <= 0x2A6D6)
 618             ? 0xFB80 : 0xFBC0;
 619
 620     my $aaaa = $base + ($u >> 15);
 621     my $bbbb = ($u & 0x7FFF) | 0x8000;
 622     return
 623         pack(VCE_FORMAT, NON_VAR, $aaaa, Min2, Min3, $u),
 624         pack(VCE_FORMAT, NON_VAR, $bbbb,    0,    0, $u);
 625 }
 626
 627 sub derivCE_8 {
 628     my $code = shift;
 629     my $aaaa =  0xFF80 + ($code >> 15);
 630     my $bbbb = ($code & 0x7FFF) | 0x8000;
 631     return
 632         pack(VCE_FORMAT, NON_VAR, $aaaa, 2, 1, $code),
 633         pack(VCE_FORMAT, NON_VAR, $bbbb, 0, 0, $code);
 634 }
 635
 636 sub broken_derivCE { # NG
 637     my $code = shift;
 638     my $aaaa = 0xFFC2 + ($code >> 15);
 639     my $bbbb = $code & 0x7FFF | 0x1000;
 640     return
 641         pack(VCE_FORMAT, NON_VAR, $aaaa, 2, 1, $code),
 642         pack(VCE_FORMAT, NON_VAR, $bbbb, 0, 0, $code);
 643 }
 644
 645 ##
 646 ## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
 647 ##
 648 sub _getHexArray { map hex, $_[0] =~ /([0-9a-fA-F]+)/g }
 649
 650 #
 651 # $code must be in Hangul syllable.
 652 # Check it before you enter here.
 653 #
 654 sub _decompHangul {
 655     my $code = shift;
 656     my $SIndex = $code - 0xAC00;
 657     my $LIndex = int( $SIndex / 588);
 658     my $VIndex = int(($SIndex % 588) / 28);
 659     my $TIndex =      $SIndex % 28;
 660     return (
 661         0x1100 + $LIndex,
 662         0x1161 + $VIndex,
 663         $TIndex ? (0x11A7 + $TIndex) : (),
 664     );
 665 }
 666
 667 1;
 668 __END__
 669
 670 =head1 NAME
 671
 672 Unicode::Collate - Unicode Collation Algorithm
 673
 674 =head1 SYNOPSIS
 675
 676   use Unicode::Collate;
 677
 678   #construct
 679   $Collator = Unicode::Collate->new(%tailoring);
 680
 681   #sort
 682   @sorted = $Collator->sort(@not_sorted);
 683
 684   #compare
 685   $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
 686
 687 =head1 DESCRIPTION
 688
 689 =head2 Constructor and Tailoring
 690
 691 The C<new> method returns a collator object.
 692
 693    $Collator = Unicode::Collate->new(
 694       UCA_Version => $UCA_Version,
 695       alternate => $alternate,
 696       backwards => $levelNumber, # or \@levelNumbers
 697       entry => $element,
 698       normalization  => $normalization_form,
 699       ignoreName => qr/$ignoreName/,
 700       ignoreChar => qr/$ignoreChar/,
 701       katakana_before_hiragana => $bool,
 702       level => $collationLevel,
 703       overrideCJK => \&overrideCJK,
 704       overrideHangul => \&overrideHangul,
 705       preprocess => \&preprocess,
 706       rearrange => \@charList,
 707       table => $filename,
 708       undefName => qr/$undefName/,
 709       undefChar => qr/$undefChar/,
 710       upper_before_lower => $bool,
 711    );
 712    # if %tailoring is false (i.e. empty),
 713    # $Collator should do the default collation.
 714
 715 =over 4
 716
 717 =item UCA_Version
 718
 719 If the version number of the older UCA is given,
 720 the older behavior of that version is emulated on collating.
 721 If omitted, the return value of C<UCA_Version()> is used.
 722
 723 The supported version: 8 or 9.
 724
 725 B<This parameter may be removed in the future version,
 726 as switching the algorithm would affect the performance.>
 727
 728 =item alternate
 729
 730 -- see 3.2.2 Alternate Weighting, UTR #10.
 731
 732 This key allows to alternate weighting for variable collation elements,
 733 which are marked with an ASTERISK in the table
 734 (NOTE: Many punction marks and symbols are variable in F<allkeys.txt>).
 735
 736    alternate => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
 737
 738 These names are case-insensitive.
 739 By default (if specification is omitted), 'shifted' is adopted.
 740
 741    'Blanked'        Variable elements are ignorable at levels 1 through 3;
 742                     considered at the 4th level.
 743
 744    'Non-ignorable'  Variable elements are not reset to ignorable.
 745
 746    'Shifted'        Variable elements are ignorable at levels 1 through 3
 747                     their level 4 weight is replaced by the old level 1 weight.
 748                     Level 4 weight for Non-Variable elements is 0xFFFF.
 749
 750    'Shift-Trimmed'  Same as 'shifted', but all FFFF's at the 4th level
 751                     are trimmed.
 752
 753 =item backwards
 754
 755 -- see 3.1.2 French Accents, UTR #10.
 756
 757      backwards => $levelNumber or \@levelNumbers
 758
 759 Weights in reverse order; ex. level 2 (diacritic ordering) in French.
 760 If omitted, forwards at all the levels.
 761
 762 =item entry
 763
 764 -- see 3.1 Linguistic Features; 3.2.1 File Format, UTR #10.
 765
 766 Overrides a default order or defines additional collation elements
 767
 768   entry => <<'ENTRIES', # use the UCA file format
 769 00E6 ; [.0861.0020.0002.00E6] [.08B1.0020.0002.00E6] # ligature <ae> as <a><e>
 770 0063 0068 ; [.0893.0020.0002.0063]      # "ch" in traditional Spanish
 771 0043 0068 ; [.0893.0020.0008.0043]      # "Ch" in traditional Spanish
 772 ENTRIES
 773
 774 =item ignoreName
 775
 776 =item ignoreChar
 777
 778 -- see Completely Ignorable, 3.2.2 Alternate Weighting, UTR #10.
 779
 780 Makes the entry in the table ignorable.
 781 If a collation element is ignorable,
 782 it is ignored as if the element had been deleted from there.
 783
 784 E.g. when 'a' and 'e' are ignorable,
 785 'element' is equal to 'lament' (or 'lmnt').
 786
 787 =item level
 788
 789 -- see 4.3 Form a sort key for each string, UTR #10.
 790
 791 Set the maximum level.
 792 Any higher levels than the specified one are ignored.
 793
 794   Level 1: alphabetic ordering
 795   Level 2: diacritic ordering
 796   Level 3: case ordering
 797   Level 4: tie-breaking (e.g. in the case when alternate is 'shifted')
 798
 799   ex.level => 2,
 800
 801 If omitted, the maximum is the 4th.
 802
 803 =item normalization
 804
 805 -- see 4.1 Normalize each input string, UTR #10.
 806
 807 If specified, strings are normalized before preparation of sort keys
 808 (the normalization is executed after preprocess).
 809
 810 As a form name, one of the following names must be used.
 811
 812   'C'  or 'NFC'  for Normalization Form C
 813   'D'  or 'NFD'  for Normalization Form D
 814   'KC' or 'NFKC' for Normalization Form KC
 815   'KD' or 'NFKD' for Normalization Form KD
 816
 817 If omitted, the string is put into Normalization Form D.
 818
 819 If C<undef> is passed explicitly as the value for this key,
 820 any normalization is not carried out (this may make tailoring easier
 821 if any normalization is not desired).
 822
 823 see B<CAVEAT>.
 824
 825 =item overrideCJK
 826
 827 -- see 7.1 Derived Collation Elements, UTR #10.
 828
 829 By default, mapping of CJK Unified Ideographs
 830 uses the Unicode codepoint order.
 831 But the mapping of CJK Unified Ideographs may be overrided.
 832
 833 ex. CJK Unified Ideographs in the JIS code point order.
 834
 835   overrideCJK => sub {
 836       my $u = shift;             # get a Unicode codepoint
 837       my $b = pack('n', $u);     # to UTF-16BE
 838       my $s = your_unicode_to_sjis_converter($b); # convert
 839       my $n = unpack('n', $s);   # convert sjis to short
 840       [ $n, 0x20, 0x2, $u ];     # return the collation element
 841   },
 842
 843 ex. ignores all CJK Unified Ideographs.
 844
 845   overrideCJK => sub {()}, # CODEREF returning empty list
 846
 847    # where ->eq("Pe\x{4E00}rl", "Perl") is true
 848    # as U+4E00 is a CJK Unified Ideograph and to be ignorable.
 849
 850 If C<undef> is passed explicitly as the value for this key,
 851 weights for CJK Unified Ideographs are treated as undefined.
 852 But assignment of weight for CJK Unified Ideographs
 853 in table or L<entry> is still valid.
 854
 855 =item overrideHangul
 856
 857 -- see 7.1 Derived Collation Elements, UTR #10.
 858
 859 By default, Hangul Syllables are decomposed into Hangul Jamo.
 860 But the mapping of Hangul Syllables may be overrided.
 861
 862 This tag works like L<overrideCJK>, so see there for examples.
 863
 864 If you want to override the mapping of Hangul Syllables,
 865 the Normalization Forms D and KD are not appropriate
 866 (they will be decomposed before overriding).
 867
 868 If C<undef> is passed explicitly as the value for this key,
 869 weight for Hangul Syllables is treated as undefined
 870 without decomposition into Hangul Jamo.
 871 But definition of weight for Hangul Syllables
 872 in table or L<entry> is still valid.
 873
 874 =item preprocess
 875
 876 -- see 5.1 Preprocessing, UTR #10.
 877
 878 If specified, the coderef is used to preprocess
 879 before the formation of sort keys.
 880
 881 ex. dropping English articles, such as "a" or "the".
 882 Then, "the pen" is before "a pencil".
 883
 884      preprocess => sub {
 885            my $str = shift;
 886            $str =~ s/\b(?:an?|the)\s+//gi;
 887            $str;
 888         },
 889
 890 =item rearrange
 891
 892 -- see 3.1.3 Rearrangement, UTR #10.
 893
 894 Characters that are not coded in logical order and to be rearranged.
 895 By default,
 896
 897     rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
 898
 899 If you want to disallow any rearrangement,
 900 pass C<undef> or C<[]> (a reference to an empty list)
 901 as the value for this key.
 902
 903 B<According to the version 9 of UCA, this parameter shall not be used;
 904 but it is not warned at present.>
 905
 906 =item table
 907
 908 -- see 3.2 Default Unicode Collation Element Table, UTR #10.
 909
 910 You can use another element table if desired.
 911 The table file must be in your C<lib/Unicode/Collate> directory.
 912
 913 By default, the file C<lib/Unicode/Collate/allkeys.txt> is used.
 914
 915 If C<undef> is passed explicitly as the value for this key,
 916 no file is read (but you can define collation elements via L<entry>).
 917
 918 A typical way to define a collation element table
 919 without any file of table:
 920
 921    $onlyABC = Unicode::Collate->new(
 922        table => undef,
 923        entry => << 'ENTRIES',
 924 0061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A
 925 0041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A
 926 0062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B
 927 0042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B
 928 0063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C
 929 0043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C
 930 ENTRIES
 931     );
 932
 933 =item undefName
 934
 935 =item undefChar
 936
 937 -- see 6.3.4 Reducing the Repertoire, UTR #10.
 938
 939 Undefines the collation element as if it were unassigned in the table.
 940 This reduces the size of the table.
 941 If an unassigned character appears in the string to be collated,
 942 the sort key is made from its codepoint
 943 as a single-character collation element,
 944 as it is greater than any other assigned collation elements
 945 (in the codepoint order among the unassigned characters).
 946 But, it'd be better to ignore characters
 947 unfamiliar to you and maybe never used.
 948
 949 =item katakana_before_hiragana
 950
 951 =item upper_before_lower
 952
 953 -- see 6.6 Case Comparisons; 7.3.1 Tertiary Weight Table, UTR #10.
 954
 955 By default, lowercase is before uppercase
 956 and hiragana is before katakana.
 957
 958 If the tag is made true, this is reversed.
 959
 960 B<NOTE>: These tags simplemindedly assume
 961 any lowercase/uppercase or hiragana/katakana distinctions
 962 should occur in level 3, and their weights at level 3
 963 should be same as those mentioned in 7.3.1, UTR #10.
 964 If you define your collation elements which violates this,
 965 these tags doesn't work validly.
 966
 967 =back
 968
 969 =head2 Methods for Collation
 970
 971 =over 4
 972
 973 =item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
 974
 975 Sorts a list of strings.
 976
 977 =item C<$result = $Collator-E<gt>cmp($a, $b)>
 978
 979 Returns 1 (when C<$a> is greater than C<$b>)
 980 or 0 (when C<$a> is equal to C<$b>)
 981 or -1 (when C<$a> is lesser than C<$b>).
 982
 983 =item C<$result = $Collator-E<gt>eq($a, $b)>
 984
 985 =item C<$result = $Collator-E<gt>ne($a, $b)>
 986
 987 =item C<$result = $Collator-E<gt>lt($a, $b)>
 988
 989 =item C<$result = $Collator-E<gt>le($a, $b)>
 990
 991 =item C<$result = $Collator-E<gt>gt($a, $b)>
 992
 993 =item C<$result = $Collator-E<gt>ge($a, $b)>
 994
 995 They works like the same name operators as theirs.
 996
 997    eq : whether $a is equal to $b.
 998    ne : whether $a is not equal to $b.
 999    lt : whether $a is lesser than $b.
1000    le : whether $a is lesser than $b or equal to $b.
1001    gt : whether $a is greater than $b.
1002    ge : whether $a is greater than $b or equal to $b.
1003
1004 =item C<$sortKey = $Collator-E<gt>getSortKey($string)>
1005
1006 -- see 4.3 Form a sort key for each string, UTR #10.
1007
1008 Returns a sort key.
1009
1010 You compare the sort keys using a binary comparison
1011 and get the result of the comparison of the strings using UCA.
1012
1013    $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
1014
1015       is equivalent to
1016
1017    $Collator->cmp($a, $b)
1018
1019 =item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)>
1020
1021    use Unicode::Collate;
1022    my $c = Unicode::Collate->new();
1023    print $c->viewSortKey("Perl"),"\n";
1024
1025    # output:
1026    # [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF]
1027    #  Level 1               Level 2               Level 3               Level 4
1028
1029     (If C<UCA_Version> is 8, the output is slightly different.)
1030
1031 =item C<$position = $Collator-E<gt>index($string, $substring)>
1032
1033 =item C<($position, $length) = $Collator-E<gt>index($string, $substring)>
1034
1035 -- see 6.8 Searching, UTR #10.
1036
1037 If C<$substring> matches a part of C<$string>, returns
1038 the position of the first occurrence of the matching part in scalar context;
1039 in list context, returns a two-element list of
1040 the position and the length of the matching part.
1041
1042 B<Notice> that the length of the matching part may differ from
1043 the length of C<$substring>.
1044
1045 B<Note> that the position and the length are counted on the string
1046 after the process of preprocess, normalization, and rearrangement.
1047 Therefore, in case the specified string is not binary equal to
1048 the preprocessed/normalized/rearranged string, the position and the length
1049 may differ form those on the specified string. But it is guaranteed
1050 that, if matched, it returns a non-negative value as C<$position>.
1051
1052 If C<$substring> does not match any part of C<$string>,
1053 returns C<-1> in scalar context and
1054 an empty list in list context.
1055
1056 e.g. you say
1057
1058   my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1059   my $str = "Ich mu\x{00DF} studieren.";
1060   my $sub = "m\x{00FC}ss";
1061   my $match;
1062   if (my($pos,$len) = $Collator->index($str, $sub)) {
1063       $match = substr($str, $pos, $len);
1064   }
1065
1066 and get C<"mu\x{00DF}"> in C<$match> since C<"mu>E<223>C<">
1067 is primary equal to C<"m>E<252>C<ss">.
1068
1069 =back
1070
1071 =head2 Other Methods
1072
1073 =over 4
1074
1075 =item C<%old_tailoring = $Collator-E<gt>change(%new_tailoring)>
1076
1077 Change the value of specified keys and returns the changed part.
1078
1079     $Collator = Unicode::Collate->new(level => 4);
1080
1081     $Collator->eq("perl", "PERL"); # false
1082
1083     %old = $Collator->change(level => 2); # returns (level => 4).
1084
1085     $Collator->eq("perl", "PERL"); # true
1086
1087     $Collator->change(%old); # returns (level => 2).
1088
1089     $Collator->eq("perl", "PERL"); # false
1090
1091 Not all C<(key,value)>s are allowed to be changed.
1092 See also C<@Unicode::Collate::ChangeOK> and C<@Unicode::Collate::ChangeNG>.
1093
1094 In the scalar context, returns the modified collator
1095 (but it is B<not> a clone from the original).
1096
1097     $Collator->change(level => 2)->eq("perl", "PERL"); # true
1098
1099     $Collator->eq("perl", "PERL"); # true; now max level is 2nd.
1100
1101     $Collator->change(level => 4)->eq("perl", "PERL"); # false
1102
1103 =item UCA_Version
1104
1105 Returns the version number of Unicode Technical Standard 10
1106 this module consults.
1107
1108 =item Base_Unicode_Version
1109
1110 Returns the version number of the Unicode Standard
1111 this module is based on.
1112
1113 =back
1114
1115 =head2 EXPORT
1116
1117 None by default.
1118
1119 =head2 TODO
1120
1121 Unicode::Collate has not been ported to EBCDIC.  The code mostly would
1122 work just fine but a decision needs to be made: how the module should
1123 work in EBCDIC?  Should the low 256 characters be understood as
1124 Unicode or as EBCDIC code points?  Should one be chosen or should
1125 there be a way to do either?  Or should such translation be left
1126 outside the module for the user to do, for example by using
1127 Encode::from_to()?
1128 (or utf8::unicode_to_native()/utf8::native_to_unicode()?)
1129
1130 =head2 CAVEAT
1131
1132 Use of the C<normalization> parameter requires
1133 the B<Unicode::Normalize> module.
1134
1135 If you need not it (say, in the case when you need not
1136 handle any combining characters),
1137 assign C<normalization =E<gt> undef> explicitly.
1138
1139 -- see 6.5 Avoiding Normalization, UTR #10.
1140
1141 =head2 Conformance Test
1142
1143 The Conformance Test for the UCA is provided
1144 in L<http://www.unicode.org/reports/tr10/CollationTest.html>
1145 and L<http://www.unicode.org/reports/tr10/CollationTest.zip>
1146
1147 For F<CollationTest_SHIFTED.txt>,
1148 a collator via C<Unicode::Collate-E<gt>new( )> should be used;
1149 for F<CollationTest_NON_IGNORABLE.txt>, a collator via
1150 C<Unicode::Collate-E<gt>new(alternate =E<gt> "non-ignorable", level =E<gt> 3)>.
1151
1152 B<Unicode::Normalize is required to try this test.>
1153
1154 =head2 BUGS
1155
1156 C<index()> is an experimental method and
1157 its return value may be unreliable.
1158 The correct implementation for C<index()> must be based
1159 on Locale-Sensitive Support: Level 3 in UTR #18,
1160 F<Unicode Regular Expression Guidelines>.
1161
1162 See also 4.2 Locale-Dependent Graphemes in UTR #18.
1163
1164 =head1 AUTHOR
1165
1166 SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
1167
1168   http://homepage1.nifty.com/nomenclator/perl/
1169
1170   Copyright(C) 2001-2002, SADAHIRO Tomoyuki. Japan. All rights reserved.
1171
1172   This library is free software; you can redistribute it
1173   and/or modify it under the same terms as Perl itself.
1174
1175 =head1 SEE ALSO
1176
1177 =over 4
1178
1179 =item http://www.unicode.org/reports/tr10/
1180
1181 Unicode Collation Algorithm - UTR #10
1182
1183 =item http://www.unicode.org/reports/tr10/allkeys.txt
1184
1185 The Default Unicode Collation Element Table
1186
1187 =item http://www.unicode.org/reports/tr10/CollationTest.html
1188 http://www.unicode.org/reports/tr10/CollationTest.zip
1189
1190 The latest versions of the conformance test for the UCA
1191
1192 =item http://www.unicode.org/reports/tr15/
1193
1194 Unicode Normalization Forms - UAX #15
1195
1196 =item http://www.unicode.org/reports/tr18
1197
1198 Unicode Regular Expression Guidelines - UTR #18
1199
1200 =item L<Unicode::Normalize>
1201
1202 =back
1203
1204 =cut