lib/Unicode/Collate.pm

   1 package Unicode::Collate;
   2
   3 BEGIN {
   4     if (ord("A") == 193) {
   5         die "Unicode::Collate not ported to EBCDIC\n";
   6     }
   7 }
   8
   9 use 5.006;
  10 use strict;
  11 use warnings;
  12 use Carp;
  13 use File::Spec;
  14
  15 require Exporter;
  16
  17 our $VERSION = '0.21';
  18 our $PACKAGE = __PACKAGE__;
  19
  20 our @ISA = qw(Exporter);
  21
  22 our %EXPORT_TAGS = ();
  23 our @EXPORT_OK = ();
  24 our @EXPORT = ();
  25
  26 (our $Path = $INC{'Unicode/Collate.pm'}) =~ s/\.pm$//;
  27 our $KeyFile = "allkeys.txt";
  28
  29 our $UNICODE_VERSION;
  30
  31 eval { require Unicode::UCD };
  32
  33 unless ($@) {
  34     $UNICODE_VERSION = Unicode::UCD::UnicodeVersion();
  35 }
  36 else { # XXX, Perl 5.6.1
  37     my($f, $fh);
  38     foreach my $d (@INC) {
  39         $f = File::Spec->catfile($d, "unicode", "Unicode.301");
  40         if (open($fh, $f)) {
  41             $UNICODE_VERSION = '3.0.1';
  42             close $fh;
  43             last;
  44         }
  45     }
  46 }
  47
  48 our $getCombinClass; # coderef for combining class from Unicode::Normalize
  49
  50 use constant Min2   => 0x20;    # minimum weight at level 2
  51 use constant Min3   => 0x02;    # minimum weight at level 3
  52
  53 # format for pack
  54 use constant VCE_FORMAT => 'Cn4'; # for variable + CE with 4 levels
  55
  56 # values of variable
  57 use constant NON_VAR => 0; # Non-Variable character
  58 use constant VAR     => 1; # Variable character
  59
  60 our $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
  61
  62 sub UCA_Version { "9" }
  63
  64 sub Base_Unicode_Version { $UNICODE_VERSION || 'unknown' }
  65
  66 my (%AlternateOK);
  67 @AlternateOK{ qw/
  68     blanked  non-ignorable  shifted  shift-trimmed
  69   / } = ();
  70
  71 our @ChangeOK = qw/
  72     alternate backwards level normalization rearrange
  73     katakana_before_hiragana upper_before_lower
  74     overrideHangul overrideCJK preprocess UCA_Version
  75   /;
  76
  77 our @ChangeNG = qw/
  78     entry entries table combining maxlength
  79     ignoreChar ignoreName undefChar undefName
  80     versionTable alternateTable backwardsTable forwardsTable rearrangeTable
  81     derivCode normCode rearrangeHash isShift L3ignorable
  82   /;
  83 # The hash key 'ignored' is deleted at VERSION 0.21.
  84
  85 my (%ChangeOK, %ChangeNG);
  86 @ChangeOK{ @ChangeOK } = ();
  87 @ChangeNG{ @ChangeNG } = ();
  88
  89 sub change {
  90     my $self = shift;
  91     my %hash = @_;
  92     my %old;
  93     foreach my $k (keys %hash) {
  94         if (exists $ChangeOK{$k}) {
  95             $old{$k} = $self->{$k};
  96             $self->{$k} = $hash{$k};
  97         }
  98         elsif (exists $ChangeNG{$k}) {
  99             croak "change of $k via change() is not allowed!";
 100         }
 101         # else => ignored
 102     }
 103     $self->checkCollator;
 104     return wantarray ? %old : $self;
 105 }
 106
 107 sub checkCollator {
 108     my $self = shift;
 109     croak "Illegal level lower than 1 (passed $self->{level})."
 110         if $self->{level} < 1;
 111     croak "A level higher than 4 (passed $self->{level}) is not supported."
 112         if 4 < $self->{level};
 113
 114     $self->{derivCode} =
 115         $self->{UCA_Version} == -1 ? \&broken_derivCE :
 116         $self->{UCA_Version} ==  8 ? \&derivCE_8 :
 117         $self->{UCA_Version} ==  9 ? \&derivCE_9 :
 118       croak "Illegal UCA version (passed $self->{UCA_Version}).";
 119
 120     $self->{alternate} = lc($self->{alternate});
 121     croak "$PACKAGE unknown alternate tag name: $self->{alternate}"
 122         unless exists $AlternateOK{ $self->{alternate} };
 123
 124     $self->{isShift} = $self->{alternate} eq 'shifted' ||
 125                 $self->{alternate} eq 'shift-trimmed';
 126
 127     $self->{backwards} = []
 128         if ! defined $self->{backwards};
 129     $self->{backwards} = [ $self->{backwards} ]
 130         if ! ref $self->{backwards};
 131
 132     $self->{rearrange} = []
 133         if ! defined $self->{rearrange};
 134     croak "$PACKAGE: A list for rearrangement must be store in an ARRAYREF"
 135         if ! ref $self->{rearrange};
 136
 137     # keys of $self->{rearrangeHash} are $self->{rearrange}.
 138     $self->{rearrangeHash} = undef;
 139
 140     if (@{ $self->{rearrange} }) {
 141         @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
 142     }
 143
 144     $self->{normCode} = undef;
 145
 146     if (defined $self->{normalization}) {
 147         eval { require Unicode::Normalize };
 148         croak "Unicode/Normalize.pm is required to normalize strings: $@"
 149             if $@;
 150
 151         Unicode::Normalize->import();
 152         $getCombinClass = \&Unicode::Normalize::getCombinClass
 153             if ! $getCombinClass;
 154
 155         $self->{normCode} =
 156             $self->{normalization} =~ /^(?:NF)?C$/  ? \&NFC :
 157             $self->{normalization} =~ /^(?:NF)?D$/  ? \&NFD :
 158             $self->{normalization} =~ /^(?:NF)?KC$/ ? \&NFKC :
 159             $self->{normalization} =~ /^(?:NF)?KD$/ ? \&NFKD :
 160           croak "$PACKAGE unknown normalization form name: "
 161                 . $self->{normalization};
 162     }
 163     return;
 164 }
 165
 166 sub new
 167 {
 168     my $class = shift;
 169     my $self = bless { @_ }, $class;
 170
 171     # If undef is passed explicitly, no file is read.
 172     $self->{table} = $KeyFile if ! exists $self->{table};
 173     $self->read_table if defined $self->{table};
 174
 175     if ($self->{entry}) {
 176         $self->parseEntry($_) foreach split /\n/, $self->{entry};
 177     }
 178
 179     $self->{level} ||= 4;
 180     $self->{UCA_Version} ||= UCA_Version();
 181
 182     $self->{overrideHangul} = ''
 183         if ! exists $self->{overrideHangul};
 184     $self->{overrideCJK} = ''
 185         if ! exists $self->{overrideCJK};
 186     $self->{normalization} = 'D'
 187         if ! exists $self->{normalization};
 188     $self->{alternate} = $self->{alternateTable} || 'shifted'
 189         if ! exists $self->{alternate};
 190     $self->{rearrange} = $self->{rearrangeTable} || $DefaultRearrange
 191         if ! exists $self->{rearrange};
 192     $self->{backwards} = $self->{backwardsTable}
 193         if ! exists $self->{backwards};
 194
 195     $self->checkCollator;
 196
 197     return $self;
 198 }
 199
 200 sub read_table {
 201     my $self = shift;
 202     my $file = $self->{table} ne '' ? $self->{table} : $KeyFile;
 203
 204     my $filepath = File::Spec->catfile($Path, $file);
 205     open my $fk, "<$filepath"
 206         or croak "File does not exist at $filepath";
 207
 208     while (<$fk>) {
 209         next if /^\s*#/;
 210         if (/^\s*\@/) {
 211             if    (/^\s*\@version\s*(\S*)/) {
 212                 $self->{versionTable} ||= $1;
 213             }
 214             elsif (/^\s*\@alternate\s+(\S*)/) {
 215                 $self->{alternateTable} ||= $1;
 216             }
 217             elsif (/^\s*\@backwards\s+(\S*)/) {
 218                 push @{ $self->{backwardsTable} }, $1;
 219             }
 220             elsif (/^\s*\@forwards\s+(\S*)/) { # parhaps no use
 221                 push @{ $self->{forwardsTable} }, $1;
 222             }
 223             elsif (/^\s*\@rearrange\s+(.*)/) { # (\S*) is NG
 224                 push @{ $self->{rearrangeTable} }, _getHexArray($1);
 225             }
 226             next;
 227         }
 228         $self->parseEntry($_);
 229     }
 230     close $fk;
 231 }
 232
 233
 234 ##
 235 ## get $line, parse it, and write an entry in $self
 236 ##
 237 sub parseEntry
 238 {
 239     my $self = shift;
 240     my $line = shift;
 241     my($name, $ele, @key);
 242
 243     return if $line !~ /^\s*[0-9A-Fa-f]/;
 244
 245     # removes comment and gets name
 246     $name = $1
 247         if $line =~ s/[#%]\s*(.*)//;
 248     return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
 249
 250     # gets element
 251     my($e, $k) = split /;/, $line;
 252     croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
 253         if ! $k;
 254
 255     my @e = _getHexArray($e);
 256     return if !@e;
 257
 258     $ele = pack('U*', @e);
 259     return if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
 260
 261     my $combining = 1; # primary = 0, secondary != 0;
 262     my $level3ignore;
 263
 264     # replace with completely ignorable
 265     if (defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/ ||
 266         defined $self->{ignoreChar} && $ele  =~ /$self->{ignoreChar}/)
 267     {
 268         $k = '[.0000.0000.0000.0000]';
 269     }
 270
 271     foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
 272         my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
 273         my @arr = _getHexArray($arr);
 274         push @key, pack(VCE_FORMAT, $var, @arr);
 275         $combining = 0 unless $arr[0] == 0 && $arr[1] != 0;
 276         $level3ignore = 1 if $arr[0] == 0 && $arr[1] == 0 && $arr[2] == 0;
 277     }
 278
 279     $self->{entries}{$ele} = \@key;
 280
 281     $self->{combining}{$ele} = 1
 282         if $combining;
 283
 284     $self->{L3ignorable}{$e[0]} = 1
 285         if @e == 1 && $level3ignore;
 286
 287     $self->{maxlength}{ord $ele} = scalar @e if @e > 1;
 288 }
 289
 290 ##
 291 ## arrayref CE = altCE(bool variable?, list[num] weights)
 292 ##
 293 sub altCE
 294 {
 295     my $self = shift;
 296     my($var, @c) = unpack(VCE_FORMAT, shift);
 297
 298     $self->{alternate} eq 'blanked' ?
 299         $var ? [0,0,0,$c[3]] : \@c :
 300     $self->{alternate} eq 'non-ignorable' ?
 301         \@c :
 302     $self->{alternate} eq 'shifted' ?
 303         $var ? [0,0,0,$c[0] ] : [ @c[0..2], $c[0]+$c[1]+$c[2] ? 0xFFFF : 0 ] :
 304     $self->{alternate} eq 'shift-trimmed' ?
 305         $var ? [0,0,0,$c[0] ] : [ @c[0..2], 0 ] :
 306         croak "$PACKAGE unknown alternate name: $self->{alternate}";
 307 }
 308
 309 sub viewSortKey
 310 {
 311     my $self = shift;
 312     my $ver = $self->{UCA_Version};
 313
 314     my $key  = $self->getSortKey(@_);
 315     my $view = join " ", map sprintf("%04X", $_), unpack 'n*', $key;
 316     if ($ver <= 8) {
 317         $view =~ s/ ?0000 ?/|/g;
 318     } else {
 319         $view =~ s/\b0000\b/|/g;
 320     }
 321     return "[$view]";
 322 }
 323
 324
 325 ##
 326 ## list[strings] elements = splitCE(string arg)
 327 ##
 328 sub splitCE
 329 {
 330     my $self = shift;
 331     my $code = $self->{preprocess};
 332     my $norm = $self->{normCode};
 333     my $ent  = $self->{entries};
 334     my $max  = $self->{maxlength};
 335     my $reH  = $self->{rearrangeHash};
 336     my $L3i  = $self->{L3ignorable};
 337     my $ver9 = $self->{UCA_Version} > 8;
 338
 339     my $str = ref $code ? &$code(shift) : shift;
 340     $str = &$norm($str) if ref $norm;
 341
 342     my @src = unpack('U*', $str);
 343     my @buf;
 344
 345     # rearrangement
 346     if ($reH) {
 347         for (my $i = 0; $i < @src; $i++) {
 348             if (exists $reH->{ $src[$i] } && $i + 1 < @src) {
 349                 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]);
 350                 $i++;
 351             }
 352         }
 353     }
 354
 355     if ($ver9) {
 356         @src = grep ! $L3i->{$_}, @src;
 357     }
 358
 359     for (my $i = 0; $i < @src; $i++) {
 360         my $ch;
 361         my $u = $src[$i];
 362
 363         # non-characters
 364         next if ! defined $u
 365             || ($u < 0 || 0x10FFFF < $u)      # out of range
 366             || (($u & 0xFFFE) == 0xFFFE)      # ??FFFE or ??FFFF (cf. utf8.c)
 367             || (0xD800 <= $u && $u <= 0xDFFF) # unpaired surrogates
 368             || (0xFDD0 <= $u && $u <= 0xFDEF) # non-character
 369         ;
 370
 371         if ($max->{$u}) { # contract
 372             for (my $j = $max->{$u}; $j >= 1; $j--) {
 373                 next unless $i+$j-1 < @src;
 374                 $ch = pack 'U*', @src[$i .. $i+$j-1];
 375                 $i += $j-1, last if $ent->{$ch};
 376             }
 377         } else {
 378             $ch = pack('U', $u);
 379         }
 380
 381         # with Combining Char (UTS#10, 4.2.1), here requires Unicode::Normalize.
 382         if ($getCombinClass && defined $ch) {
 383             for (my $j = $i+1; $j < @src; $j++) {
 384                 next unless defined $src[$j];
 385                 last unless $getCombinClass->( $src[$j] );
 386                 my $comb = pack 'U', $src[$j];
 387                 next if ! $ent->{ $ch.$comb };
 388                 $ch .= $comb;
 389                 $src[$j] = undef;
 390             }
 391         }
 392         push @buf, $ch;
 393     }
 394     wantarray ? @buf : \@buf;
 395 }
 396
 397
 398 ##
 399 ## list[arrayrefs] weight = getWt(string element)
 400 ##
 401 sub getWt
 402 {
 403     my $self = shift;
 404     my $ch   = shift;
 405     my $ent  = $self->{entries};
 406     my $cjk  = $self->{overrideCJK};
 407     my $hang = $self->{overrideHangul};
 408     my $der  = $self->{derivCode};
 409
 410     return if !defined $ch;
 411     return map($self->altCE($_), @{ $ent->{$ch} })
 412         if $ent->{$ch};
 413
 414     my $u = unpack('U', $ch);
 415
 416     if (0xAC00 <= $u && $u <= 0xD7A3) { # is_Hangul
 417         return map $self->altCE($_),
 418             $hang
 419                 ? map(pack(VCE_FORMAT, NON_VAR, @$_), &$hang($u))
 420                 : defined $hang
 421                     ? map({
 422                             my $v = $_;
 423                             my $vCE = $ent->{pack('U', $v)};
 424                             $vCE ? @$vCE : $der->($v);
 425                         } _decompHangul($u))
 426                     : $der->($u);
 427     }
 428     elsif (0x3400 <= $u && $u <= 0x4DB5 ||
 429            0x4E00 <= $u && $u <= 0x9FA5 ||
 430            0x20000 <= $u && $u <= 0x2A6D6) { # CJK Ideograph
 431         return map $self->altCE($_),
 432             $cjk
 433                 ? map(pack(VCE_FORMAT, NON_VAR, @$_), &$cjk($u))
 434                 : defined $cjk && $self->{UCA_Version} <= 8 && $u < 0x10000
 435                     ? pack(VCE_FORMAT, NON_VAR, $u, 0x20, 0x02, $u)
 436                     : $der->($u);
 437     }
 438     else {
 439         return map $self->altCE($_), $der->($u);
 440     }
 441 }
 442
 443 ##
 444 ## int = index(string, substring)
 445 ##
 446 sub index
 447 {
 448     my $self = shift;
 449     my $lev  = $self->{level};
 450     my $comb = $self->{combining};
 451     my $str  = $self->splitCE(shift);
 452     my $sub  = $self->splitCE(shift);
 453
 454     return wantarray ? (0,0) : 0 if ! @$sub;
 455     return wantarray ?  ()  : -1 if ! @$str;
 456
 457     my @subWt = grep _ignorableAtLevel($_,$lev),
 458                 map $self->getWt($_), @$sub;
 459
 460     my(@strWt,@strPt);
 461     my $count = 0;
 462     for (my $i = 0; $i < @$str; $i++) {
 463         my $go_ahead = 0;
 464
 465         my @tmp = grep _ignorableAtLevel($_,$lev), $self->getWt($str->[$i]);
 466         $go_ahead += length $str->[$i];
 467
 468         # /*XXX*/ still broken.
 469         # index("e\x{300}", "e") should be 'no match' at level 2 or higher
 470         # as "e\x{300}" is a *single* grapheme cluster and not equal to "e".
 471
 472         # go ahead as far as we find a combining character;
 473         while ($i + 1 < @$str &&
 474               (! defined $str->[$i+1] || $comb->{ $str->[$i+1] }) ) {
 475             $i++;
 476             next if ! defined $str->[$i];
 477             $go_ahead += length $str->[$i];
 478             push @tmp,
 479                 grep _ignorableAtLevel($_,$lev), $self->getWt($str->[$i]);
 480         }
 481
 482         push @strWt, @tmp;
 483         push @strPt, ($count) x @tmp;
 484         $count += $go_ahead;
 485
 486         while (@strWt >= @subWt) {
 487             if (_eqArray(\@strWt, \@subWt, $lev)) {
 488                 my $pos = $strPt[0];
 489                 return wantarray ? ($pos, $count-$pos) : $pos;
 490             }
 491             shift @strWt;
 492             shift @strPt;
 493         }
 494     }
 495     return wantarray ? () : -1;
 496 }
 497
 498 ##
 499 ## bool _eqArray(arrayref, arrayref, level)
 500 ##
 501 sub _eqArray($$$)
 502 {
 503     my $a   = shift; # length $a >= length $b;
 504     my $b   = shift;
 505     my $lev = shift;
 506     for my $v (0..$lev-1) {
 507         for my $c (0..@$b-1){
 508             return if $a->[$c][$v] != $b->[$c][$v];
 509         }
 510     }
 511     return 1;
 512 }
 513
 514
 515 ##
 516 ## bool _ignorableAtLevel(CE, level)
 517 ##
 518 sub _ignorableAtLevel($$)
 519 {
 520     my $ce = shift;
 521     return unless defined $ce;
 522     my $lv = shift;
 523     return ! grep { ! $ce->[$_] } 0..$lv-1;
 524 }
 525
 526
 527 ##
 528 ## string sortkey = getSortKey(string arg)
 529 ##
 530 sub getSortKey
 531 {
 532     my $self = shift;
 533     my $lev  = $self->{level};
 534     my $rCE  = $self->splitCE(shift); # get an arrayref
 535     my $ver9 = $self->{UCA_Version} > 8;
 536     my $sht  = $self->{isShift};
 537
 538     # weight arrays
 539     my (@buf, $last_is_variable);
 540
 541     foreach my $ce (@$rCE) {
 542         my @t = $self->getWt($ce);
 543         if ($sht && $ver9) {
 544             if (@t == 1 && $t[0][0] == 0) {
 545                 if ($t[0][1] == 0 && $t[0][2] == 0) {
 546                     $last_is_variable = 1;
 547                 } else {
 548                     next if $last_is_variable;
 549                 }
 550             } else {
 551                 $last_is_variable = 0;
 552             }
 553         }
 554         push @buf, @t;
 555     }
 556
 557     # make sort key
 558     my @ret = ([],[],[],[]);
 559     foreach my $v (0..$lev-1) {
 560         foreach my $b (@buf) {
 561             push @{ $ret[$v] }, $b->[$v] if $b->[$v];
 562         }
 563     }
 564     foreach (@{ $self->{backwards} }) {
 565         my $v = $_ - 1;
 566         @{ $ret[$v] } = reverse @{ $ret[$v] };
 567     }
 568
 569     # modification of tertiary weights
 570     if ($self->{upper_before_lower}) {
 571         foreach (@{ $ret[2] }) {
 572             if    (0x8 <= $_ && $_ <= 0xC) { $_ -= 6 } # lower
 573             elsif (0x2 <= $_ && $_ <= 0x6) { $_ += 6 } # upper
 574             elsif ($_ == 0x1C)             { $_ += 1 } # square upper
 575             elsif ($_ == 0x1D)             { $_ -= 1 } # square lower
 576         }
 577     }
 578     if ($self->{katakana_before_hiragana}) {
 579         foreach (@{ $ret[2] }) {
 580             if    (0x0F <= $_ && $_ <= 0x13) { $_ -= 2 } # katakana
 581             elsif (0x0D <= $_ && $_ <= 0x0E) { $_ += 5 } # hiragana
 582         }
 583     }
 584     join "\0\0", map pack('n*', @$_), @ret;
 585 }
 586
 587
 588 ##
 589 ## int compare = cmp(string a, string b)
 590 ##
 591 sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
 592 sub eq  { $_[0]->getSortKey($_[1]) eq  $_[0]->getSortKey($_[2]) }
 593 sub ne  { $_[0]->getSortKey($_[1]) ne  $_[0]->getSortKey($_[2]) }
 594 sub lt  { $_[0]->getSortKey($_[1]) lt  $_[0]->getSortKey($_[2]) }
 595 sub le  { $_[0]->getSortKey($_[1]) le  $_[0]->getSortKey($_[2]) }
 596 sub gt  { $_[0]->getSortKey($_[1]) gt  $_[0]->getSortKey($_[2]) }
 597 sub ge  { $_[0]->getSortKey($_[1]) ge  $_[0]->getSortKey($_[2]) }
 598
 599 ##
 600 ## list[strings] sorted = sort(list[strings] arg)
 601 ##
 602 sub sort {
 603     my $obj = shift;
 604     return
 605         map { $_->[1] }
 606             sort{ $a->[0] cmp $b->[0] }
 607                 map [ $obj->getSortKey($_), $_ ], @_;
 608 }
 609
 610
 611 sub derivCE_9 {
 612     my $u = shift;
 613     my $base =
 614         (0x4E00 <= $u && $u <= 0x9FA5) # CJK
 615             ? 0xFB40 :
 616         (0x3400 <= $u && $u <= 0x4DB5 || 0x20000 <= $u && $u <= 0x2A6D6)
 617             ? 0xFB80 : 0xFBC0;
 618
 619     my $aaaa = $base + ($u >> 15);
 620     my $bbbb = ($u & 0x7FFF) | 0x8000;
 621     return
 622         pack(VCE_FORMAT, NON_VAR, $aaaa, Min2, Min3, $u),
 623         pack(VCE_FORMAT, NON_VAR, $bbbb,    0,    0, $u);
 624 }
 625
 626 sub derivCE_8 {
 627     my $code = shift;
 628     my $aaaa =  0xFF80 + ($code >> 15);
 629     my $bbbb = ($code & 0x7FFF) | 0x8000;
 630     return
 631         pack(VCE_FORMAT, NON_VAR, $aaaa, 2, 1, $code),
 632         pack(VCE_FORMAT, NON_VAR, $bbbb, 0, 0, $code);
 633 }
 634
 635 sub broken_derivCE { # NG
 636     my $code = shift;
 637     my $aaaa = 0xFFC2 + ($code >> 15);
 638     my $bbbb = $code & 0x7FFF | 0x1000;
 639     return
 640         pack(VCE_FORMAT, NON_VAR, $aaaa, 2, 1, $code),
 641         pack(VCE_FORMAT, NON_VAR, $bbbb, 0, 0, $code);
 642 }
 643
 644 ##
 645 ## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
 646 ##
 647 sub _getHexArray { map hex, $_[0] =~ /([0-9a-fA-F]+)/g }
 648
 649 #
 650 # $code must be in Hangul syllable.
 651 # Check it before you enter here.
 652 #
 653 sub _decompHangul {
 654     my $code = shift;
 655     my $SIndex = $code - 0xAC00;
 656     my $LIndex = int( $SIndex / 588);
 657     my $VIndex = int(($SIndex % 588) / 28);
 658     my $TIndex =      $SIndex % 28;
 659     return (
 660         0x1100 + $LIndex,
 661         0x1161 + $VIndex,
 662         $TIndex ? (0x11A7 + $TIndex) : (),
 663     );
 664 }
 665
 666 1;
 667 __END__
 668
 669 =head1 NAME
 670
 671 Unicode::Collate - Unicode Collation Algorithm
 672
 673 =head1 SYNOPSIS
 674
 675   use Unicode::Collate;
 676
 677   #construct
 678   $Collator = Unicode::Collate->new(%tailoring);
 679
 680   #sort
 681   @sorted = $Collator->sort(@not_sorted);
 682
 683   #compare
 684   $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
 685
 686 =head1 DESCRIPTION
 687
 688 =head2 Constructor and Tailoring
 689
 690 The C<new> method returns a collator object.
 691
 692    $Collator = Unicode::Collate->new(
 693       UCA_Version => $UCA_Version,
 694       alternate => $alternate,
 695       backwards => $levelNumber, # or \@levelNumbers
 696       entry => $element,
 697       normalization  => $normalization_form,
 698       ignoreName => qr/$ignoreName/,
 699       ignoreChar => qr/$ignoreChar/,
 700       katakana_before_hiragana => $bool,
 701       level => $collationLevel,
 702       overrideCJK => \&overrideCJK,
 703       overrideHangul => \&overrideHangul,
 704       preprocess => \&preprocess,
 705       rearrange => \@charList,
 706       table => $filename,
 707       undefName => qr/$undefName/,
 708       undefChar => qr/$undefChar/,
 709       upper_before_lower => $bool,
 710    );
 711    # if %tailoring is false (i.e. empty),
 712    # $Collator should do the default collation.
 713
 714 =over 4
 715
 716 =item UCA_Version
 717
 718 If the version number of the older UCA is given,
 719 the older behavior of that version is emulated on collating.
 720 If omitted, the return value of C<UCA_Version()> is used.
 721
 722 The supported version: 8 or 9.
 723
 724 B<This parameter may be removed in the future version,
 725 as switching the algorithm would affect the performance.>
 726
 727 =item alternate
 728
 729 -- see 3.2.2 Variable Weighting, UTR #10.
 730
 731 (the title in UCA version 8: Alternate Weighting)
 732
 733 This key allows to alternate weighting for variable collation elements,
 734 which are marked with an ASTERISK in the table
 735 (NOTE: Many punction marks and symbols are variable in F<allkeys.txt>).
 736
 737    alternate => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
 738
 739 These names are case-insensitive.
 740 By default (if specification is omitted), 'shifted' is adopted.
 741
 742    'Blanked'        Variable elements are ignorable at levels 1 through 3;
 743                     considered at the 4th level.
 744
 745    'Non-ignorable'  Variable elements are not reset to ignorable.
 746
 747    'Shifted'        Variable elements are ignorable at levels 1 through 3
 748                     their level 4 weight is replaced by the old level 1 weight.
 749                     Level 4 weight for Non-Variable elements is 0xFFFF.
 750
 751    'Shift-Trimmed'  Same as 'shifted', but all FFFF's at the 4th level
 752                     are trimmed.
 753
 754 =item backwards
 755
 756 -- see 3.1.2 French Accents, UTR #10.
 757
 758      backwards => $levelNumber or \@levelNumbers
 759
 760 Weights in reverse order; ex. level 2 (diacritic ordering) in French.
 761 If omitted, forwards at all the levels.
 762
 763 =item entry
 764
 765 -- see 3.1 Linguistic Features; 3.2.1 File Format, UTR #10.
 766
 767 Overrides a default order or defines additional collation elements
 768
 769   entry => <<'ENTRIES', # use the UCA file format
 770 00E6 ; [.0861.0020.0002.00E6] [.08B1.0020.0002.00E6] # ligature <ae> as <a><e>
 771 0063 0068 ; [.0893.0020.0002.0063]      # "ch" in traditional Spanish
 772 0043 0068 ; [.0893.0020.0008.0043]      # "Ch" in traditional Spanish
 773 ENTRIES
 774
 775 =item ignoreName
 776
 777 =item ignoreChar
 778
 779 -- see Completely Ignorable, 3.2.2 Variable Weighting, UTR #10.
 780
 781 Makes the entry in the table completely ignorable;
 782 i.e. as if the weights were zero at all level.
 783
 784 E.g. when 'a' and 'e' are ignorable,
 785 'element' is equal to 'lament' (or 'lmnt').
 786
 787 =item level
 788
 789 -- see 4.3 Form a sort key for each string, UTR #10.
 790
 791 Set the maximum level.
 792 Any higher levels than the specified one are ignored.
 793
 794   Level 1: alphabetic ordering
 795   Level 2: diacritic ordering
 796   Level 3: case ordering
 797   Level 4: tie-breaking (e.g. in the case when alternate is 'shifted')
 798
 799   ex.level => 2,
 800
 801 If omitted, the maximum is the 4th.
 802
 803 =item normalization
 804
 805 -- see 4.1 Normalize each input string, UTR #10.
 806
 807 If specified, strings are normalized before preparation of sort keys
 808 (the normalization is executed after preprocess).
 809
 810 As a form name, one of the following names must be used.
 811
 812   'C'  or 'NFC'  for Normalization Form C
 813   'D'  or 'NFD'  for Normalization Form D
 814   'KC' or 'NFKC' for Normalization Form KC
 815   'KD' or 'NFKD' for Normalization Form KD
 816
 817 If omitted, the string is put into Normalization Form D.
 818
 819 If C<undef> is passed explicitly as the value for this key,
 820 any normalization is not carried out (this may make tailoring easier
 821 if any normalization is not desired).
 822
 823 see B<CAVEAT>.
 824
 825 =item overrideCJK
 826
 827 -- see 7.1 Derived Collation Elements, UTR #10.
 828
 829 By default, mapping of CJK Unified Ideographs
 830 uses the Unicode codepoint order.
 831 But the mapping of CJK Unified Ideographs may be overrided.
 832
 833 ex. CJK Unified Ideographs in the JIS code point order.
 834
 835   overrideCJK => sub {
 836       my $u = shift;             # get a Unicode codepoint
 837       my $b = pack('n', $u);     # to UTF-16BE
 838       my $s = your_unicode_to_sjis_converter($b); # convert
 839       my $n = unpack('n', $s);   # convert sjis to short
 840       [ $n, 0x20, 0x2, $u ];     # return the collation element
 841   },
 842
 843 ex. ignores all CJK Unified Ideographs.
 844
 845   overrideCJK => sub {()}, # CODEREF returning empty list
 846
 847    # where ->eq("Pe\x{4E00}rl", "Perl") is true
 848    # as U+4E00 is a CJK Unified Ideograph and to be ignorable.
 849
 850 If C<undef> is passed explicitly as the value for this key,
 851 weights for CJK Unified Ideographs are treated as undefined.
 852 But assignment of weight for CJK Unified Ideographs
 853 in table or L<entry> is still valid.
 854
 855 =item overrideHangul
 856
 857 -- see 7.1 Derived Collation Elements, UTR #10.
 858
 859 By default, Hangul Syllables are decomposed into Hangul Jamo.
 860 But the mapping of Hangul Syllables may be overrided.
 861
 862 This tag works like L<overrideCJK>, so see there for examples.
 863
 864 If you want to override the mapping of Hangul Syllables,
 865 the Normalization Forms D and KD are not appropriate
 866 (they will be decomposed before overriding).
 867
 868 If C<undef> is passed explicitly as the value for this key,
 869 weight for Hangul Syllables is treated as undefined
 870 without decomposition into Hangul Jamo.
 871 But definition of weight for Hangul Syllables
 872 in table or L<entry> is still valid.
 873
 874 =item preprocess
 875
 876 -- see 5.1 Preprocessing, UTR #10.
 877
 878 If specified, the coderef is used to preprocess
 879 before the formation of sort keys.
 880
 881 ex. dropping English articles, such as "a" or "the".
 882 Then, "the pen" is before "a pencil".
 883
 884      preprocess => sub {
 885            my $str = shift;
 886            $str =~ s/\b(?:an?|the)\s+//gi;
 887            $str;
 888         },
 889
 890 =item rearrange
 891
 892 -- see 3.1.3 Rearrangement, UTR #10.
 893
 894 Characters that are not coded in logical order and to be rearranged.
 895 By default,
 896
 897     rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
 898
 899 If you want to disallow any rearrangement,
 900 pass C<undef> or C<[]> (a reference to an empty list)
 901 as the value for this key.
 902
 903 B<According to the version 9 of UCA, this parameter shall not be used;
 904 but it is not warned at present.>
 905
 906 =item table
 907
 908 -- see 3.2 Default Unicode Collation Element Table, UTR #10.
 909
 910 You can use another element table if desired.
 911 The table file must be in your C<lib/Unicode/Collate> directory.
 912
 913 By default, the file C<lib/Unicode/Collate/allkeys.txt> is used.
 914
 915 If C<undef> is passed explicitly as the value for this key,
 916 no file is read (but you can define collation elements via L<entry>).
 917
 918 A typical way to define a collation element table
 919 without any file of table:
 920
 921    $onlyABC = Unicode::Collate->new(
 922        table => undef,
 923        entry => << 'ENTRIES',
 924 0061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A
 925 0041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A
 926 0062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B
 927 0042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B
 928 0063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C
 929 0043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C
 930 ENTRIES
 931     );
 932
 933 =item undefName
 934
 935 =item undefChar
 936
 937 -- see 6.3.4 Reducing the Repertoire, UTR #10.
 938
 939 Undefines the collation element as if it were unassigned in the table.
 940 This reduces the size of the table.
 941 If an unassigned character appears in the string to be collated,
 942 the sort key is made from its codepoint
 943 as a single-character collation element,
 944 as it is greater than any other assigned collation elements
 945 (in the codepoint order among the unassigned characters).
 946 But, it'd be better to ignore characters
 947 unfamiliar to you and maybe never used.
 948
 949 =item katakana_before_hiragana
 950
 951 =item upper_before_lower
 952
 953 -- see 6.6 Case Comparisons; 7.3.1 Tertiary Weight Table, UTR #10.
 954
 955 By default, lowercase is before uppercase
 956 and hiragana is before katakana.
 957
 958 If the tag is made true, this is reversed.
 959
 960 B<NOTE>: These tags simplemindedly assume
 961 any lowercase/uppercase or hiragana/katakana distinctions
 962 should occur in level 3, and their weights at level 3
 963 should be same as those mentioned in 7.3.1, UTR #10.
 964 If you define your collation elements which violates this,
 965 these tags doesn't work validly.
 966
 967 =back
 968
 969 =head2 Methods for Collation
 970
 971 =over 4
 972
 973 =item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
 974
 975 Sorts a list of strings.
 976
 977 =item C<$result = $Collator-E<gt>cmp($a, $b)>
 978
 979 Returns 1 (when C<$a> is greater than C<$b>)
 980 or 0 (when C<$a> is equal to C<$b>)
 981 or -1 (when C<$a> is lesser than C<$b>).
 982
 983 =item C<$result = $Collator-E<gt>eq($a, $b)>
 984
 985 =item C<$result = $Collator-E<gt>ne($a, $b)>
 986
 987 =item C<$result = $Collator-E<gt>lt($a, $b)>
 988
 989 =item C<$result = $Collator-E<gt>le($a, $b)>
 990
 991 =item C<$result = $Collator-E<gt>gt($a, $b)>
 992
 993 =item C<$result = $Collator-E<gt>ge($a, $b)>
 994
 995 They works like the same name operators as theirs.
 996
 997    eq : whether $a is equal to $b.
 998    ne : whether $a is not equal to $b.
 999    lt : whether $a is lesser than $b.
1000    le : whether $a is lesser than $b or equal to $b.
1001    gt : whether $a is greater than $b.
1002    ge : whether $a is greater than $b or equal to $b.
1003
1004 =item C<$sortKey = $Collator-E<gt>getSortKey($string)>
1005
1006 -- see 4.3 Form a sort key for each string, UTR #10.
1007
1008 Returns a sort key.
1009
1010 You compare the sort keys using a binary comparison
1011 and get the result of the comparison of the strings using UCA.
1012
1013    $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
1014
1015       is equivalent to
1016
1017    $Collator->cmp($a, $b)
1018
1019 =item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)>
1020
1021    use Unicode::Collate;
1022    my $c = Unicode::Collate->new();
1023    print $c->viewSortKey("Perl"),"\n";
1024
1025    # output:
1026    # [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF]
1027    #  Level 1               Level 2               Level 3               Level 4
1028
1029     (If C<UCA_Version> is 8, the output is slightly different.)
1030
1031 =item C<$position = $Collator-E<gt>index($string, $substring)>
1032
1033 =item C<($position, $length) = $Collator-E<gt>index($string, $substring)>
1034
1035 -- see 6.8 Searching, UTR #10.
1036
1037 If C<$substring> matches a part of C<$string>, returns
1038 the position of the first occurrence of the matching part in scalar context;
1039 in list context, returns a two-element list of
1040 the position and the length of the matching part.
1041
1042 B<Notice> that the length of the matching part may differ from
1043 the length of C<$substring>.
1044
1045 B<Note> that the position and the length are counted on the string
1046 after the process of preprocess, normalization, and rearrangement.
1047 Therefore, in case the specified string is not binary equal to
1048 the preprocessed/normalized/rearranged string, the position and the length
1049 may differ form those on the specified string. But it is guaranteed
1050 that, if matched, it returns a non-negative value as C<$position>.
1051
1052 If C<$substring> does not match any part of C<$string>,
1053 returns C<-1> in scalar context and
1054 an empty list in list context.
1055
1056 e.g. you say
1057
1058   my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1059   my $str = "Ich mu\x{00DF} studieren.";
1060   my $sub = "m\x{00FC}ss";
1061   my $match;
1062   if (my($pos,$len) = $Collator->index($str, $sub)) {
1063       $match = substr($str, $pos, $len);
1064   }
1065
1066 and get C<"mu\x{00DF}"> in C<$match> since C<"mu>E<223>C<">
1067 is primary equal to C<"m>E<252>C<ss">.
1068
1069 =back
1070
1071 =head2 Other Methods
1072
1073 =over 4
1074
1075 =item C<%old_tailoring = $Collator-E<gt>change(%new_tailoring)>
1076
1077 Change the value of specified keys and returns the changed part.
1078
1079     $Collator = Unicode::Collate->new(level => 4);
1080
1081     $Collator->eq("perl", "PERL"); # false
1082
1083     %old = $Collator->change(level => 2); # returns (level => 4).
1084
1085     $Collator->eq("perl", "PERL"); # true
1086
1087     $Collator->change(%old); # returns (level => 2).
1088
1089     $Collator->eq("perl", "PERL"); # false
1090
1091 Not all C<(key,value)>s are allowed to be changed.
1092 See also C<@Unicode::Collate::ChangeOK> and C<@Unicode::Collate::ChangeNG>.
1093
1094 In the scalar context, returns the modified collator
1095 (but it is B<not> a clone from the original).
1096
1097     $Collator->change(level => 2)->eq("perl", "PERL"); # true
1098
1099     $Collator->eq("perl", "PERL"); # true; now max level is 2nd.
1100
1101     $Collator->change(level => 4)->eq("perl", "PERL"); # false
1102
1103 =item UCA_Version
1104
1105 Returns the version number of Unicode Technical Standard 10
1106 this module consults.
1107
1108 =item Base_Unicode_Version
1109
1110 Returns the version number of the Unicode Standard
1111 this module is based on.
1112
1113 =back
1114
1115 =head2 EXPORT
1116
1117 None by default.
1118
1119 =head2 TODO
1120
1121 Unicode::Collate has not been ported to EBCDIC.  The code mostly would
1122 work just fine but a decision needs to be made: how the module should
1123 work in EBCDIC?  Should the low 256 characters be understood as
1124 Unicode or as EBCDIC code points?  Should one be chosen or should
1125 there be a way to do either?  Or should such translation be left
1126 outside the module for the user to do, for example by using
1127 Encode::from_to()?
1128 (or utf8::unicode_to_native()/utf8::native_to_unicode()?)
1129
1130 =head2 CAVEAT
1131
1132 Use of the C<normalization> parameter requires
1133 the B<Unicode::Normalize> module.
1134
1135 If you need not it (say, in the case when you need not
1136 handle any combining characters),
1137 assign C<normalization =E<gt> undef> explicitly.
1138
1139 -- see 6.5 Avoiding Normalization, UTR #10.
1140
1141 =head2 Conformance Test
1142
1143 The Conformance Test for the UCA is provided
1144 in L<http://www.unicode.org/reports/tr10/CollationTest.html>
1145 and L<http://www.unicode.org/reports/tr10/CollationTest.zip>
1146
1147 For F<CollationTest_SHIFTED.txt>,
1148 a collator via C<Unicode::Collate-E<gt>new( )> should be used;
1149 for F<CollationTest_NON_IGNORABLE.txt>, a collator via
1150 C<Unicode::Collate-E<gt>new(alternate =E<gt> "non-ignorable", level =E<gt> 3)>.
1151
1152 B<Unicode::Normalize is required to try this test.>
1153
1154 =head2 BUGS
1155
1156 C<index()> is an experimental method and
1157 its return value may be unreliable.
1158 The correct implementation for C<index()> must be based
1159 on Locale-Sensitive Support: Level 3 in UTR #18,
1160 F<Unicode Regular Expression Guidelines>.
1161
1162 See also 4.2 Locale-Dependent Graphemes in UTR #18.
1163
1164 =head1 AUTHOR
1165
1166 SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
1167
1168   http://homepage1.nifty.com/nomenclator/perl/
1169
1170   Copyright(C) 2001-2002, SADAHIRO Tomoyuki. Japan. All rights reserved.
1171
1172   This library is free software; you can redistribute it
1173   and/or modify it under the same terms as Perl itself.
1174
1175 =head1 SEE ALSO
1176
1177 =over 4
1178
1179 =item http://www.unicode.org/reports/tr10/
1180
1181 Unicode Collation Algorithm - UTR #10
1182
1183 =item http://www.unicode.org/reports/tr10/allkeys.txt
1184
1185 The Default Unicode Collation Element Table
1186
1187 =item http://www.unicode.org/reports/tr10/CollationTest.html
1188 http://www.unicode.org/reports/tr10/CollationTest.zip
1189
1190 The latest versions of the conformance test for the UCA
1191
1192 =item http://www.unicode.org/reports/tr15/
1193
1194 Unicode Normalization Forms - UAX #15
1195
1196 =item http://www.unicode.org/reports/tr18
1197
1198 Unicode Regular Expression Guidelines - UTR #18
1199
1200 =item L<Unicode::Normalize>
1201
1202 =back
1203
1204 =cut