lib/Unicode/Collate.pm

   1 package Unicode::Collate;
   2
   3 use 5.006;
   4 use strict;
   5 use warnings;
   6 use Carp;
   7 use Lingua::KO::Hangul::Util;
   8 require Exporter;
   9
  10 our $VERSION = '0.07';
  11 our $PACKAGE = __PACKAGE__;
  12
  13 our @ISA = qw(Exporter);
  14
  15 our %EXPORT_TAGS = ();
  16 our @EXPORT_OK = ();
  17 our @EXPORT = ();
  18
  19 (our $Path = $INC{'Unicode/Collate.pm'}) =~ s/\.pm$//;
  20 our $KeyFile = "allkeys.txt";
  21
  22 our %Combin; # combining class from Unicode::Normalize
  23
  24 use constant Min2      => 0x20;   # minimum weight at level 2
  25 use constant Min3      => 0x02;   # minimum weight at level 3
  26 use constant UNDEFINED => 0xFF80; # special value for undefined CE
  27
  28 ##
  29 ## constructor
  30 ##
  31 sub new
  32 {
  33   my $class = shift;
  34   my $self = bless { @_ }, $class;
  35
  36   # alternate
  37   $self->{alternate} =
  38      ! exists  $self->{alternate} ? 'shifted' :
  39      ! defined $self->{alternate} ? '' : $self->{alternate};
  40
  41   # collation level
  42   $self->{level} ||= $self->{alternate} =~ /shift/ ? 4 : 3;
  43
  44   # normalization form
  45   $self->{normalization} = 'D' if ! exists $self->{normalization};
  46
  47   eval "use Unicode::Normalize;" if defined $self->{normalization};
  48
  49   $self->{normalize} =
  50     ! defined $self->{normalization}        ? undef :
  51     $self->{normalization} =~ /^(?:NF)?C$/  ? \&NFC :
  52     $self->{normalization} =~ /^(?:NF)?D$/  ? \&NFD :
  53     $self->{normalization} =~ /^(?:NF)?KC$/ ? \&NFKC :
  54     $self->{normalization} =~ /^(?:NF)?KD$/ ? \&NFKD :
  55     croak "$PACKAGE unknown normalization form name: $self->{normalization}";
  56
  57   *Combin = \%Unicode::Normalize::Combin if $self->{normalize} && ! %Combin;
  58
  59   # backwards
  60   $self->{backwards} ||= [];
  61   $self->{backwards} = [ $self->{backwards} ] if ! ref $self->{backwards};
  62
  63   # rearrange
  64   $self->{rearrange} ||= []; # maybe not U+0000 (an ASCII)
  65   $self->{rearrange} = [ $self->{rearrange} ] if ! ref $self->{rearrange};
  66
  67   # open the table file
  68   my $file = defined $self->{table} ? $self->{table} : $KeyFile;
  69   open my $fk, "<$Path/$file" or croak "File does not exist at $Path/$file";
  70
  71   while(<$fk>){
  72     next if /^\s*#/;
  73     if(/^\s*\@/){
  74        if(/^\@version\s*(\S*)/){
  75          $self->{version} ||= $1;
  76        }
  77        elsif(/^\@alternate\s+(.*)/){
  78          $self->{alternate} ||= $1;
  79        }
  80        elsif(/^\@backwards\s+(.*)/){
  81          push @{ $self->{backwards} }, $1;
  82        }
  83        elsif(/^\@rearrange\s+(.*)/){
  84          push @{ $self->{rearrange} }, _getHexArray($1);
  85        }
  86        next;
  87     }
  88     $self->parseEntry($_);
  89   }
  90   close $fk;
  91   if($self->{entry}){
  92     $self->parseEntry($_) foreach split /\n/, $self->{entry};
  93   }
  94
  95   # keys of $self->{rearrangeHash} are $self->{rearrange}.
  96   $self->{rearrangeHash} = {};
  97   @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
  98
  99   return $self;
 100 }
 101
 102 ##
 103 ## get $line, parse it, and write an entry in $self
 104 ##
 105 sub parseEntry
 106 {
 107   my $self = shift;
 108   my $line = shift;
 109   my($name, $ele, @key);
 110
 111   return if $line !~ /^\s*[0-9A-Fa-f]/;
 112
 113   # get name
 114   $name = $1 if $line =~ s/#\s*(.*)//;
 115   return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
 116
 117   # get element
 118   my($e, $k) = split /;/, $line;
 119   my @e = _getHexArray($e);
 120   $ele = pack('U*', @e);
 121   return if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
 122
 123   # get sort key
 124   if(
 125      defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/ ||
 126      defined $self->{ignoreChar} && $ele  =~ /$self->{ignoreChar}/
 127   )
 128   {
 129      $self->{ignored}{$ele} = 1;
 130      $self->{entries}{$ele} = 1; # true
 131   }
 132   else
 133   {
 134     foreach my $arr ($k =~ /\[(\S+)\]/g) {
 135       my $var = $arr =~ /\*/;
 136       push @key, $self->getCE( $var, _getHexArray($arr) );
 137     }
 138     $self->{entries}{$ele} = \@key;
 139   }
 140   $self->{maxlength}{ord $ele} = scalar @e if @e > 1;
 141 }
 142
 143
 144 ##
 145 ## list to collation element
 146 ##
 147 sub getCE
 148 {
 149   my $self = shift;
 150   my $var  = shift;
 151   my @c    = @_;
 152
 153   $self->{alternate} eq 'blanked' ?
 154      $var ? [0,0,0] : [ @c[0..2] ] :
 155   $self->{alternate} eq 'non-ignorable' ? [ @c[0..2] ] :
 156   $self->{alternate} eq 'shifted' ?
 157     $var ? [0,0,0,$c[0] ] : [ @c[0..2], $c[0]+$c[1]+$c[2] ? 0xFFFF : 0 ] :
 158   $self->{alternate} eq 'shift-trimmed' ?
 159     $var ? [0,0,0,$c[0] ] : [ @c[0..2], 0 ] :
 160    \@c;
 161 }
 162
 163 ##
 164 ## to debug
 165 ##
 166 sub viewSortKey
 167 {
 168   my $self = shift;
 169   my $key  = $self->getSortKey(@_);
 170   my $view = join " ", map sprintf("%04X", $_), unpack 'n*', $key;
 171   $view =~ s/ ?0000 ?/|/g;
 172   "[$view]";
 173 }
 174
 175 ##
 176 ## sort key
 177 ##
 178 sub getSortKey
 179 {
 180   my $self = shift;
 181   my $code = $self->{preprocess};
 182   my $norm = $self->{normalize};
 183   my $ent  = $self->{entries};
 184   my $ign  = $self->{ignored};
 185   my $max  = $self->{maxlength};
 186   my $lev  = $self->{level};
 187   my $cjk  = $self->{overrideCJK};
 188   my $hang = $self->{overrideHangul};
 189   my $rear = $self->{rearrangeHash};
 190
 191   my $str = ref $code ? &$code(shift) : shift;
 192   $str = &$norm($str) if ref $norm;
 193
 194   my(@src, @buf);
 195   @src = unpack('U*', $str);
 196
 197   # rearrangement
 198   for(my $i = 0; $i < @src; $i++)
 199   {
 200      ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i])
 201         if $rear->{ $src[$i] };
 202      $i++;
 203   }
 204
 205   for(my $i = 0; $i < @src; $i++)
 206   {
 207     my $ch;
 208     my $u  = $src[$i];
 209
 210   # non-characters
 211     next if $u < 0 || 0x10FFFF < $u     # out of range
 212          || 0xD800 < $u && $u < 0xDFFF; # unpaired surrogates
 213     my $four = $u & 0xFFFF;
 214     next if $four == 0xFFFE || $four == 0xFFFF;
 215
 216     if($max->{$u}) # contract
 217     {
 218       for(my $j = $max->{$u}; $j >= 1; $j--)
 219       {
 220         next unless $i+$j-1 < @src;
 221         $ch = pack 'U*', @src[$i .. $i+$j-1];
 222         $i += $j-1, last if $ent->{$ch};
 223       }
 224     }
 225     else {  $ch = pack('U', $u) }
 226
 227     if(%Combin && defined $ch) # with Combining Char
 228     {
 229       for(my $j = $i+1; $j < @src && $Combin{ $src[$j] }; $j++)
 230       {
 231         my $comb = pack 'U', $src[$j];
 232         next if ! $ent->{ $ch.$comb };
 233         $ch .= $comb;
 234         splice(@src, $j, 1);
 235         last;
 236       }
 237     }
 238
 239     next if !defined $ch || $ign->{$ch};   # ignored
 240
 241     push @buf,
 242       $ent->{$ch}
 243         ? @{ $ent->{$ch} }
 244         : _isHangul($u)
 245           ? $hang
 246             ? &$hang($u)
 247             : map(@{ $ent->{pack('U', $_)} }, decomposeHangul($u))
 248           : _isCJK($u)
 249             ? $cjk ? &$cjk($u) : map($self->getCE(0,@$_), _CJK($u))
 250             : map($self->getCE(0,@$_), _derivCE($u));
 251   }
 252
 253   # make sort key
 254   my @ret = ([],[],[],[]);
 255   foreach my $v (0..$lev-1){
 256     foreach my $b (@buf){
 257       push @{ $ret[$v] }, $b->[$v] if $b->[$v];
 258     }
 259   }
 260   foreach (@{ $self->{backwards} }){
 261     my $v = $_ - 1;
 262     @{ $ret[$v] } = reverse @{ $ret[$v] };
 263   }
 264
 265   # modification of tertiary weights
 266   if($self->{upper_before_lower}){
 267     foreach (@{ $ret[2] }){
 268       if   (0x8 <= $_ && $_ <= 0xC){ $_ -= 6 } # lower
 269       elsif(0x2 <= $_ && $_ <= 0x6){ $_ += 6 } # upper
 270       elsif($_ == 0x1C)            { $_ += 1 } # square upper
 271       elsif($_ == 0x1D)            { $_ -= 1 } # square lower
 272     }
 273   }
 274   if($self->{katakana_before_hiragana}){
 275     foreach (@{ $ret[2] }){
 276       if   (0x0F <= $_ && $_ <= 0x13){ $_ -= 2 } # katakana
 277       elsif(0x0D <= $_ && $_ <= 0x0E){ $_ += 5 } # hiragana
 278     }
 279   }
 280   join "\0\0", map pack('n*', @$_), @ret;
 281 }
 282
 283
 284 ##
 285 ## cmp
 286 ##
 287 sub cmp
 288 {
 289   my $obj = shift;
 290   my $a   = shift;
 291   my $b   = shift;
 292   $obj->getSortKey($a) cmp $obj->getSortKey($b);
 293 }
 294
 295 ##
 296 ## sort
 297 ##
 298 sub sort
 299 {
 300   my $obj = shift;
 301
 302   map { $_->[1] }
 303   sort{ $a->[0] cmp $b->[0] }
 304   map [ $obj->getSortKey($_), $_ ], @_;
 305 }
 306
 307 ##
 308 ## Derived CE
 309 ##
 310 sub _derivCE
 311 {
 312   my $code = shift;
 313   my $a = UNDEFINED + ($code >> 15); # ok
 314   my $b = ($code & 0x7FFF) | 0x8000; # ok
 315 # my $a = 0xFFC2 + ($code >> 15);    # ng
 316 # my $b = $code & 0x7FFF | 0x1000;   # ng
 317   $b ? ([$a,2,1,$code],[$b,0,0,$code]) : [$a,2,1,$code];
 318 }
 319
 320 ##
 321 ## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
 322 ##
 323 sub _getHexArray
 324 {
 325   my $str = shift;
 326   map hex(), $str =~ /([0-9a-fA-F]+)/g;
 327 }
 328
 329 ##
 330 ##  CJK Unified Ideographs
 331 ##
 332 sub _isCJK
 333 {
 334   my $u = shift;
 335   return 0x3400 <= $u && $u <= 0x4DB5
 336       || 0x4E00 <= $u && $u <= 0x9FA5
 337 #      || 0x20000 <= $u && $u <= 0x2A6D6;
 338 }
 339
 340 ##
 341 ##  CJK Unified Ideographs
 342 ##
 343 sub _CJK
 344 {
 345   my $u = shift;
 346   $u > 0xFFFF ? _derivCE($u) : [$u,0x20,0x02,$u];
 347 }
 348
 349 ##
 350 ## Hangul Syllables
 351 ##
 352 sub _isHangul
 353 {
 354   my $code = shift;
 355   return 0xAC00 <= $code && $code <= 0xD7A3;
 356 }
 357
 358 1;
 359 __END__
 360
 361 =head1 NAME
 362
 363 Unicode::Collate - use UCA (Unicode Collation Algorithm)
 364
 365 =head1 SYNOPSIS
 366
 367   use Unicode::Collate;
 368
 369   #construct
 370   $UCA = Unicode::Collate->new(%tailoring);
 371
 372   #sort
 373   @sorted = $UCA->sort(@not_sorted);
 374
 375   #compare
 376   $result = $UCA->cmp($a, $b); # returns 1, 0, or -1.
 377
 378 =head1 DESCRIPTION
 379
 380 =head2 Constructor and Tailoring
 381
 382    $UCA = Unicode::Collate->new(
 383       alternate => $alternate,
 384       backwards => $levelNumber, # or \@levelNumbers
 385       entry => $element,
 386       normalization  => $normalization_form,
 387       ignoreName => qr/$ignoreName/,
 388       ignoreChar => qr/$ignoreChar/,
 389       katakana_before_hiragana => $bool,
 390       level => $collationLevel,
 391       overrideCJK => \&overrideCJK,
 392       overrideHangul => \&overrideHangul,
 393       preprocess => \&preprocess,
 394       rearrange => \@charList,
 395       table => $filename,
 396       undefName => qr/$undefName/,
 397       undefChar => qr/$undefChar/,
 398       upper_before_lower => $bool,
 399    );
 400    # if %tailoring is false (empty),
 401    # $UCA should do the default collation.
 402
 403 =over 4
 404
 405 =item alternate
 406
 407 -- see 3.2.2 Alternate Weighting, UTR #10.
 408
 409    alternate => 'shifted', 'blanked', 'non-ignorable', or 'shift-trimmed'.
 410
 411 By default (if specification is omitted), 'shifted' is adopted.
 412
 413 =item backwards
 414
 415 -- see 3.1.2 French Accents, UTR #10.
 416
 417      backwards => $levelNumber or \@levelNumbers
 418
 419 Weights in reverse order; ex. level 2 (diacritic ordering) in French.
 420 If omitted, forwards at all the levels.
 421
 422 =item entry
 423
 424 -- see 3.1 Linguistic Features; 3.2.1 File Format, UTR #10.
 425
 426 Overrides a default order or adds a new element
 427
 428   entry => <<'ENTRIES', # use the UCA file format
 429 00E6 ; [.0861.0020.0002.00E6] [.08B1.0020.0002.00E6] # ligature <ae> as <a e>
 430 0063 0068 ; [.0893.0020.0002.0063]      # "ch" in traditional Spanish
 431 0043 0068 ; [.0893.0020.0008.0043]      # "Ch" in traditional Spanish
 432 ENTRIES
 433
 434 =item ignoreName
 435
 436 =item ignoreChar
 437
 438 -- see Completely Ignorable, 3.2.2 Alternate Weighting, UTR #10.
 439
 440 Ignores the entry in the table.
 441 If an ignored collation element appears in the string to be collated,
 442 it is ignored as if the element had been deleted from there.
 443
 444 E.g. when 'a' and 'e' are ignored,
 445 'element' is equal to 'lament' (or 'lmnt').
 446
 447 =item level
 448
 449 -- see 4.3 Form a sort key for each string, UTR #10.
 450
 451 Set the maximum level.
 452 Any higher levels than the specified one are ignored.
 453
 454   Level 1: alphabetic ordering
 455   Level 2: diacritic ordering
 456   Level 3: case ordering
 457   Level 4: tie-breaking (e.g. in the case when alternate is 'shifted')
 458
 459   ex.level => 2,
 460
 461 =item normalization
 462
 463 -- see 4.1 Normalize each input string, UTR #10.
 464
 465 If specified, strings are normalized before preparation sort keys
 466 (the normalization is executed after preprocess).
 467
 468 As a form name, one of the following names must be used.
 469
 470   'C'  or 'NFC'  for Normalization Form C
 471   'D'  or 'NFD'  for Normalization Form D
 472   'KC' or 'NFKC' for Normalization Form KC
 473   'KD' or 'NFKD' for Normalization Form KD
 474
 475 If omitted, the string is put into Normalization Form D.
 476
 477 If undefined explicitly (as C<normalization =E<gt> undef>),
 478 any normalization is not carried out (this may make tailoring easier
 479 if any normalization is not desired).
 480
 481 see B<CAVEAT>.
 482
 483 =item overrideCJK
 484
 485 =item overrideHangul
 486
 487 -- see 7.1 Derived Collation Elements, UTR #10.
 488
 489 By default, mapping of CJK Unified Ideographs
 490 uses the Unicode codepoint order
 491 and Hangul Syllables are decomposed into Hangul Jamo.
 492
 493 The mapping of CJK Unified Ideographs
 494 or Hangul Syllables may be overrided.
 495
 496 ex. CJK Unified Ideographs in the JIS codepoint order.
 497
 498   overrideCJK => sub {
 499     my $u = shift;               # get unicode codepoint
 500     my $b = pack('n', $u);       # to UTF-16BE
 501     my $s = your_unicode_to_sjis_converter($b); # convert
 502     my $n = unpack('n', $s);     # convert sjis to short
 503     [ $n, 1, 1 ];                # return collation element
 504   },
 505
 506 If you want to override the mapping of Hangul Syllables,
 507 the Normalization Forms D and KD are not appropriate
 508 (they will be decomposed before overriding).
 509
 510 =item preprocess
 511
 512 -- see 5.1 Preprocessing, UTR #10.
 513
 514 If specified, the coderef is used to preprocess
 515 before the formation of sort keys.
 516
 517 ex. dropping English articles, such as "a" or "the".
 518 Then, "the pen" is before "a pencil".
 519
 520      preprocess => sub {
 521            my $str = shift;
 522            $str =~ s/\b(?:an?|the)\s+//g;
 523            $str;
 524         },
 525
 526 =item rearrange
 527
 528 -- see 3.1.3 Rearrangement, UTR #10.
 529
 530 Characters that are not coded in logical order and to be rearranged.
 531 By default,
 532
 533     rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
 534
 535 =item table
 536
 537 -- see 3.2 Default Unicode Collation Element Table, UTR #10.
 538
 539 You can use another element table if desired.
 540 The table file must be in your C<lib/Unicode/Collate> directory.
 541
 542 By default, the file C<lib/Unicode/Collate/allkeys.txt> is used.
 543
 544 =item undefName
 545
 546 =item undefChar
 547
 548 -- see 6.3.4 Reducing the Repertoire, UTR #10.
 549
 550 Undefines the collation element as if it were unassigned in the table.
 551 This reduces the size of the table.
 552 If an unassigned character appears in the string to be collated,
 553 the sort key is made from its codepoint
 554 as a single-character collation element,
 555 as it is greater than any other assigned collation elements
 556 (in the codepoint order among the unassigned characters).
 557 But, it'd be better to ignore characters
 558 unfamiliar to you and maybe never used.
 559
 560 =item katakana_before_hiragana
 561
 562 =item upper_before_lower
 563
 564 -- see 6.6 Case Comparisons; 7.3.1 Tertiary Weight Table, UTR #10.
 565
 566 By default, lowercase is before uppercase
 567 and hiragana is before katakana.
 568
 569 If the parameter is true, this is reversed.
 570
 571 =back
 572
 573 =head2 Other methods
 574
 575 =over 4
 576
 577 =item C<@sorted = $UCA-E<gt>sort(@not_sorted)>
 578
 579 Sorts a list of strings.
 580
 581 =item C<$result = $UCA-E<gt>cmp($a, $b)>
 582
 583 Returns 1 (when C<$a> is greater than C<$b>)
 584 or 0 (when C<$a> is equal to C<$b>)
 585 or -1 (when C<$a> is lesser than C<$b>).
 586
 587 =item C<$sortKey = $UCA-E<gt>getSortKey($string)>
 588
 589 -- see 4.3 Form a sort key for each string, UTR #10.
 590
 591 Returns a sort key.
 592
 593 You compare the sort keys using a binary comparison
 594 and get the result of the comparison of the strings using UCA.
 595
 596    $UCA->getSortKey($a) cmp $UCA->getSortKey($b)
 597
 598       is equivalent to
 599
 600    $UCA->cmp($a, $b)
 601
 602 =back
 603
 604 =head2 EXPORT
 605
 606 None by default.
 607
 608 =head2 CAVEAT
 609
 610 Use of the C<normalization> parameter requires
 611 the B<Unicode::Normalize> module.
 612
 613 If you need not it (e.g. in the case when you need not
 614 handle any combining characters),
 615 assign C<normalization =E<gt> undef> explicitly.
 616
 617 =head1 AUTHOR
 618
 619 SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
 620
 621   http://homepage1.nifty.com/nomenclator/perl/
 622
 623   Copyright(C) 2001, SADAHIRO Tomoyuki. Japan. All rights reserved.
 624
 625   This program is free software; you can redistribute it and/or
 626   modify it under the same terms as Perl itself.
 627
 628 =head1 SEE ALSO
 629
 630 =over 4
 631
 632 =item L<Lingua::KO::Hangul::Util>
 633
 634 utility functions for Hangul Syllables
 635
 636 =item L<Unicode::Normalize>
 637
 638 normalized forms of Unicode text
 639
 640 =item Unicode Collation Algorithm - Unicode TR #10
 641
 642 http://www.unicode.org/unicode/reports/tr10/
 643
 644 =back
 645
 646 =cut