lib/Unicode/Collate.pm

   1 package Unicode::Collate;
   2
   3 use 5.006;
   4 use strict;
   5 use warnings;
   6 use Carp;
   7 require Exporter;
   8
   9 our $VERSION = '0.08';
  10 our $PACKAGE = __PACKAGE__;
  11
  12 our @ISA = qw(Exporter);
  13
  14 our %EXPORT_TAGS = ();
  15 our @EXPORT_OK = ();
  16 our @EXPORT = ();
  17
# Directory that holds the collation element tables: the path this module
# was loaded from (as recorded in %INC) with the trailing ".pm" removed.
(our $Path = $INC{'Unicode/Collate.pm'}) =~ s/\.pm$//;
# Name of the table file that new() reads when no "table" parameter is given.
our $KeyFile = "allkeys.txt";

# Lingua::KO::Hangul::Util is not part of the standard distribution
# but it will be used if available.

eval { require Lingua::KO::Hangul::Util };
my $hasHangulUtil = ! $@;   # true when the require above did not die
if ($hasHangulUtil) {
    Lingua::KO::Hangul::Util->import();
}

our %Combin; # combining class from Unicode::Normalize
             # (new() aliases this to %Unicode::Normalize::Combin)

use constant Min2      => 0x20;   # minimum weight at level 2
use constant Min3      => 0x02;   # minimum weight at level 3
use constant UNDEFINED => 0xFF80; # special value for undefined CE
                                  # (base of the first weight in _derivCE)
  35
  36 ##
  37 ## constructor
  38 ##
  39 sub new
  40 {
  41   my $class = shift;
  42   my $self = bless { @_ }, $class;
  43
  44   # alternate
  45   $self->{alternate} =
  46      ! exists  $self->{alternate} ? 'shifted' :
  47      ! defined $self->{alternate} ? '' : $self->{alternate};
  48
  49   # collation level
  50   $self->{level} ||= ($self->{alternate} =~ /shift/ ? 4 : 3);
  51
  52   # normalization form
  53   $self->{normalization} = 'D' if ! exists $self->{normalization};
  54
  55   eval "use Unicode::Normalize;" if defined $self->{normalization};
  56
  57   $self->{normalize} =
  58     ! defined $self->{normalization}        ? undef :
  59     $self->{normalization} =~ /^(?:NF)?C$/  ? \&NFC :
  60     $self->{normalization} =~ /^(?:NF)?D$/  ? \&NFD :
  61     $self->{normalization} =~ /^(?:NF)?KC$/ ? \&NFKC :
  62     $self->{normalization} =~ /^(?:NF)?KD$/ ? \&NFKD :
  63     croak "$PACKAGE unknown normalization form name: $self->{normalization}";
  64
  65   *Combin = \%Unicode::Normalize::Combin if $self->{normalize} && ! %Combin;
  66
  67   # backwards
  68   $self->{backwards} ||= [];
  69   $self->{backwards} = [ $self->{backwards} ] if ! ref $self->{backwards};
  70
  71   # rearrange
  72   $self->{rearrange} ||= []; # maybe not U+0000 (an ASCII)
  73   $self->{rearrange} = [ $self->{rearrange} ] if ! ref $self->{rearrange};
  74
  75   # open the table file
  76   my $file = defined $self->{table} ? $self->{table} : $KeyFile;
  77   open my $fk, "<$Path/$file" or croak "File does not exist at $Path/$file";
  78
  79   while(<$fk>){
  80     next if /^\s*#/;
  81     if(/^\s*\@/){
  82        if(/^\@version\s*(\S*)/){
  83          $self->{version} ||= $1;
  84        }
  85        elsif(/^\@alternate\s+(.*)/){
  86          $self->{alternate} ||= $1;
  87        }
  88        elsif(/^\@backwards\s+(.*)/){
  89          push @{ $self->{backwards} }, $1;
  90        }
  91        elsif(/^\@rearrange\s+(.*)/){
  92          push @{ $self->{rearrange} }, _getHexArray($1);
  93        }
  94        next;
  95     }
  96     $self->parseEntry($_);
  97   }
  98   close $fk;
  99   if($self->{entry}){
 100     $self->parseEntry($_) foreach split /\n/, $self->{entry};
 101   }
 102
 103   # keys of $self->{rearrangeHash} are $self->{rearrange}.
 104   $self->{rearrangeHash} = {};
 105   @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
 106
 107   return $self;
 108 }
 109
 110 ##
 111 ## get $line, parse it, and write an entry in $self
 112 ##
 113 sub parseEntry
 114 {
 115   my $self = shift;
 116   my $line = shift;
 117   my($name, $ele, @key);
 118
 119   return if $line !~ /^\s*[0-9A-Fa-f]/;
 120
 121   # get name
 122   $name = $1 if $line =~ s/#\s*(.*)//;
 123   return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
 124
 125   # get element
 126   my($e, $k) = split /;/, $line;
 127   my @e = _getHexArray($e);
 128   $ele = pack('U*', @e);
 129   return if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
 130
 131   # get sort key
 132   if(
 133      defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/ ||
 134      defined $self->{ignoreChar} && $ele  =~ /$self->{ignoreChar}/
 135   )
 136   {
 137      $self->{entries}{$ele} = $self->{ignored}{$ele} = 1;
 138   }
 139   else
 140   {
 141     foreach my $arr ($k =~ /\[(\S+)\]/g) {
 142       my $var = $arr =~ /\*/;
 143       push @key, $self->altCE( $var, _getHexArray($arr) );
 144     }
 145     $self->{entries}{$ele} = \@key;
 146   }
 147   $self->{maxlength}{ord $ele} = scalar @e if @e > 1;
 148 }
 149
 150
 151 ##
 152 ## arrayref CE = altCE(bool variable?, list[num] weights)
 153 ##
 154 sub altCE
 155 {
 156   my $self = shift;
 157   my $var  = shift;
 158   my @c    = @_;
 159
 160   $self->{alternate} eq 'blanked' ?
 161     $var ? [0,0,0] : [ @c[0..2] ] :
 162   $self->{alternate} eq 'non-ignorable' ?
 163     [ @c[0..2] ] :
 164   $self->{alternate} eq 'shifted' ?
 165     $var ? [0,0,0,$c[0] ] : [ @c[0..2], $c[0]+$c[1]+$c[2] ? 0xFFFF : 0 ] :
 166   $self->{alternate} eq 'shift-trimmed' ?
 167     $var ? [0,0,0,$c[0] ] : [ @c[0..2], 0 ] :
 168    \@c;
 169 }
 170
 171 ##
## string hex_sortkey = viewSortKey(string arg)
 173 ##
 174 sub viewSortKey
 175 {
 176   my $self = shift;
 177   my $key  = $self->getSortKey(@_);
 178   my $view = join " ", map sprintf("%04X", $_), unpack 'n*', $key;
 179   $view =~ s/ ?0000 ?/|/g;
 180   "[$view]";
 181 }
 182
 183
 184 ##
 185 ## list[strings] elements = splitCE(string arg)
 186 ##
 187 sub splitCE
 188 {
 189   my $self = shift;
 190   my $code = $self->{preprocess};
 191   my $norm = $self->{normalize};
 192   my $ent  = $self->{entries};
 193   my $max  = $self->{maxlength};
 194   my $rear = $self->{rearrangeHash};
 195
 196   my $str = ref $code ? &$code(shift) : shift;
 197   $str = &$norm($str) if ref $norm;
 198
 199   my(@src, @buf);
 200   @src = unpack('U*', $str);
 201
 202   # rearrangement
 203   for(my $i = 0; $i < @src; $i++)
 204   {
 205      ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i])
 206         if $rear->{ $src[$i] };
 207      $i++;
 208   }
 209
 210   for(my $i = 0; $i < @src; $i++)
 211   {
 212     my $ch;
 213     my $u  = $src[$i];
 214
 215   # non-characters
 216     next if $u < 0 || 0x10FFFF < $u     # out of range
 217          || 0xD800 < $u && $u < 0xDFFF; # unpaired surrogates
 218     my $four = $u & 0xFFFF;
 219     next if $four == 0xFFFE || $four == 0xFFFF;
 220
 221     if($max->{$u}) # contract
 222     {
 223       for(my $j = $max->{$u}; $j >= 1; $j--)
 224       {
 225         next unless $i+$j-1 < @src;
 226         $ch = pack 'U*', @src[$i .. $i+$j-1];
 227         $i += $j-1, last if $ent->{$ch};
 228       }
 229     }
 230     else {  $ch = pack('U', $u) }
 231
 232     if(%Combin && defined $ch) # with Combining Char
 233     {
 234       for(my $j = $i+1; $j < @src && $Combin{ $src[$j] }; $j++)
 235       {
 236         my $comb = pack 'U', $src[$j];
 237         next if ! $ent->{ $ch.$comb };
 238         $ch .= $comb;
 239         splice(@src, $j, 1);
 240         last;
 241       }
 242     }
 243     push @buf, $ch;
 244   }
 245   wantarray ? @buf : \@buf;
 246 }
 247
 248
 249 ##
 250 ## list[arrayrefs] weight = getWt(string element)
 251 ##
 252 sub getWt
 253 {
 254   my $self = shift;
 255   my $ch   = shift;
 256   my $ent  = $self->{entries};
 257   my $ign  = $self->{ignored};
 258   my $cjk  = $self->{overrideCJK};
 259   my $hang = $self->{overrideHangul};
 260   return if !defined $ch || $ign->{$ch};   # ignored
 261   return @{ $ent->{$ch} } if $ent->{$ch};
 262   my $u = unpack('U', $ch);
 263   return
 264     _isHangul($u)
 265       ? $hang
 266         ? &$hang($u)
 267         : ($hasHangulUtil ?
 268               map(@{ $ent->{pack('U', $_)} }, decomposeHangul($u)) :
 269               # runtime compile error...
 270               (eval 'use Lingua::KO::Hangul::Util', print $@))
 271       : _isCJK($u)
 272         ? $cjk ? &$cjk($u) : map($self->altCE(0,@$_), _CJK($u))
 273         : map($self->altCE(0,@$_), _derivCE($u));
 274 }
 275
 276 ##
 277 ## int = index(string, substring)
 278 ##
 279 sub index
 280 {
 281   my $self = shift;
 282   my $lev  = $self->{level};
 283   my $str  = $self->splitCE(shift);
 284   my $sub  = $self->splitCE(shift);
 285
 286   return wantarray ? (0,0) : 0 if ! @$sub;
 287   return wantarray ?  ()  : -1 if ! @$str;
 288
 289   my @subWt = grep _ignorableAtLevel($_,$lev),
 290               map $self->getWt($_), @$sub;
 291
 292   my(@strWt,@strPt);
 293   my $count = 0;
 294   for my $e (@$str){
 295     my @tmp = grep _ignorableAtLevel($_,$lev), $self->getWt($e);
 296     push @strWt, @tmp;
 297     push @strPt, ($count) x @tmp;
 298     $count += length $e;
 299     while(@strWt >= @subWt){
 300       if(_eqArray(\@strWt, \@subWt, $lev)){
 301         my $pos = $strPt[0];
 302         return wantarray ? ($pos, $count-$pos) : $pos;
 303       }
 304       shift @strWt;
 305       shift @strPt;
 306     }
 307   }
 308   return wantarray ? () : -1;
 309 }
 310
 311 ##
 312 ## bool _eqArray(arrayref, arrayref, level)
 313 ##
 314 sub _eqArray($$$)
 315 {
 316   my $a   = shift; # length $a >= length $b;
 317   my $b   = shift;
 318   my $lev = shift;
 319   for my $v (0..$lev-1){
 320     for my $c (0..@$b-1){
 321       return if $a->[$c][$v] != $b->[$c][$v];
 322     }
 323   }
 324   return 1;
 325 }
 326
 327
 328 ##
 329 ## bool _ignorableAtLevel(CE, level)
 330 ##
 331 sub _ignorableAtLevel($$)
 332 {
 333   my $ce = shift;
 334   return if ! defined $ce;
 335   my $lv = shift;
 336   ! grep { ! $ce->[$_] } 0..$lv-1;
 337 }
 338
 339
 340 ##
 341 ## string sortkey = getSortKey(string arg)
 342 ##
 343 sub getSortKey
 344 {
 345   my $self = shift;
 346   my $lev  = $self->{level};
 347   my $rCE  = $self->splitCE(shift); # get an arrayref
 348
 349   # weight arrays
 350   my @buf = grep defined(), map $self->getWt($_), @$rCE;
 351
 352   # make sort key
 353   my @ret = ([],[],[],[]);
 354   foreach my $v (0..$lev-1){
 355     foreach my $b (@buf){
 356       push @{ $ret[$v] }, $b->[$v] if $b->[$v];
 357     }
 358   }
 359   foreach (@{ $self->{backwards} }){
 360     my $v = $_ - 1;
 361     @{ $ret[$v] } = reverse @{ $ret[$v] };
 362   }
 363
 364   # modification of tertiary weights
 365   if($self->{upper_before_lower}){
 366     foreach (@{ $ret[2] }){
 367       if   (0x8 <= $_ && $_ <= 0xC){ $_ -= 6 } # lower
 368       elsif(0x2 <= $_ && $_ <= 0x6){ $_ += 6 } # upper
 369       elsif($_ == 0x1C)            { $_ += 1 } # square upper
 370       elsif($_ == 0x1D)            { $_ -= 1 } # square lower
 371     }
 372   }
 373   if($self->{katakana_before_hiragana}){
 374     foreach (@{ $ret[2] }){
 375       if   (0x0F <= $_ && $_ <= 0x13){ $_ -= 2 } # katakana
 376       elsif(0x0D <= $_ && $_ <= 0x0E){ $_ += 5 } # hiragana
 377     }
 378   }
 379   join "\0\0", map pack('n*', @$_), @ret;
 380 }
 381
 382
 383 ##
 384 ## int compare = cmp(string a, string b)
 385 ##
 386 sub cmp
 387 {
 388   my $obj = shift;
 389   my $a   = shift;
 390   my $b   = shift;
 391   $obj->getSortKey($a) cmp $obj->getSortKey($b);
 392 }
 393
 394 ##
 395 ## list[strings] sorted = sort(list[strings] arg)
 396 ##
 397 sub sort
 398 {
 399   my $obj = shift;
 400
 401   map { $_->[1] }
 402   sort{ $a->[0] cmp $b->[0] }
 403   map [ $obj->getSortKey($_), $_ ], @_;
 404 }
 405
 406 ##
 407 ## list[arrayrefs] CE = _derivCE(int codepoint)
 408 ##
 409 sub _derivCE
 410 {
 411   my $code = shift;
 412   my $a = UNDEFINED + ($code >> 15); # ok
 413   my $b = ($code & 0x7FFF) | 0x8000; # ok
 414 # my $a = 0xFFC2 + ($code >> 15);    # ng
 415 # my $b = $code & 0x7FFF | 0x1000;   # ng
 416   $b ? ([$a,2,1,$code],[$b,0,0,$code]) : [$a,2,1,$code];
 417 }
 418
 419 ##
 420 ## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
 421 ##
 422 sub _getHexArray
 423 {
 424   my $str = shift;
 425   map hex(), $str =~ /([0-9a-fA-F]+)/g;
 426 }
 427
 428 ##
 429 ## bool is_a_CJK_Unified_Ideograph = _isCJK(int codepoint)
 430 ##
 431 sub _isCJK
 432 {
 433   my $u = shift;
 434   return 0x3400 <= $u && $u <= 0x4DB5
 435       || 0x4E00 <= $u && $u <= 0x9FA5
 436 #      || 0x20000 <= $u && $u <= 0x2A6D6;
 437 }
 438
 439 ##
 440 ## list[arrayref] CE = _CJK(int codepoint_of_CJK)
 441 ##
 442 sub _CJK
 443 {
 444   my $u = shift;
 445   $u > 0xFFFF ? _derivCE($u) : [$u,0x20,0x02,$u];
 446 }
 447
 448 ##
 449 ## bool is_a_Hangul_Syllable = _isHangul(int codepoint)
 450 ##
 451 sub _isHangul
 452 {
 453   my $code = shift;
 454   return 0xAC00 <= $code && $code <= 0xD7A3;
 455 }
 456
 457 1;
 458 __END__
 459
 460 =head1 NAME
 461
 462 Unicode::Collate - use UCA (Unicode Collation Algorithm)
 463
 464 =head1 SYNOPSIS
 465
 466   use Unicode::Collate;
 467
 468   #construct
 469   $UCA = Unicode::Collate->new(%tailoring);
 470
 471   #sort
 472   @sorted = $UCA->sort(@not_sorted);
 473
 474   #compare
 475   $result = $UCA->cmp($a, $b); # returns 1, 0, or -1.
 476
 477 =head1 DESCRIPTION
 478
 479 =head2 Constructor and Tailoring
 480
 481 The C<new> method returns a collator object.
 482
 483    $UCA = Unicode::Collate->new(
 484       alternate => $alternate,
 485       backwards => $levelNumber, # or \@levelNumbers
 486       entry => $element,
 487       normalization  => $normalization_form,
 488       ignoreName => qr/$ignoreName/,
 489       ignoreChar => qr/$ignoreChar/,
 490       katakana_before_hiragana => $bool,
 491       level => $collationLevel,
 492       overrideCJK => \&overrideCJK,
 493       overrideHangul => \&overrideHangul,
 494       preprocess => \&preprocess,
 495       rearrange => \@charList,
 496       table => $filename,
 497       undefName => qr/$undefName/,
 498       undefChar => qr/$undefChar/,
 499       upper_before_lower => $bool,
 500    );
 501    # if %tailoring is false (empty),
 502    # $UCA should do the default collation.
 503
 504 =over 4
 505
 506 =item alternate
 507
 508 -- see 3.2.2 Alternate Weighting, UTR #10.
 509
 510    alternate => 'shifted', 'blanked', 'non-ignorable', or 'shift-trimmed'.
 511
 512 By default (if specification is omitted), 'shifted' is adopted.
 513
 514 =item backwards
 515
 516 -- see 3.1.2 French Accents, UTR #10.
 517
 518      backwards => $levelNumber or \@levelNumbers
 519
 520 Weights in reverse order; ex. level 2 (diacritic ordering) in French.
 521 If omitted, forwards at all the levels.
 522
 523 =item entry
 524
 525 -- see 3.1 Linguistic Features; 3.2.1 File Format, UTR #10.
 526
 527 Overrides a default order or adds a new element
 528
 529   entry => <<'ENTRIES', # use the UCA file format
 530 00E6 ; [.0861.0020.0002.00E6] [.08B1.0020.0002.00E6] # ligature <ae> as <a e>
 531 0063 0068 ; [.0893.0020.0002.0063]      # "ch" in traditional Spanish
 532 0043 0068 ; [.0893.0020.0008.0043]      # "Ch" in traditional Spanish
 533 ENTRIES
 534
 535 =item ignoreName
 536
 537 =item ignoreChar
 538
 539 -- see Completely Ignorable, 3.2.2 Alternate Weighting, UTR #10.
 540
 541 Ignores the entry in the table.
 542 If an ignored collation element appears in the string to be collated,
 543 it is ignored as if the element had been deleted from there.
 544
 545 E.g. when 'a' and 'e' are ignored,
 546 'element' is equal to 'lament' (or 'lmnt').
 547
 548 =item level
 549
 550 -- see 4.3 Form a sort key for each string, UTR #10.
 551
 552 Set the maximum level.
 553 Any higher levels than the specified one are ignored.
 554
 555   Level 1: alphabetic ordering
 556   Level 2: diacritic ordering
 557   Level 3: case ordering
 558   Level 4: tie-breaking (e.g. in the case when alternate is 'shifted')
 559
 560   ex.level => 2,
 561
 562 =item normalization
 563
 564 -- see 4.1 Normalize each input string, UTR #10.
 565
If specified, strings are normalized before their sort keys are prepared
(the normalization is executed after preprocess).
 568
 569 As a form name, one of the following names must be used.
 570
 571   'C'  or 'NFC'  for Normalization Form C
 572   'D'  or 'NFD'  for Normalization Form D
 573   'KC' or 'NFKC' for Normalization Form KC
 574   'KD' or 'NFKD' for Normalization Form KD
 575
 576 If omitted, the string is put into Normalization Form D.
 577
If explicitly undefined (as C<normalization =E<gt> undef>),
no normalization is carried out (this may make tailoring easier
if normalization is not desired).
 581
 582 see B<CAVEAT>.
 583
 584 =item overrideCJK
 585
 586 =item overrideHangul
 587
 588 -- see 7.1 Derived Collation Elements, UTR #10.
 589
 590 By default, mapping of CJK Unified Ideographs
 591 uses the Unicode codepoint order
 592 and Hangul Syllables are decomposed into Hangul Jamo.
 593
 594 The mapping of CJK Unified Ideographs
or Hangul Syllables may be overridden.
 596
 597 ex. CJK Unified Ideographs in the JIS codepoint order.
 598
 599   overrideCJK => sub {
 600     my $u = shift;               # get unicode codepoint
 601     my $b = pack('n', $u);       # to UTF-16BE
 602     my $s = your_unicode_to_sjis_converter($b); # convert
 603     my $n = unpack('n', $s);     # convert sjis to short
 604     [ $n, 1, 1 ];                # return collation element
 605   },
 606
 607 If you want to override the mapping of Hangul Syllables,
 608 the Normalization Forms D and KD are not appropriate
 609 (they will be decomposed before overriding).
 610
 611 =item preprocess
 612
 613 -- see 5.1 Preprocessing, UTR #10.
 614
 615 If specified, the coderef is used to preprocess
 616 before the formation of sort keys.
 617
 618 ex. dropping English articles, such as "a" or "the".
 619 Then, "the pen" is before "a pencil".
 620
 621      preprocess => sub {
 622            my $str = shift;
 623            $str =~ s/\b(?:an?|the)\s+//g;
 624            $str;
 625         },
 626
 627 =item rearrange
 628
 629 -- see 3.1.3 Rearrangement, UTR #10.
 630
 631 Characters that are not coded in logical order and to be rearranged.
 632 By default,
 633
 634     rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
 635
 636 =item table
 637
 638 -- see 3.2 Default Unicode Collation Element Table, UTR #10.
 639
 640 You can use another element table if desired.
 641 The table file must be in your C<lib/Unicode/Collate> directory.
 642
 643 By default, the file C<lib/Unicode/Collate/allkeys.txt> is used.
 644
 645 =item undefName
 646
 647 =item undefChar
 648
 649 -- see 6.3.4 Reducing the Repertoire, UTR #10.
 650
 651 Undefines the collation element as if it were unassigned in the table.
 652 This reduces the size of the table.
 653 If an unassigned character appears in the string to be collated,
 654 the sort key is made from its codepoint
 655 as a single-character collation element,
 656 as it is greater than any other assigned collation elements
 657 (in the codepoint order among the unassigned characters).
 658 But, it'd be better to ignore characters
 659 unfamiliar to you and maybe never used.
 660
 661 =item katakana_before_hiragana
 662
 663 =item upper_before_lower
 664
 665 -- see 6.6 Case Comparisons; 7.3.1 Tertiary Weight Table, UTR #10.
 666
 667 By default, lowercase is before uppercase
 668 and hiragana is before katakana.
 669
 670 If the parameter is true, this is reversed.
 671
 672 =back
 673
 674 =head2 Other methods
 675
 676 =over 4
 677
 678 =item C<@sorted = $UCA-E<gt>sort(@not_sorted)>
 679
 680 Sorts a list of strings.
 681
 682 =item C<$result = $UCA-E<gt>cmp($a, $b)>
 683
 684 Returns 1 (when C<$a> is greater than C<$b>)
 685 or 0 (when C<$a> is equal to C<$b>)
or -1 (when C<$a> is less than C<$b>).
 687
 688 =item C<$sortKey = $UCA-E<gt>getSortKey($string)>
 689
 690 -- see 4.3 Form a sort key for each string, UTR #10.
 691
 692 Returns a sort key.
 693
 694 You compare the sort keys using a binary comparison
 695 and get the result of the comparison of the strings using UCA.
 696
 697    $UCA->getSortKey($a) cmp $UCA->getSortKey($b)
 698
 699       is equivalent to
 700
 701    $UCA->cmp($a, $b)
 702
 703 =item C<$position = $UCA-E<gt>index($string, $substring)>
 704
 705 =item C<($position, $length) = $UCA-E<gt>index($string, $substring)>
 706
 707 -- see 6.8 Searching, UTR #10.
 708
 709 If C<$substring> matches a part of C<$string>, returns
 710 the position of the first occurrence of the matching part in scalar context;
 711 in list context, returns a two-element list of
 712 the position and the length of the matching part.
 713
 714 B<Notice> that the length of the matching part may differ from
 715 the length of C<$substring>.
 716
 717 B<Note> that the position and the length are counted on the string
 718 after the process of preprocess, normalization, and rearrangement.
 719 Therefore, in case the specified string is not binary equal to
 720 the preprocessed/normalized/rearranged string, the position and the length
may differ from those on the specified string. But it is guaranteed
 722 that, if matched, it returns a non-negative value as C<$position>.
 723
 724 If C<$substring> does not match any part of C<$string>,
 725 returns C<-1> in scalar context and
 726 an empty list in list context.
 727
 728 e.g. you say
 729
 730   my $UCA = Unicode::Collate->new( normalization => undef, level => 1 );
 731   my $str = "Ich mu\x{00DF} studieren.";
 732   my $sub = "m\x{00FC}ss";
 733   my $match;
 734   if(my @tmp = $UCA->index($str, $sub)){
 735     $match = substr($str, $tmp[0], $tmp[1]);
 736   }
 737
 738 and get C<"mu\x{00DF}"> in C<$match> since C<"mu>E<223>C<">
 739 is primary equal to C<"m>E<252>C<ss">.
 740
 741 =back
 742
 743 =head2 EXPORT
 744
 745 None by default.
 746
 747 =head2 CAVEAT
 748
 749 Use of the C<normalization> parameter requires
 750 the B<Unicode::Normalize> module.
 751
If you do not need it (e.g. when you do not need to
handle any combining characters),
assign C<normalization =E<gt> undef> explicitly.
 755
 756 =head1 AUTHOR
 757
 758 SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
 759
 760   http://homepage1.nifty.com/nomenclator/perl/
 761
 762   Copyright(C) 2001, SADAHIRO Tomoyuki. Japan. All rights reserved.
 763
 764   This program is free software; you can redistribute it and/or
 765   modify it under the same terms as Perl itself.
 766
 767 =head1 SEE ALSO
 768
 769 =over 4
 770
 771 =item L<Lingua::KO::Hangul::Util>
 772
 773 utility functions for Hangul Syllables
 774
 775 =item L<Unicode::Normalize>
 776
 777 normalized forms of Unicode text
 778
 779 =item Unicode Collation Algorithm - Unicode TR #10
 780
 781 http://www.unicode.org/unicode/reports/tr10/
 782
 783 =back
 784
 785 =cut