lib/Unicode/Collate.pm

   1 package Unicode::Collate;
   2
   3 use 5.006;
   4 use strict;
   5 use warnings;
   6 use Carp;
   7 use Lingua::KO::Hangul::Util;
   8 require Exporter;
   9
our $VERSION = '0.08';
our $PACKAGE = __PACKAGE__; # prefix used in the messages croaked by new()

our @ISA = qw(Exporter);

# nothing is exported, neither by default nor on request
our %EXPORT_TAGS = ();
our @EXPORT_OK = ();
our @EXPORT = ();

# directory of the table files: the path this module was loaded from,
# with the trailing ".pm" removed
(our $Path = $INC{'Unicode/Collate.pm'}) =~ s/\.pm$//;
our $KeyFile = "allkeys.txt"; # table read by new() unless 'table' is given

our %Combin; # combining class from Unicode::Normalize

use constant Min2      => 0x20;   # minimum weight at level 2
use constant Min3      => 0x02;   # minimum weight at level 3
use constant UNDEFINED => 0xFF80; # special value for undefined CE
  27
  28 ##
  29 ## constructor
  30 ##
  31 sub new
  32 {
  33   my $class = shift;
  34   my $self = bless { @_ }, $class;
  35
  36   # alternate
  37   $self->{alternate} =
  38      ! exists  $self->{alternate} ? 'shifted' :
  39      ! defined $self->{alternate} ? '' : $self->{alternate};
  40
  41   # collation level
  42   $self->{level} ||= ($self->{alternate} =~ /shift/ ? 4 : 3);
  43
  44   # normalization form
  45   $self->{normalization} = 'D' if ! exists $self->{normalization};
  46
  47   eval "use Unicode::Normalize;" if defined $self->{normalization};
  48
  49   $self->{normalize} =
  50     ! defined $self->{normalization}        ? undef :
  51     $self->{normalization} =~ /^(?:NF)?C$/  ? \&NFC :
  52     $self->{normalization} =~ /^(?:NF)?D$/  ? \&NFD :
  53     $self->{normalization} =~ /^(?:NF)?KC$/ ? \&NFKC :
  54     $self->{normalization} =~ /^(?:NF)?KD$/ ? \&NFKD :
  55     croak "$PACKAGE unknown normalization form name: $self->{normalization}";
  56
  57   *Combin = \%Unicode::Normalize::Combin if $self->{normalize} && ! %Combin;
  58
  59   # backwards
  60   $self->{backwards} ||= [];
  61   $self->{backwards} = [ $self->{backwards} ] if ! ref $self->{backwards};
  62
  63   # rearrange
  64   $self->{rearrange} ||= []; # maybe not U+0000 (an ASCII)
  65   $self->{rearrange} = [ $self->{rearrange} ] if ! ref $self->{rearrange};
  66
  67   # open the table file
  68   my $file = defined $self->{table} ? $self->{table} : $KeyFile;
  69   open my $fk, "<$Path/$file" or croak "File does not exist at $Path/$file";
  70
  71   while(<$fk>){
  72     next if /^\s*#/;
  73     if(/^\s*\@/){
  74        if(/^\@version\s*(\S*)/){
  75          $self->{version} ||= $1;
  76        }
  77        elsif(/^\@alternate\s+(.*)/){
  78          $self->{alternate} ||= $1;
  79        }
  80        elsif(/^\@backwards\s+(.*)/){
  81          push @{ $self->{backwards} }, $1;
  82        }
  83        elsif(/^\@rearrange\s+(.*)/){
  84          push @{ $self->{rearrange} }, _getHexArray($1);
  85        }
  86        next;
  87     }
  88     $self->parseEntry($_);
  89   }
  90   close $fk;
  91   if($self->{entry}){
  92     $self->parseEntry($_) foreach split /\n/, $self->{entry};
  93   }
  94
  95   # keys of $self->{rearrangeHash} are $self->{rearrange}.
  96   $self->{rearrangeHash} = {};
  97   @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
  98
  99   return $self;
 100 }
 101
 102 ##
 103 ## get $line, parse it, and write an entry in $self
 104 ##
 105 sub parseEntry
 106 {
 107   my $self = shift;
 108   my $line = shift;
 109   my($name, $ele, @key);
 110
 111   return if $line !~ /^\s*[0-9A-Fa-f]/;
 112
 113   # get name
 114   $name = $1 if $line =~ s/#\s*(.*)//;
 115   return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
 116
 117   # get element
 118   my($e, $k) = split /;/, $line;
 119   my @e = _getHexArray($e);
 120   $ele = pack('U*', @e);
 121   return if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
 122
 123   # get sort key
 124   if(
 125      defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/ ||
 126      defined $self->{ignoreChar} && $ele  =~ /$self->{ignoreChar}/
 127   )
 128   {
 129      $self->{entries}{$ele} = $self->{ignored}{$ele} = 1;
 130   }
 131   else
 132   {
 133     foreach my $arr ($k =~ /\[(\S+)\]/g) {
 134       my $var = $arr =~ /\*/;
 135       push @key, $self->altCE( $var, _getHexArray($arr) );
 136     }
 137     $self->{entries}{$ele} = \@key;
 138   }
 139   $self->{maxlength}{ord $ele} = scalar @e if @e > 1;
 140 }
 141
 142
 143 ##
 144 ## arrayref CE = altCE(bool variable?, list[num] weights)
 145 ##
 146 sub altCE
 147 {
 148   my $self = shift;
 149   my $var  = shift;
 150   my @c    = @_;
 151
 152   $self->{alternate} eq 'blanked' ?
 153     $var ? [0,0,0] : [ @c[0..2] ] :
 154   $self->{alternate} eq 'non-ignorable' ?
 155     [ @c[0..2] ] :
 156   $self->{alternate} eq 'shifted' ?
 157     $var ? [0,0,0,$c[0] ] : [ @c[0..2], $c[0]+$c[1]+$c[2] ? 0xFFFF : 0 ] :
 158   $self->{alternate} eq 'shift-trimmed' ?
 159     $var ? [0,0,0,$c[0] ] : [ @c[0..2], 0 ] :
 160    \@c;
 161 }
 162
 163 ##
 164 ## string hex_sortkey = splitCE(string arg)
 165 ##
 166 sub viewSortKey
 167 {
 168   my $self = shift;
 169   my $key  = $self->getSortKey(@_);
 170   my $view = join " ", map sprintf("%04X", $_), unpack 'n*', $key;
 171   $view =~ s/ ?0000 ?/|/g;
 172   "[$view]";
 173 }
 174
 175
 176 ##
 177 ## list[strings] elements = splitCE(string arg)
 178 ##
 179 sub splitCE
 180 {
 181   my $self = shift;
 182   my $code = $self->{preprocess};
 183   my $norm = $self->{normalize};
 184   my $ent  = $self->{entries};
 185   my $max  = $self->{maxlength};
 186   my $rear = $self->{rearrangeHash};
 187
 188   my $str = ref $code ? &$code(shift) : shift;
 189   $str = &$norm($str) if ref $norm;
 190
 191   my(@src, @buf);
 192   @src = unpack('U*', $str);
 193
 194   # rearrangement
 195   for(my $i = 0; $i < @src; $i++)
 196   {
 197      ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i])
 198         if $rear->{ $src[$i] };
 199      $i++;
 200   }
 201
 202   for(my $i = 0; $i < @src; $i++)
 203   {
 204     my $ch;
 205     my $u  = $src[$i];
 206
 207   # non-characters
 208     next if $u < 0 || 0x10FFFF < $u     # out of range
 209          || 0xD800 < $u && $u < 0xDFFF; # unpaired surrogates
 210     my $four = $u & 0xFFFF;
 211     next if $four == 0xFFFE || $four == 0xFFFF;
 212
 213     if($max->{$u}) # contract
 214     {
 215       for(my $j = $max->{$u}; $j >= 1; $j--)
 216       {
 217         next unless $i+$j-1 < @src;
 218         $ch = pack 'U*', @src[$i .. $i+$j-1];
 219         $i += $j-1, last if $ent->{$ch};
 220       }
 221     }
 222     else {  $ch = pack('U', $u) }
 223
 224     if(%Combin && defined $ch) # with Combining Char
 225     {
 226       for(my $j = $i+1; $j < @src && $Combin{ $src[$j] }; $j++)
 227       {
 228         my $comb = pack 'U', $src[$j];
 229         next if ! $ent->{ $ch.$comb };
 230         $ch .= $comb;
 231         splice(@src, $j, 1);
 232         last;
 233       }
 234     }
 235     push @buf, $ch;
 236   }
 237   wantarray ? @buf : \@buf;
 238 }
 239
 240
 241 ##
 242 ## list[arrayrefs] weight = getWt(string element)
 243 ##
 244 sub getWt
 245 {
 246   my $self = shift;
 247   my $ch   = shift;
 248   my $ent  = $self->{entries};
 249   my $ign  = $self->{ignored};
 250   my $cjk  = $self->{overrideCJK};
 251   my $hang = $self->{overrideHangul};
 252   return if !defined $ch || $ign->{$ch};   # ignored
 253   return @{ $ent->{$ch} } if $ent->{$ch};
 254   my $u = unpack('U', $ch);
 255   return
 256     _isHangul($u)
 257       ? $hang
 258         ? &$hang($u)
 259         : map(@{ $ent->{pack('U', $_)} }, decomposeHangul($u))
 260       : _isCJK($u)
 261         ? $cjk ? &$cjk($u) : map($self->altCE(0,@$_), _CJK($u))
 262         : map($self->altCE(0,@$_), _derivCE($u));
 263 }
 264
 265 ##
 266 ## int = index(string, substring)
 267 ##
 268 sub index
 269 {
 270   my $self = shift;
 271   my $lev  = $self->{level};
 272   my $str  = $self->splitCE(shift);
 273   my $sub  = $self->splitCE(shift);
 274
 275   return wantarray ? (0,0) : 0 if ! @$sub;
 276   return wantarray ?  ()  : -1 if ! @$str;
 277
 278   my @subWt = grep _ignorableAtLevel($_,$lev),
 279               map $self->getWt($_), @$sub;
 280
 281   my(@strWt,@strPt);
 282   my $count = 0;
 283   for my $e (@$str){
 284     my @tmp = grep _ignorableAtLevel($_,$lev), $self->getWt($e);
 285     push @strWt, @tmp;
 286     push @strPt, ($count) x @tmp;
 287     $count += length $e;
 288     while(@strWt >= @subWt){
 289       if(_eqArray(\@strWt, \@subWt, $lev)){
 290         my $pos = $strPt[0];
 291         return wantarray ? ($pos, $count-$pos) : $pos;
 292       }
 293       shift @strWt;
 294       shift @strPt;
 295     }
 296   }
 297   return wantarray ? () : -1;
 298 }
 299
 300 ##
 301 ## bool _eqArray(arrayref, arrayref, level)
 302 ##
 303 sub _eqArray($$$)
 304 {
 305   my $a   = shift; # length $a >= length $b;
 306   my $b   = shift;
 307   my $lev = shift;
 308   for my $v (0..$lev-1){
 309     for my $c (0..@$b-1){
 310       return if $a->[$c][$v] != $b->[$c][$v];
 311     }
 312   }
 313   return 1;
 314 }
 315
 316
 317 ##
 318 ## bool _ignorableAtLevel(CE, level)
 319 ##
 320 sub _ignorableAtLevel($$)
 321 {
 322   my $ce = shift;
 323   return if ! defined $ce;
 324   my $lv = shift;
 325   ! grep { ! $ce->[$_] } 0..$lv-1;
 326 }
 327
 328
 329 ##
 330 ## string sortkey = getSortKey(string arg)
 331 ##
 332 sub getSortKey
 333 {
 334   my $self = shift;
 335   my $lev  = $self->{level};
 336   my $rCE  = $self->splitCE(shift); # get an arrayref
 337
 338   # weight arrays
 339   my @buf = grep defined(), map $self->getWt($_), @$rCE;
 340
 341   # make sort key
 342   my @ret = ([],[],[],[]);
 343   foreach my $v (0..$lev-1){
 344     foreach my $b (@buf){
 345       push @{ $ret[$v] }, $b->[$v] if $b->[$v];
 346     }
 347   }
 348   foreach (@{ $self->{backwards} }){
 349     my $v = $_ - 1;
 350     @{ $ret[$v] } = reverse @{ $ret[$v] };
 351   }
 352
 353   # modification of tertiary weights
 354   if($self->{upper_before_lower}){
 355     foreach (@{ $ret[2] }){
 356       if   (0x8 <= $_ && $_ <= 0xC){ $_ -= 6 } # lower
 357       elsif(0x2 <= $_ && $_ <= 0x6){ $_ += 6 } # upper
 358       elsif($_ == 0x1C)            { $_ += 1 } # square upper
 359       elsif($_ == 0x1D)            { $_ -= 1 } # square lower
 360     }
 361   }
 362   if($self->{katakana_before_hiragana}){
 363     foreach (@{ $ret[2] }){
 364       if   (0x0F <= $_ && $_ <= 0x13){ $_ -= 2 } # katakana
 365       elsif(0x0D <= $_ && $_ <= 0x0E){ $_ += 5 } # hiragana
 366     }
 367   }
 368   join "\0\0", map pack('n*', @$_), @ret;
 369 }
 370
 371
 372 ##
 373 ## int compare = cmp(string a, string b)
 374 ##
 375 sub cmp
 376 {
 377   my $obj = shift;
 378   my $a   = shift;
 379   my $b   = shift;
 380   $obj->getSortKey($a) cmp $obj->getSortKey($b);
 381 }
 382
 383 ##
 384 ## list[strings] sorted = sort(list[strings] arg)
 385 ##
 386 sub sort
 387 {
 388   my $obj = shift;
 389
 390   map { $_->[1] }
 391   sort{ $a->[0] cmp $b->[0] }
 392   map [ $obj->getSortKey($_), $_ ], @_;
 393 }
 394
 395 ##
 396 ## list[arrayrefs] CE = _derivCE(int codepoint)
 397 ##
 398 sub _derivCE
 399 {
 400   my $code = shift;
 401   my $a = UNDEFINED + ($code >> 15); # ok
 402   my $b = ($code & 0x7FFF) | 0x8000; # ok
 403 # my $a = 0xFFC2 + ($code >> 15);    # ng
 404 # my $b = $code & 0x7FFF | 0x1000;   # ng
 405   $b ? ([$a,2,1,$code],[$b,0,0,$code]) : [$a,2,1,$code];
 406 }
 407
 408 ##
 409 ## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
 410 ##
 411 sub _getHexArray
 412 {
 413   my $str = shift;
 414   map hex(), $str =~ /([0-9a-fA-F]+)/g;
 415 }
 416
 417 ##
 418 ## bool is_a_CJK_Unified_Ideograph = _isCJK(int codepoint)
 419 ##
 420 sub _isCJK
 421 {
 422   my $u = shift;
 423   return 0x3400 <= $u && $u <= 0x4DB5
 424       || 0x4E00 <= $u && $u <= 0x9FA5
 425 #      || 0x20000 <= $u && $u <= 0x2A6D6;
 426 }
 427
 428 ##
 429 ## list[arrayref] CE = _CJK(int codepoint_of_CJK)
 430 ##
 431 sub _CJK
 432 {
 433   my $u = shift;
 434   $u > 0xFFFF ? _derivCE($u) : [$u,0x20,0x02,$u];
 435 }
 436
 437 ##
 438 ## bool is_a_Hangul_Syllable = _isHangul(int codepoint)
 439 ##
 440 sub _isHangul
 441 {
 442   my $code = shift;
 443   return 0xAC00 <= $code && $code <= 0xD7A3;
 444 }
 445
 446 1;
 447 __END__
 448
 449 =head1 NAME
 450
 451 Unicode::Collate - use UCA (Unicode Collation Algorithm)
 452
 453 =head1 SYNOPSIS
 454
 455   use Unicode::Collate;
 456
 457   #construct
 458   $UCA = Unicode::Collate->new(%tailoring);
 459
 460   #sort
 461   @sorted = $UCA->sort(@not_sorted);
 462
 463   #compare
 464   $result = $UCA->cmp($a, $b); # returns 1, 0, or -1.
 465
 466 =head1 DESCRIPTION
 467
 468 =head2 Constructor and Tailoring
 469
 470 The C<new> method returns a collator object.
 471
 472    $UCA = Unicode::Collate->new(
 473       alternate => $alternate,
 474       backwards => $levelNumber, # or \@levelNumbers
 475       entry => $element,
 476       normalization  => $normalization_form,
 477       ignoreName => qr/$ignoreName/,
 478       ignoreChar => qr/$ignoreChar/,
 479       katakana_before_hiragana => $bool,
 480       level => $collationLevel,
 481       overrideCJK => \&overrideCJK,
 482       overrideHangul => \&overrideHangul,
 483       preprocess => \&preprocess,
 484       rearrange => \@charList,
 485       table => $filename,
 486       undefName => qr/$undefName/,
 487       undefChar => qr/$undefChar/,
 488       upper_before_lower => $bool,
 489    );
 490    # if %tailoring is false (empty),
 491    # $UCA should do the default collation.
 492
 493 =over 4
 494
 495 =item alternate
 496
 497 -- see 3.2.2 Alternate Weighting, UTR #10.
 498
 499    alternate => 'shifted', 'blanked', 'non-ignorable', or 'shift-trimmed'.
 500
 501 By default (if specification is omitted), 'shifted' is adopted.
 502
 503 =item backwards
 504
 505 -- see 3.1.2 French Accents, UTR #10.
 506
 507      backwards => $levelNumber or \@levelNumbers
 508
 509 Weights in reverse order; ex. level 2 (diacritic ordering) in French.
 510 If omitted, forwards at all the levels.
 511
 512 =item entry
 513
 514 -- see 3.1 Linguistic Features; 3.2.1 File Format, UTR #10.
 515
 516 Overrides a default order or adds a new element
 517
 518   entry => <<'ENTRIES', # use the UCA file format
 519 00E6 ; [.0861.0020.0002.00E6] [.08B1.0020.0002.00E6] # ligature <ae> as <a e>
 520 0063 0068 ; [.0893.0020.0002.0063]      # "ch" in traditional Spanish
 521 0043 0068 ; [.0893.0020.0008.0043]      # "Ch" in traditional Spanish
 522 ENTRIES
 523
 524 =item ignoreName
 525
 526 =item ignoreChar
 527
 528 -- see Completely Ignorable, 3.2.2 Alternate Weighting, UTR #10.
 529
 530 Ignores the entry in the table.
 531 If an ignored collation element appears in the string to be collated,
 532 it is ignored as if the element had been deleted from there.
 533
 534 E.g. when 'a' and 'e' are ignored,
 535 'element' is equal to 'lament' (or 'lmnt').
 536
 537 =item level
 538
 539 -- see 4.3 Form a sort key for each string, UTR #10.
 540
 541 Set the maximum level.
 542 Any higher levels than the specified one are ignored.
 543
 544   Level 1: alphabetic ordering
 545   Level 2: diacritic ordering
 546   Level 3: case ordering
 547   Level 4: tie-breaking (e.g. in the case when alternate is 'shifted')
 548
 549   ex.level => 2,
 550
 551 =item normalization
 552
 553 -- see 4.1 Normalize each input string, UTR #10.
 554
  555 If specified, strings are normalized before preparation of sort keys
 556 (the normalization is executed after preprocess).
 557
 558 As a form name, one of the following names must be used.
 559
 560   'C'  or 'NFC'  for Normalization Form C
 561   'D'  or 'NFD'  for Normalization Form D
 562   'KC' or 'NFKC' for Normalization Form KC
 563   'KD' or 'NFKD' for Normalization Form KD
 564
 565 If omitted, the string is put into Normalization Form D.
 566
 567 If undefined explicitly (as C<normalization =E<gt> undef>),
  568 no normalization is carried out (this may make tailoring easier
  569 if no normalization is desired).
 570
 571 see B<CAVEAT>.
 572
 573 =item overrideCJK
 574
 575 =item overrideHangul
 576
 577 -- see 7.1 Derived Collation Elements, UTR #10.
 578
 579 By default, mapping of CJK Unified Ideographs
 580 uses the Unicode codepoint order
 581 and Hangul Syllables are decomposed into Hangul Jamo.
 582
 583 The mapping of CJK Unified Ideographs
  584 or Hangul Syllables may be overridden.
 585
 586 ex. CJK Unified Ideographs in the JIS codepoint order.
 587
 588   overrideCJK => sub {
 589     my $u = shift;               # get unicode codepoint
 590     my $b = pack('n', $u);       # to UTF-16BE
 591     my $s = your_unicode_to_sjis_converter($b); # convert
 592     my $n = unpack('n', $s);     # convert sjis to short
 593     [ $n, 1, 1 ];                # return collation element
 594   },
 595
 596 If you want to override the mapping of Hangul Syllables,
 597 the Normalization Forms D and KD are not appropriate
  598 (the syllables will be decomposed before the override is applied).
 599
 600 =item preprocess
 601
 602 -- see 5.1 Preprocessing, UTR #10.
 603
 604 If specified, the coderef is used to preprocess
 605 before the formation of sort keys.
 606
 607 ex. dropping English articles, such as "a" or "the".
 608 Then, "the pen" is before "a pencil".
 609
 610      preprocess => sub {
 611            my $str = shift;
 612            $str =~ s/\b(?:an?|the)\s+//g;
 613            $str;
 614         },
 615
 616 =item rearrange
 617
 618 -- see 3.1.3 Rearrangement, UTR #10.
 619
 620 Characters that are not coded in logical order and to be rearranged.
 621 By default,
 622
 623     rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
 624
 625 =item table
 626
 627 -- see 3.2 Default Unicode Collation Element Table, UTR #10.
 628
 629 You can use another element table if desired.
 630 The table file must be in your C<lib/Unicode/Collate> directory.
 631
 632 By default, the file C<lib/Unicode/Collate/allkeys.txt> is used.
 633
 634 =item undefName
 635
 636 =item undefChar
 637
 638 -- see 6.3.4 Reducing the Repertoire, UTR #10.
 639
 640 Undefines the collation element as if it were unassigned in the table.
 641 This reduces the size of the table.
 642 If an unassigned character appears in the string to be collated,
 643 the sort key is made from its codepoint
 644 as a single-character collation element,
 645 as it is greater than any other assigned collation elements
 646 (in the codepoint order among the unassigned characters).
 647 But, it'd be better to ignore characters
 648 unfamiliar to you and maybe never used.
 649
 650 =item katakana_before_hiragana
 651
 652 =item upper_before_lower
 653
 654 -- see 6.6 Case Comparisons; 7.3.1 Tertiary Weight Table, UTR #10.
 655
 656 By default, lowercase is before uppercase
 657 and hiragana is before katakana.
 658
 659 If the parameter is true, this is reversed.
 660
 661 =back
 662
 663 =head2 Other methods
 664
 665 =over 4
 666
 667 =item C<@sorted = $UCA-E<gt>sort(@not_sorted)>
 668
 669 Sorts a list of strings.
 670
 671 =item C<$result = $UCA-E<gt>cmp($a, $b)>
 672
 673 Returns 1 (when C<$a> is greater than C<$b>)
 674 or 0 (when C<$a> is equal to C<$b>)
  675 or -1 (when C<$a> is less than C<$b>).
 676
 677 =item C<$sortKey = $UCA-E<gt>getSortKey($string)>
 678
 679 -- see 4.3 Form a sort key for each string, UTR #10.
 680
 681 Returns a sort key.
 682
 683 You compare the sort keys using a binary comparison
 684 and get the result of the comparison of the strings using UCA.
 685
 686    $UCA->getSortKey($a) cmp $UCA->getSortKey($b)
 687
 688       is equivalent to
 689
 690    $UCA->cmp($a, $b)
 691
 692 =item C<$position = $UCA-E<gt>index($string, $substring)>
 693
 694 =item C<($position, $length) = $UCA-E<gt>index($string, $substring)>
 695
 696 -- see 6.8 Searching, UTR #10.
 697
 698 If C<$substring> matches a part of C<$string>, returns
 699 the position of the first occurrence of the matching part in scalar context;
 700 in list context, returns a two-element list of
 701 the position and the length of the matching part.
 702
 703 B<Notice> that the length of the matching part may differ from
 704 the length of C<$substring>.
 705
 706 B<Note> that the position and the length are counted on the string
 707 after the process of preprocess, normalization, and rearrangement.
 708 Therefore, in case the specified string is not binary equal to
 709 the preprocessed/normalized/rearranged string, the position and the length
  710 may differ from those on the specified string. But it is guaranteed
 711 that, if matched, it returns a non-negative value as C<$position>.
 712
 713 If C<$substring> does not match any part of C<$string>,
 714 returns C<-1> in scalar context and
 715 an empty list in list context.
 716
 717 e.g. you say
 718
 719   my $UCA = Unicode::Collate->new( normalization => undef, level => 1 );
 720   my $str = "Ich mu\x{00DF} studieren.";
 721   my $sub = "m\x{00FC}ss";
 722   my $match;
 723   if(my @tmp = $UCA->index($str, $sub)){
 724     $match = substr($str, $tmp[0], $tmp[1]);
 725   }
 726
 727 and get C<"mu\x{00DF}"> in C<$match> since C<"mu>E<223>C<">
 728 is primary equal to C<"m>E<252>C<ss">.
 729
 730 =back
 731
 732 =head2 EXPORT
 733
 734 None by default.
 735
 736 =head2 CAVEAT
 737
 738 Use of the C<normalization> parameter requires
 739 the B<Unicode::Normalize> module.
 740
  741 If you do not need it (e.g. when you do not need to
  742 handle any combining characters),
 743 assign C<normalization =E<gt> undef> explicitly.
 744
 745 =head1 AUTHOR
 746
 747 SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
 748
 749   http://homepage1.nifty.com/nomenclator/perl/
 750
 751   Copyright(C) 2001, SADAHIRO Tomoyuki. Japan. All rights reserved.
 752
 753   This program is free software; you can redistribute it and/or
 754   modify it under the same terms as Perl itself.
 755
 756 =head1 SEE ALSO
 757
 758 =over 4
 759
 760 =item L<Lingua::KO::Hangul::Util>
 761
 762 utility functions for Hangul Syllables
 763
 764 =item L<Unicode::Normalize>
 765
 766 normalized forms of Unicode text
 767
 768 =item Unicode Collation Algorithm - Unicode TR #10
 769
 770 http://www.unicode.org/unicode/reports/tr10/
 771
 772 =back
 773
 774 =cut