[p5sagit/p5-mst-13.2.git] / lib / Unicode / Collate.pm

package Unicode::Collate;

use 5.006;
use strict;
use warnings;
use Carp;
use Lingua::KO::Hangul::Util;
require Exporter;

our $VERSION = '0.08';
our $PACKAGE = __PACKAGE__;

our @ISA = qw(Exporter);

our %EXPORT_TAGS = ();
our @EXPORT_OK = ();
our @EXPORT = ();

(our $Path = $INC{'Unicode/Collate.pm'}) =~ s/\.pm$//;
our $KeyFile = "allkeys.txt";

our %Combin; # combining class from Unicode::Normalize

use constant Min2      => 0x20;   # minimum weight at level 2
use constant Min3      => 0x02;   # minimum weight at level 3
use constant UNDEFINED => 0xFF80; # special value for undefined CE

##
## constructor
##
sub new
{
  my $class = shift;
  my $self = bless { @_ }, $class;

  # alternate
  $self->{alternate} = 
     ! exists  $self->{alternate} ? 'shifted' :
     ! defined $self->{alternate} ? '' : $self->{alternate};

  # collation level
  $self->{level} ||= ($self->{alternate} =~ /shift/ ? 4 : 3);

  # normalization form
  $self->{normalization} = 'D' if ! exists $self->{normalization};

  eval "use Unicode::Normalize;" if defined $self->{normalization};

  $self->{normalize} = 
    ! defined $self->{normalization}        ? undef :
    $self->{normalization} =~ /^(?:NF)?C$/  ? \&NFC :
    $self->{normalization} =~ /^(?:NF)?D$/  ? \&NFD :
    $self->{normalization} =~ /^(?:NF)?KC$/ ? \&NFKC :
    $self->{normalization} =~ /^(?:NF)?KD$/ ? \&NFKD :
    croak "$PACKAGE unknown normalization form name: $self->{normalization}";

  *Combin = \%Unicode::Normalize::Combin if $self->{normalize} && ! %Combin;

  # backwards
  $self->{backwards} ||= [];
  $self->{backwards} = [ $self->{backwards} ] if ! ref $self->{backwards};

  # rearrange
  $self->{rearrange} ||= []; # maybe not U+0000 (an ASCII)
  $self->{rearrange} = [ $self->{rearrange} ] if ! ref $self->{rearrange};

  # open the table file
  my $file = defined $self->{table} ? $self->{table} : $KeyFile;
  open my $fk, "<$Path/$file" or croak "File does not exist at $Path/$file";

  while(<$fk>){
    next if /^\s*#/;
    if(/^\s*\@/){
       if(/^\@version\s*(\S*)/){
         $self->{version} ||= $1;
       }
       elsif(/^\@alternate\s+(.*)/){
         $self->{alternate} ||= $1;
       }
       elsif(/^\@backwards\s+(.*)/){
         push @{ $self->{backwards} }, $1;
       }
       elsif(/^\@rearrange\s+(.*)/){
         push @{ $self->{rearrange} }, _getHexArray($1);
       }
       next;
    }
    $self->parseEntry($_);
  }
  close $fk;
  if($self->{entry}){
    $self->parseEntry($_) foreach split /\n/, $self->{entry};
  }

  # keys of $self->{rearrangeHash} are $self->{rearrange}.
  $self->{rearrangeHash} = {};
  @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();

  return $self;
}

##
## get $line, parse it, and write an entry in $self
##
sub parseEntry
{
  my $self = shift;
  my $line = shift;
  my($name, $ele, @key);

  return if $line !~ /^\s*[0-9A-Fa-f]/;

  # get name
  $name = $1 if $line =~ s/#\s*(.*)//;
  return if defined $self->{undefName} && $name =~ /$self->{undefName}/;

  # get element
  my($e, $k) = split /;/, $line;
  my @e = _getHexArray($e);
  $ele = pack('U*', @e);
  return if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;

  # get sort key
  if(
     defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/ ||
     defined $self->{ignoreChar} && $ele  =~ /$self->{ignoreChar}/
  )
  {
     $self->{entries}{$ele} = $self->{ignored}{$ele} = 1;
  }
  else
  {
    foreach my $arr ($k =~ /\[(\S+)\]/g) {
      my $var = $arr =~ /\*/;
      push @key, $self->altCE( $var, _getHexArray($arr) );
    }
    $self->{entries}{$ele} = \@key;
  }
  $self->{maxlength}{ord $ele} = scalar @e if @e > 1;
}


##
## arrayref CE = altCE(bool variable?, list[num] weights)
##
sub altCE
{
  my $self = shift;
  my $var  = shift;
  my @c    = @_;

  $self->{alternate} eq 'blanked' ?
    $var ? [0,0,0] : [ @c[0..2] ] :
  $self->{alternate} eq 'non-ignorable' ?
    [ @c[0..2] ] :
  $self->{alternate} eq 'shifted' ?
    $var ? [0,0,0,$c[0] ] : [ @c[0..2], $c[0]+$c[1]+$c[2] ? 0xFFFF : 0 ] :
  $self->{alternate} eq 'shift-trimmed' ?
    $var ? [0,0,0,$c[0] ] : [ @c[0..2], 0 ] :
   \@c;
}

##
## string hex_sortkey = splitCE(string arg)
##
sub viewSortKey
{
  my $self = shift;
  my $key  = $self->getSortKey(@_);
  my $view = join " ", map sprintf("%04X", $_), unpack 'n*', $key;
  $view =~ s/ ?0000 ?/|/g;
  "[$view]";
}


##
## list[strings] elements = splitCE(string arg)
##
sub splitCE
{
  my $self = shift;
  my $code = $self->{preprocess};
  my $norm = $self->{normalize};
  my $ent  = $self->{entries};
  my $max  = $self->{maxlength};
  my $rear = $self->{rearrangeHash};

  my $str = ref $code ? &$code(shift) : shift;
  $str = &$norm($str) if ref $norm;

  my(@src, @buf);
  @src = unpack('U*', $str);

  # rearrangement
  for(my $i = 0; $i < @src; $i++)
  {
     ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i])
        if $rear->{ $src[$i] };
     $i++;
  }

  for(my $i = 0; $i < @src; $i++)
  {
    my $ch;
    my $u  = $src[$i];

  # non-characters
    next if $u < 0 || 0x10FFFF < $u     # out of range
         || 0xD800 < $u && $u < 0xDFFF; # unpaired surrogates
    my $four = $u & 0xFFFF; 
    next if $four == 0xFFFE || $four == 0xFFFF;

    if($max->{$u}) # contract
    {
      for(my $j = $max->{$u}; $j >= 1; $j--)
      { 
        next unless $i+$j-1 < @src;
        $ch = pack 'U*', @src[$i .. $i+$j-1];
        $i += $j-1, last if $ent->{$ch};
      }
    }
    else {  $ch = pack('U', $u) }

    if(%Combin && defined $ch) # with Combining Char
    {
      for(my $j = $i+1; $j < @src && $Combin{ $src[$j] }; $j++)
      {
        my $comb = pack 'U', $src[$j];
        next if ! $ent->{ $ch.$comb };
        $ch .= $comb;
        splice(@src, $j, 1);
        last;
      }
    }
    push @buf, $ch;
  }
  wantarray ? @buf : \@buf;
}


##
## list[arrayrefs] weight = getWt(string element)
##
sub getWt
{
  my $self = shift;
  my $ch   = shift;
  my $ent  = $self->{entries};
  my $ign  = $self->{ignored};
  my $cjk  = $self->{overrideCJK};
  my $hang = $self->{overrideHangul};
  return if !defined $ch || $ign->{$ch};   # ignored
  return @{ $ent->{$ch} } if $ent->{$ch};
  my $u = unpack('U', $ch);
  return
    _isHangul($u)
      ? $hang
        ? &$hang($u)
        : map(@{ $ent->{pack('U', $_)} }, decomposeHangul($u))
      : _isCJK($u)
        ? $cjk ? &$cjk($u) : map($self->altCE(0,@$_), _CJK($u))
        : map($self->altCE(0,@$_), _derivCE($u));
}

##
## int = index(string, substring)
##
sub index
{
  my $self = shift;
  my $lev  = $self->{level};
  my $str  = $self->splitCE(shift);
  my $sub  = $self->splitCE(shift);

  return wantarray ? (0,0) : 0 if ! @$sub;
  return wantarray ?  ()  : -1 if ! @$str;

  my @subWt = grep _ignorableAtLevel($_,$lev),
              map $self->getWt($_), @$sub;

  my(@strWt,@strPt);
  my $count = 0;
  for my $e (@$str){
    my @tmp = grep _ignorableAtLevel($_,$lev), $self->getWt($e);
    push @strWt, @tmp;
    push @strPt, ($count) x @tmp; 
    $count += length $e;
    while(@strWt >= @subWt){
      if(_eqArray(\@strWt, \@subWt, $lev)){
        my $pos = $strPt[0];
        return wantarray ? ($pos, $count-$pos) : $pos;
      }
      shift @strWt;
      shift @strPt;
    }
  }
  return wantarray ? () : -1;
}

##
## bool _eqArray(arrayref, arrayref, level)
##
sub _eqArray($$$)
{
  my $a   = shift; # length $a >= length $b;
  my $b   = shift;
  my $lev = shift;
  for my $v (0..$lev-1){
    for my $c (0..@$b-1){
      return if $a->[$c][$v] != $b->[$c][$v];
    }
  }
  return 1;
}


##
## bool _ignorableAtLevel(CE, level)
##
sub _ignorableAtLevel($$)
{
  my $ce = shift;
  return if ! defined $ce;
  my $lv = shift;
  ! grep { ! $ce->[$_] } 0..$lv-1;
}


##
## string sortkey = getSortKey(string arg)
##
sub getSortKey
{
  my $self = shift;
  my $lev  = $self->{level};
  my $rCE  = $self->splitCE(shift); # get an arrayref

  # weight arrays
  my @buf = grep defined(), map $self->getWt($_), @$rCE;

  # make sort key
  my @ret = ([],[],[],[]);
  foreach my $v (0..$lev-1){
    foreach my $b (@buf){
      push @{ $ret[$v] }, $b->[$v] if $b->[$v];
    }
  }
  foreach (@{ $self->{backwards} }){
    my $v = $_ - 1;
    @{ $ret[$v] } = reverse @{ $ret[$v] };
  }

  # modification of tertiary weights
  if($self->{upper_before_lower}){
    foreach (@{ $ret[2] }){
      if   (0x8 <= $_ && $_ <= 0xC){ $_ -= 6 } # lower
      elsif(0x2 <= $_ && $_ <= 0x6){ $_ += 6 } # upper
      elsif($_ == 0x1C)            { $_ += 1 } # square upper
      elsif($_ == 0x1D)            { $_ -= 1 } # square lower
    }
  }
  if($self->{katakana_before_hiragana}){
    foreach (@{ $ret[2] }){
      if   (0x0F <= $_ && $_ <= 0x13){ $_ -= 2 } # katakana
      elsif(0x0D <= $_ && $_ <= 0x0E){ $_ += 5 } # hiragana
    }
  }
  join "\0\0", map pack('n*', @$_), @ret;
}


##
## int compare = cmp(string a, string b)
##
sub cmp
{
  my $obj = shift;
  my $a   = shift;
  my $b   = shift;
  $obj->getSortKey($a) cmp $obj->getSortKey($b);
}

##
## list[strings] sorted = sort(list[strings] arg)
##
sub sort
{
  my $obj = shift;

  map { $_->[1] }
  sort{ $a->[0] cmp $b->[0] }
  map [ $obj->getSortKey($_), $_ ], @_;
}

##
## list[arrayrefs] CE = _derivCE(int codepoint)
##
sub _derivCE
{
  my $code = shift;
  my $a = UNDEFINED + ($code >> 15); # ok
  my $b = ($code & 0x7FFF) | 0x8000; # ok
# my $a = 0xFFC2 + ($code >> 15);    # ng
# my $b = $code & 0x7FFF | 0x1000;   # ng
  $b ? ([$a,2,1,$code],[$b,0,0,$code]) : [$a,2,1,$code];
}

##
## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
##
sub _getHexArray
{
  my $str = shift;
  map hex(), $str =~ /([0-9a-fA-F]+)/g;
}

##
## bool is_a_CJK_Unified_Ideograph = _isCJK(int codepoint)
##
sub _isCJK
{
  my $u = shift;
  return 0x3400 <= $u && $u <= 0x4DB5  
      || 0x4E00 <= $u && $u <= 0x9FA5  
#      || 0x20000 <= $u && $u <= 0x2A6D6;
}

##
## list[arrayref] CE = _CJK(int codepoint_of_CJK)
##
sub _CJK
{
  my $u = shift;
  $u > 0xFFFF ? _derivCE($u) : [$u,0x20,0x02,$u];
}

##
## bool is_a_Hangul_Syllable = _isHangul(int codepoint) 
##
sub _isHangul
{
  my $code = shift;
  return 0xAC00 <= $code && $code <= 0xD7A3;
}

1;
__END__

=head1 NAME

Unicode::Collate - use UCA (Unicode Collation Algorithm)

=head1 SYNOPSIS

  use Unicode::Collate;

  #construct
  $UCA = Unicode::Collate->new(%tailoring);

  #sort
  @sorted = $UCA->sort(@not_sorted);

  #compare
  $result = $UCA->cmp($a, $b); # returns 1, 0, or -1. 

=head1 DESCRIPTION

=head2 Constructor and Tailoring

The C<new> method returns a collator object.

   $UCA = Unicode::Collate->new(
      alternate => $alternate,
      backwards => $levelNumber, # or \@levelNumbers
      entry => $element,
      normalization  => $normalization_form,
      ignoreName => qr/$ignoreName/,
      ignoreChar => qr/$ignoreChar/,
      katakana_before_hiragana => $bool,
      level => $collationLevel,
      overrideCJK => \&overrideCJK,
      overrideHangul => \&overrideHangul,
      preprocess => \&preprocess,
      rearrange => \@charList,
      table => $filename,
      undefName => qr/$undefName/,
      undefChar => qr/$undefChar/,
      upper_before_lower => $bool,
   );
   # if %tailoring is false (empty),
   # $UCA should do the default collation.

=over 4

=item alternate

-- see 3.2.2 Alternate Weighting, UTR #10.

   alternate => 'shifted', 'blanked', 'non-ignorable', or 'shift-trimmed'.

By default (if specification is omitted), 'shifted' is adopted.

=item backwards

-- see 3.1.2 French Accents, UTR #10.

     backwards => $levelNumber or \@levelNumbers

Weights in reverse order; ex. level 2 (diacritic ordering) in French.
If omitted, forwards at all the levels.

=item entry

-- see 3.1 Linguistic Features; 3.2.1 File Format, UTR #10.

Overrides a default order or adds a new element

  entry => <<'ENTRIES', # use the UCA file format
00E6 ; [.0861.0020.0002.00E6] [.08B1.0020.0002.00E6] # ligature <ae> as <a e>
0063 0068 ; [.0893.0020.0002.0063]      # "ch" in traditional Spanish
0043 0068 ; [.0893.0020.0008.0043]      # "Ch" in traditional Spanish
ENTRIES

=item ignoreName

=item ignoreChar

-- see Completely Ignorable, 3.2.2 Alternate Weighting, UTR #10.

Ignores the entry in the table.
If an ignored collation element appears in the string to be collated,
it is ignored as if the element had been deleted from there.

E.g. when 'a' and 'e' are ignored,
'element' is equal to 'lament' (or 'lmnt').

=item level

-- see 4.3 Form a sort key for each string, UTR #10.

Set the maximum level.
Any higher levels than the specified one are ignored.

  Level 1: alphabetic ordering
  Level 2: diacritic ordering
  Level 3: case ordering
  Level 4: tie-breaking (e.g. in the case when alternate is 'shifted')

  ex.level => 2,

=item normalization

-- see 4.1 Normalize each input string, UTR #10.

If specified, strings are normalized before preparation sort keys
(the normalization is executed after preprocess).

As a form name, one of the following names must be used.

  'C'  or 'NFC'  for Normalization Form C
  'D'  or 'NFD'  for Normalization Form D
  'KC' or 'NFKC' for Normalization Form KC
  'KD' or 'NFKD' for Normalization Form KD

If omitted, the string is put into Normalization Form D.

If undefined explicitly (as C<normalization =E<gt> undef>),
any normalization is not carried out (this may make tailoring easier
if any normalization is not desired).

see B<CAVEAT>.

=item overrideCJK

=item overrideHangul

-- see 7.1 Derived Collation Elements, UTR #10.

By default, mapping of CJK Unified Ideographs
uses the Unicode codepoint order
and Hangul Syllables are decomposed into Hangul Jamo.

The mapping of CJK Unified Ideographs
or Hangul Syllables may be overrided.

ex. CJK Unified Ideographs in the JIS codepoint order.

  overrideCJK => sub {
    my $u = shift;               # get unicode codepoint
    my $b = pack('n', $u);       # to UTF-16BE
    my $s = your_unicode_to_sjis_converter($b); # convert
    my $n = unpack('n', $s);     # convert sjis to short
    [ $n, 1, 1 ];                # return collation element
  },

If you want to override the mapping of Hangul Syllables,
the Normalization Forms D and KD are not appropriate
(they will be decomposed before overriding).

=item preprocess

-- see 5.1 Preprocessing, UTR #10.

If specified, the coderef is used to preprocess
before the formation of sort keys.

ex. dropping English articles, such as "a" or "the". 
Then, "the pen" is before "a pencil".

     preprocess => sub {
           my $str = shift;
           $str =~ s/\b(?:an?|the)\s+//g;
           $str;
        },

=item rearrange

-- see 3.1.3 Rearrangement, UTR #10.

Characters that are not coded in logical order and to be rearranged.
By default, 

    rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],

=item table

-- see 3.2 Default Unicode Collation Element Table, UTR #10.

You can use another element table if desired.
The table file must be in your C<lib/Unicode/Collate> directory.

By default, the file C<lib/Unicode/Collate/allkeys.txt> is used.

=item undefName

=item undefChar

-- see 6.3.4 Reducing the Repertoire, UTR #10.

Undefines the collation element as if it were unassigned in the table.
This reduces the size of the table.
If an unassigned character appears in the string to be collated,
the sort key is made from its codepoint
as a single-character collation element,
as it is greater than any other assigned collation elements
(in the codepoint order among the unassigned characters).
But, it'd be better to ignore characters
unfamiliar to you and maybe never used.

=item katakana_before_hiragana

=item upper_before_lower

-- see 6.6 Case Comparisons; 7.3.1 Tertiary Weight Table, UTR #10.

By default, lowercase is before uppercase
and hiragana is before katakana.

If the parameter is true, this is reversed.

=back

=head2 Other methods

=over 4

=item C<@sorted = $UCA-E<gt>sort(@not_sorted)>

Sorts a list of strings.

=item C<$result = $UCA-E<gt>cmp($a, $b)>

Returns 1 (when C<$a> is greater than C<$b>)
or 0 (when C<$a> is equal to C<$b>)
or -1 (when C<$a> is lesser than C<$b>).

=item C<$sortKey = $UCA-E<gt>getSortKey($string)>

-- see 4.3 Form a sort key for each string, UTR #10.

Returns a sort key.

You compare the sort keys using a binary comparison
and get the result of the comparison of the strings using UCA.

   $UCA->getSortKey($a) cmp $UCA->getSortKey($b)

      is equivalent to

   $UCA->cmp($a, $b)

=item C<$position = $UCA-E<gt>index($string, $substring)>

=item C<($position, $length) = $UCA-E<gt>index($string, $substring)>

-- see 6.8 Searching, UTR #10.

If C<$substring> matches a part of C<$string>, returns
the position of the first occurrence of the matching part in scalar context;
in list context, returns a two-element list of
the position and the length of the matching part.

B<Notice> that the length of the matching part may differ from
the length of C<$substring>.

B<Note> that the position and the length are counted on the string
after the process of preprocess, normalization, and rearrangement.
Therefore, in case the specified string is not binary equal to
the preprocessed/normalized/rearranged string, the position and the length
may differ form those on the specified string. But it is guaranteed 
that, if matched, it returns a non-negative value as C<$position>.

If C<$substring> does not match any part of C<$string>,
returns C<-1> in scalar context and
an empty list in list context.

e.g. you say

  my $UCA = Unicode::Collate->new( normalization => undef, level => 1 );
  my $str = "Ich mu\x{00DF} studieren.";
  my $sub = "m\x{00FC}ss";
  my $match;
  if(my @tmp = $UCA->index($str, $sub)){
    $match = substr($str, $tmp[0], $tmp[1]);
  }

and get C<"mu\x{00DF}"> in C<$match> since C<"mu>E<223>C<">
is primary equal to C<"m>E<252>C<ss">. 

=back

=head2 EXPORT

None by default.

=head2 CAVEAT

Use of the C<normalization> parameter requires
the B<Unicode::Normalize> module.

If you need not it (e.g. in the case when you need not
handle any combining characters),
assign C<normalization =E<gt> undef> explicitly.

=head1 AUTHOR

SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>

  http://homepage1.nifty.com/nomenclator/perl/

  Copyright(C) 2001, SADAHIRO Tomoyuki. Japan. All rights reserved.

  This program is free software; you can redistribute it and/or 
  modify it under the same terms as Perl itself.

=head1 SEE ALSO

=over 4

=item L<Lingua::KO::Hangul::Util>

utility functions for Hangul Syllables

=item L<Unicode::Normalize>

normalized forms of Unicode text

=item Unicode Collation Algorithm - Unicode TR #10

http://www.unicode.org/unicode/reports/tr10/

=back

=cut
Commit	Line	Data
45394607	1	package Unicode::Collate;
	2
	3	use 5.006;
	4	use strict;
	5	use warnings;
	6	use Carp;
	7	use Lingua::KO::Hangul::Util;
	8	require Exporter;
	9
d16e9e3d	10	our $VERSION = '0.08';
45394607	11	our $PACKAGE = __PACKAGE__;
	12
	13	our @ISA = qw(Exporter);
	14
	15	our %EXPORT_TAGS = ();
	16	our @EXPORT_OK = ();
	17	our @EXPORT = ();
	18
	19	(our $Path = $INC{'Unicode/Collate.pm'}) =~ s/\.pm$//;
	20	our $KeyFile = "allkeys.txt";
	21
	22	our %Combin; # combining class from Unicode::Normalize
	23
	24	use constant Min2 => 0x20; # minimum weight at level 2
	25	use constant Min3 => 0x02; # minimum weight at level 3
	26	use constant UNDEFINED => 0xFF80; # special value for undefined CE
	27
	28	##
	29	## constructor
	30	##
	31	sub new
	32	{
	33	my $class = shift;
	34	my $self = bless { @_ }, $class;
	35
	36	# alternate
	37	$self->{alternate} =
	38	! exists $self->{alternate} ? 'shifted' :
	39	! defined $self->{alternate} ? '' : $self->{alternate};
	40
	41	# collation level
d16e9e3d	42	$self->{level} \|\|= ($self->{alternate} =~ /shift/ ? 4 : 3);
45394607	43
	44	# normalization form
	45	$self->{normalization} = 'D' if ! exists $self->{normalization};
	46
	47	eval "use Unicode::Normalize;" if defined $self->{normalization};
	48
	49	$self->{normalize} =
	50	! defined $self->{normalization} ? undef :
	51	$self->{normalization} =~ /^(?:NF)?C$/ ? \&NFC :
	52	$self->{normalization} =~ /^(?:NF)?D$/ ? \&NFD :
	53	$self->{normalization} =~ /^(?:NF)?KC$/ ? \&NFKC :
	54	$self->{normalization} =~ /^(?:NF)?KD$/ ? \&NFKD :
	55	croak "$PACKAGE unknown normalization form name: $self->{normalization}";
	56
	57	*Combin = \%Unicode::Normalize::Combin if $self->{normalize} && ! %Combin;
	58
	59	# backwards
	60	$self->{backwards} \|\|= [];
	61	$self->{backwards} = [ $self->{backwards} ] if ! ref $self->{backwards};
	62
	63	# rearrange
	64	$self->{rearrange} \|\|= []; # maybe not U+0000 (an ASCII)
	65	$self->{rearrange} = [ $self->{rearrange} ] if ! ref $self->{rearrange};
	66
	67	# open the table file
	68	my $file = defined $self->{table} ? $self->{table} : $KeyFile;
	69	open my $fk, "<$Path/$file" or croak "File does not exist at $Path/$file";
	70
	71	while(<$fk>){
	72	next if /^\s*#/;
	73	if(/^\s*\@/){
	74	if(/^\@version\s(\S)/){
	75	$self->{version} \|\|= $1;
	76	}
	77	elsif(/^\@alternate\s+(.*)/){
	78	$self->{alternate} \|\|= $1;
	79	}
	80	elsif(/^\@backwards\s+(.*)/){
	81	push @{ $self->{backwards} }, $1;
	82	}
	83	elsif(/^\@rearrange\s+(.*)/){
	84	push @{ $self->{rearrange} }, _getHexArray($1);
	85	}
	86	next;
	87	}
	88	$self->parseEntry($_);
	89	}
	90	close $fk;
	91	if($self->{entry}){
	92	$self->parseEntry($_) foreach split /\n/, $self->{entry};
	93	}
	94
	95	# keys of $self->{rearrangeHash} are $self->{rearrange}.
	96	$self->{rearrangeHash} = {};
	97	@{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
	98
	99	return $self;
	100	}
	101
	102	##
	103	## get $line, parse it, and write an entry in $self
	104	##
	105	sub parseEntry
	106	{
107	my $self = shift;
108	my $line = shift;
109	my($name, $ele, @key);
110
111	return if $line !~ /^\s*[0-9A-Fa-f]/;
112
113	# get name
114	$name = $1 if $line =~ s/#\s(.)//;
115	return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
116
117	# get element
118	my($e, $k) = split /;/, $line;
119	my @e = _getHexArray($e);
120	$ele = pack('U*', @e);
121	return if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
122
123	# get sort key
124	if(
125	defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/ \|\|
126	defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/
127	)
128	{
d16e9e3d	129	$self->{entries}{$ele} = $self->{ignored}{$ele} = 1;
45394607	130	}
	131	else
	132	{
	133	foreach my $arr ($k =~ /\[(\S+)\]/g) {
	134	my $var = $arr =~ /\*/;
d16e9e3d	135	push @key, $self->altCE( $var, _getHexArray($arr) );
45394607	136	}
	137	$self->{entries}{$ele} = \@key;
	138	}
	139	$self->{maxlength}{ord $ele} = scalar @e if @e > 1;
	140	}
	141
	142
	143	##
d16e9e3d	144	## arrayref CE = altCE(bool variable?, list[num] weights)
45394607	145	##
d16e9e3d	146	sub altCE
45394607	147	{
	148	my $self = shift;
	149	my $var = shift;
	150	my @c = @_;
	151
	152	$self->{alternate} eq 'blanked' ?
d16e9e3d	153	$var ? [0,0,0] : [ @c[0..2] ] :
	154	$self->{alternate} eq 'non-ignorable' ?
	155	[ @c[0..2] ] :
45394607	156	$self->{alternate} eq 'shifted' ?
	157	$var ? [0,0,0,$c[0] ] : [ @c[0..2], $c[0]+$c[1]+$c[2] ? 0xFFFF : 0 ] :
	158	$self->{alternate} eq 'shift-trimmed' ?
	159	$var ? [0,0,0,$c[0] ] : [ @c[0..2], 0 ] :
	160	\@c;
	161	}
	162
	163	##
d16e9e3d	164	## string hex_sortkey = splitCE(string arg)
45394607	165	##
	166	sub viewSortKey
	167	{
	168	my $self = shift;
	169	my $key = $self->getSortKey(@_);
	170	my $view = join " ", map sprintf("%04X", $_), unpack 'n*', $key;
	171	$view =~ s/ ?0000 ?/\|/g;
	172	"[$view]";
	173	}
	174
d16e9e3d	175
45394607	176	##
d16e9e3d	177	## list[strings] elements = splitCE(string arg)
45394607	178	##
d16e9e3d	179	sub splitCE
45394607	180	{
	181	my $self = shift;
	182	my $code = $self->{preprocess};
	183	my $norm = $self->{normalize};
	184	my $ent = $self->{entries};
45394607	185	my $max = $self->{maxlength};
45394607	186	my $rear = $self->{rearrangeHash};
	187
	188	my $str = ref $code ? &$code(shift) : shift;
	189	$str = &$norm($str) if ref $norm;
	190
	191	my(@src, @buf);
	192	@src = unpack('U*', $str);
	193
	194	# rearrangement
	195	for(my $i = 0; $i < @src; $i++)
	196	{
	197	($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i])
	198	if $rear->{ $src[$i] };
	199	$i++;
	200	}
	201
	202	for(my $i = 0; $i < @src; $i++)
	203	{
	204	my $ch;
	205	my $u = $src[$i];
	206
	207	# non-characters
	208	next if $u < 0 \|\| 0x10FFFF < $u # out of range
	209	\|\| 0xD800 < $u && $u < 0xDFFF; # unpaired surrogates
	210	my $four = $u & 0xFFFF;
	211	next if $four == 0xFFFE \|\| $four == 0xFFFF;
	212
	213	if($max->{$u}) # contract
	214	{
	215	for(my $j = $max->{$u}; $j >= 1; $j--)
	216	{
	217	next unless $i+$j-1 < @src;
	218	$ch = pack 'U*', @src[$i .. $i+$j-1];
	219	$i += $j-1, last if $ent->{$ch};
	220	}
	221	}
	222	else { $ch = pack('U', $u) }
	223
	224	if(%Combin && defined $ch) # with Combining Char
	225	{
	226	for(my $j = $i+1; $j < @src && $Combin{ $src[$j] }; $j++)
	227	{
	228	my $comb = pack 'U', $src[$j];
	229	next if ! $ent->{ $ch.$comb };
	230	$ch .= $comb;
	231	splice(@src, $j, 1);
	232	last;
	233	}
	234	}
d16e9e3d	235	push @buf, $ch;
	236	}
	237	wantarray ? @buf : \@buf;
	238	}
45394607	239
d16e9e3d	240
	241	##
	242	## list[arrayrefs] weight = getWt(string element)
	243	##
	244	sub getWt
	245	{
	246	my $self = shift;
	247	my $ch = shift;
	248	my $ent = $self->{entries};
	249	my $ign = $self->{ignored};
	250	my $cjk = $self->{overrideCJK};
	251	my $hang = $self->{overrideHangul};
	252	return if !defined $ch \|\| $ign->{$ch}; # ignored
	253	return @{ $ent->{$ch} } if $ent->{$ch};
	254	my $u = unpack('U', $ch);
	255	return
	256	_isHangul($u)
	257	? $hang
	258	? &$hang($u)
	259	: map(@{ $ent->{pack('U', $_)} }, decomposeHangul($u))
	260	: _isCJK($u)
	261	? $cjk ? &$cjk($u) : map($self->altCE(0,@$_), _CJK($u))
	262	: map($self->altCE(0,@$_), _derivCE($u));
	263	}
	264
	265	##
	266	## int = index(string, substring)
	267	##
	268	sub index
	269	{
	270	my $self = shift;
	271	my $lev = $self->{level};
	272	my $str = $self->splitCE(shift);
	273	my $sub = $self->splitCE(shift);
	274
	275	return wantarray ? (0,0) : 0 if ! @$sub;
	276	return wantarray ? () : -1 if ! @$str;
	277
	278	my @subWt = grep _ignorableAtLevel($_,$lev),
	279	map $self->getWt($_), @$sub;
	280
	281	my(@strWt,@strPt);
	282	my $count = 0;
	283	for my $e (@$str){
	284	my @tmp = grep _ignorableAtLevel($_,$lev), $self->getWt($e);
	285	push @strWt, @tmp;
	286	push @strPt, ($count) x @tmp;
	287	$count += length $e;
	288	while(@strWt >= @subWt){
	289	if(_eqArray(\@strWt, \@subWt, $lev)){
	290	my $pos = $strPt[0];
	291	return wantarray ? ($pos, $count-$pos) : $pos;
	292	}
	293	shift @strWt;
	294	shift @strPt;
	295	}
	296	}
	297	return wantarray ? () : -1;
	298	}
	299
	300	##
	301	## bool _eqArray(arrayref, arrayref, level)
	302	##
	303	sub _eqArray($$$)
304	{
305	my $a = shift; # length $a >= length $b;
306	my $b = shift;
307	my $lev = shift;
308	for my $v (0..$lev-1){
309	for my $c (0..@$b-1){
310	return if $a->[$c][$v] != $b->[$c][$v];
311	}
45394607	312	}
d16e9e3d	313	return 1;
	314	}
	315
	316
	317	##
	318	## bool _ignorableAtLevel(CE, level)
	319	##
	320	sub _ignorableAtLevel($$)
	321	{
	322	my $ce = shift;
	323	return if ! defined $ce;
	324	my $lv = shift;
	325	! grep { ! $ce->[$_] } 0..$lv-1;
	326	}
	327
	328
	329	##
	330	## string sortkey = getSortKey(string arg)
	331	##
	332	sub getSortKey
	333	{
	334	my $self = shift;
	335	my $lev = $self->{level};
	336	my $rCE = $self->splitCE(shift); # get an arrayref
	337
	338	# weight arrays
	339	my @buf = grep defined(), map $self->getWt($_), @$rCE;
45394607	340
	341	# make sort key
	342	my @ret = ([],[],[],[]);
	343	foreach my $v (0..$lev-1){
	344	foreach my $b (@buf){
	345	push @{ $ret[$v] }, $b->[$v] if $b->[$v];
	346	}
	347	}
	348	foreach (@{ $self->{backwards} }){
	349	my $v = $_ - 1;
	350	@{ $ret[$v] } = reverse @{ $ret[$v] };
	351	}
	352
	353	# modification of tertiary weights
	354	if($self->{upper_before_lower}){
	355	foreach (@{ $ret[2] }){
	356	if (0x8 <= $_ && $_ <= 0xC){ $_ -= 6 } # lower
	357	elsif(0x2 <= $_ && $_ <= 0x6){ $_ += 6 } # upper
	358	elsif($_ == 0x1C) { $_ += 1 } # square upper
	359	elsif($_ == 0x1D) { $_ -= 1 } # square lower
	360	}
	361	}
	362	if($self->{katakana_before_hiragana}){
	363	foreach (@{ $ret[2] }){
	364	if (0x0F <= $_ && $_ <= 0x13){ $_ -= 2 } # katakana
	365	elsif(0x0D <= $_ && $_ <= 0x0E){ $_ += 5 } # hiragana
	366	}
	367	}
	368	join "\0\0", map pack('n*', @$_), @ret;
	369	}
	370
	371
	372	##
d16e9e3d	373	## int compare = cmp(string a, string b)
45394607	374	##
	375	sub cmp
	376	{
	377	my $obj = shift;
	378	my $a = shift;
	379	my $b = shift;
	380	$obj->getSortKey($a) cmp $obj->getSortKey($b);
	381	}
	382
	383	##
d16e9e3d	384	## list[strings] sorted = sort(list[strings] arg)
45394607	385	##
	386	sub sort
	387	{
	388	my $obj = shift;
	389
	390	map { $_->[1] }
	391	sort{ $a->[0] cmp $b->[0] }
	392	map [ $obj->getSortKey($_), $_ ], @_;
	393	}
	394
	395	##
d16e9e3d	396	## list[arrayrefs] CE = _derivCE(int codepoint)
45394607	397	##
	398	sub _derivCE
	399	{
	400	my $code = shift;
	401	my $a = UNDEFINED + ($code >> 15); # ok
	402	my $b = ($code & 0x7FFF) \| 0x8000; # ok
	403	# my $a = 0xFFC2 + ($code >> 15); # ng
	404	# my $b = $code & 0x7FFF \| 0x1000; # ng
	405	$b ? ([$a,2,1,$code],[$b,0,0,$code]) : [$a,2,1,$code];
	406	}
	407
	408	##
	409	## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
	410	##
	411	sub _getHexArray
	412	{
	413	my $str = shift;
	414	map hex(), $str =~ /([0-9a-fA-F]+)/g;
	415	}
	416
	417	##
d16e9e3d	418	## bool is_a_CJK_Unified_Ideograph = _isCJK(int codepoint)
45394607	419	##
	420	sub _isCJK
	421	{
	422	my $u = shift;
	423	return 0x3400 <= $u && $u <= 0x4DB5
	424	\|\| 0x4E00 <= $u && $u <= 0x9FA5
	425	# \|\| 0x20000 <= $u && $u <= 0x2A6D6;
	426	}
	427
	428	##
d16e9e3d	429	## list[arrayref] CE = _CJK(int codepoint_of_CJK)
45394607	430	##
	431	sub _CJK
	432	{
	433	my $u = shift;
	434	$u > 0xFFFF ? _derivCE($u) : [$u,0x20,0x02,$u];
	435	}
	436
	437	##
d16e9e3d	438	## bool is_a_Hangul_Syllable = _isHangul(int codepoint)
45394607	439	##
	440	sub _isHangul
	441	{
	442	my $code = shift;
	443	return 0xAC00 <= $code && $code <= 0xD7A3;
	444	}
	445
	446	1;
	447	__END__
	448
	449	=head1 NAME
	450
	451	Unicode::Collate - use UCA (Unicode Collation Algorithm)
	452
	453	=head1 SYNOPSIS
	454
	455	use Unicode::Collate;
	456
	457	#construct
	458	$UCA = Unicode::Collate->new(%tailoring);
	459
	460	#sort
	461	@sorted = $UCA->sort(@not_sorted);
	462
	463	#compare
	464	$result = $UCA->cmp($a, $b); # returns 1, 0, or -1.
	465
	466	=head1 DESCRIPTION
	467
	468	=head2 Constructor and Tailoring
	469
d16e9e3d	470	The C<new> method returns a collator object.
d16e9e3d	471
45394607	472	$UCA = Unicode::Collate->new(
	473	alternate => $alternate,
	474	backwards => $levelNumber, # or \@levelNumbers
	475	entry => $element,
	476	normalization => $normalization_form,
	477	ignoreName => qr/$ignoreName/,
	478	ignoreChar => qr/$ignoreChar/,
	479	katakana_before_hiragana => $bool,
	480	level => $collationLevel,
	481	overrideCJK => \&overrideCJK,
	482	overrideHangul => \&overrideHangul,
	483	preprocess => \&preprocess,
	484	rearrange => \@charList,
	485	table => $filename,
	486	undefName => qr/$undefName/,
	487	undefChar => qr/$undefChar/,
	488	upper_before_lower => $bool,
	489	);
	490	# if %tailoring is false (empty),
	491	# $UCA should do the default collation.
	492
	493	=over 4
	494
	495	=item alternate
	496
	497	-- see 3.2.2 Alternate Weighting, UTR #10.
	498
	499	alternate => 'shifted', 'blanked', 'non-ignorable', or 'shift-trimmed'.
	500
	501	By default (if specification is omitted), 'shifted' is adopted.
	502
	503	=item backwards
	504
	505	-- see 3.1.2 French Accents, UTR #10.
	506
	507	backwards => $levelNumber or \@levelNumbers
	508
	509	Weights in reverse order; ex. level 2 (diacritic ordering) in French.
	510	If omitted, forwards at all the levels.
	511
	512	=item entry
	513
	514	-- see 3.1 Linguistic Features; 3.2.1 File Format, UTR #10.
	515
	516	Overrides a default order or adds a new element
	517
	518	entry => <<'ENTRIES', # use the UCA file format
	519	00E6 ; [.0861.0020.0002.00E6] [.08B1.0020.0002.00E6] # ligature <ae> as <a e>
	520	0063 0068 ; [.0893.0020.0002.0063] # "ch" in traditional Spanish
	521	0043 0068 ; [.0893.0020.0008.0043] # "Ch" in traditional Spanish
	522	ENTRIES
	523
	524	=item ignoreName
	525
	526	=item ignoreChar
	527
	528	-- see Completely Ignorable, 3.2.2 Alternate Weighting, UTR #10.
	529
	530	Ignores the entry in the table.
	531	If an ignored collation element appears in the string to be collated,
	532	it is ignored as if the element had been deleted from there.
	533
	534	E.g. when 'a' and 'e' are ignored,
	535	'element' is equal to 'lament' (or 'lmnt').
536
537	=item level
538
539	-- see 4.3 Form a sort key for each string, UTR #10.
540
541	Set the maximum level.
542	Any higher levels than the specified one are ignored.
543
544	Level 1: alphabetic ordering
545	Level 2: diacritic ordering
546	Level 3: case ordering
547	Level 4: tie-breaking (e.g. in the case when alternate is 'shifted')
548
549	ex.level => 2,
550
551	=item normalization
552
553	-- see 4.1 Normalize each input string, UTR #10.
554
555	If specified, strings are normalized before preparation sort keys
556	(the normalization is executed after preprocess).
557
558	As a form name, one of the following names must be used.
559
560	'C' or 'NFC' for Normalization Form C
561	'D' or 'NFD' for Normalization Form D
562	'KC' or 'NFKC' for Normalization Form KC
563	'KD' or 'NFKD' for Normalization Form KD
564
565	If omitted, the string is put into Normalization Form D.
566
567	If undefined explicitly (as C<normalization =E<gt> undef>),
568	any normalization is not carried out (this may make tailoring easier
569	if any normalization is not desired).
570
571	see B<CAVEAT>.
572
573	=item overrideCJK
574
575	=item overrideHangul
576
577	-- see 7.1 Derived Collation Elements, UTR #10.
578
579	By default, mapping of CJK Unified Ideographs
580	uses the Unicode codepoint order
581	and Hangul Syllables are decomposed into Hangul Jamo.
582
583	The mapping of CJK Unified Ideographs
584	or Hangul Syllables may be overrided.
585
586	ex. CJK Unified Ideographs in the JIS codepoint order.
587
588	overrideCJK => sub {
589	my $u = shift; # get unicode codepoint
590	my $b = pack('n', $u); # to UTF-16BE
591	my $s = your_unicode_to_sjis_converter($b); # convert
592	my $n = unpack('n', $s); # convert sjis to short
593	[ $n, 1, 1 ]; # return collation element
594	},
595
596	If you want to override the mapping of Hangul Syllables,
597	the Normalization Forms D and KD are not appropriate
598	(they will be decomposed before overriding).
599
600	=item preprocess
601
602	-- see 5.1 Preprocessing, UTR #10.
603
604	If specified, the coderef is used to preprocess
605	before the formation of sort keys.
606
607	ex. dropping English articles, such as "a" or "the".
608	Then, "the pen" is before "a pencil".
609
610	preprocess => sub {
611	my $str = shift;
612	$str =~ s/\b(?:an?\|the)\s+//g;
613	$str;
614	},
615
616	=item rearrange
617
618	-- see 3.1.3 Rearrangement, UTR #10.
619
620	Characters that are not coded in logical order and to be rearranged.
621	By default,
622
623	rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
624
625	=item table
626
627	-- see 3.2 Default Unicode Collation Element Table, UTR #10.
628
629	You can use another element table if desired.
630	The table file must be in your C<lib/Unicode/Collate> directory.
631
632	By default, the file C<lib/Unicode/Collate/allkeys.txt> is used.
633
634	=item undefName
635
636	=item undefChar
637
638	-- see 6.3.4 Reducing the Repertoire, UTR #10.
639
640	Undefines the collation element as if it were unassigned in the table.
641	This reduces the size of the table.
642	If an unassigned character appears in the string to be collated,
643	the sort key is made from its codepoint
644	as a single-character collation element,
645	as it is greater than any other assigned collation elements
646	(in the codepoint order among the unassigned characters).
647	But, it'd be better to ignore characters
648	unfamiliar to you and maybe never used.
649
650	=item katakana_before_hiragana
651
652	=item upper_before_lower
653
654	-- see 6.6 Case Comparisons; 7.3.1 Tertiary Weight Table, UTR #10.
655
656	By default, lowercase is before uppercase
657	and hiragana is before katakana.
658
659	If the parameter is true, this is reversed.
660
661	=back
662
663	=head2 Other methods
664
665	=over 4
666
667	=item C<@sorted = $UCA-E<gt>sort(@not_sorted)>
668
669	Sorts a list of strings.
670
671	=item C<$result = $UCA-E<gt>cmp($a, $b)>
672
673	Returns 1 (when C<$a> is greater than C<$b>)
674	or 0 (when C<$a> is equal to C<$b>)
675	or -1 (when C<$a> is lesser than C<$b>).
676
677	=item C<$sortKey = $UCA-E<gt>getSortKey($string)>
678
679	-- see 4.3 Form a sort key for each string, UTR #10.
680
681	Returns a sort key.
682
683	You compare the sort keys using a binary comparison
684	and get the result of the comparison of the strings using UCA.
685
686	$UCA->getSortKey($a) cmp $UCA->getSortKey($b)
687
688	is equivalent to
689
690	$UCA->cmp($a, $b)
691
d16e9e3d	692	=item C<$position = $UCA-E<gt>index($string, $substring)>
	693
	694	=item C<($position, $length) = $UCA-E<gt>index($string, $substring)>
	695
	696	-- see 6.8 Searching, UTR #10.
	697
	698	If C<$substring> matches a part of C<$string>, returns
	699	the position of the first occurrence of the matching part in scalar context;
	700	in list context, returns a two-element list of
	701	the position and the length of the matching part.
	702
	703	B<Notice> that the length of the matching part may differ from
	704	the length of C<$substring>.
	705
	706	B<Note> that the position and the length are counted on the string
	707	after the process of preprocess, normalization, and rearrangement.
	708	Therefore, in case the specified string is not binary equal to
	709	the preprocessed/normalized/rearranged string, the position and the length
	710	may differ form those on the specified string. But it is guaranteed
	711	that, if matched, it returns a non-negative value as C<$position>.
	712
	713	If C<$substring> does not match any part of C<$string>,
	714	returns C<-1> in scalar context and
	715	an empty list in list context.
	716
	717	e.g. you say
	718
	719	my $UCA = Unicode::Collate->new( normalization => undef, level => 1 );
	720	my $str = "Ich mu\x{00DF} studieren.";
	721	my $sub = "m\x{00FC}ss";
	722	my $match;
	723	if(my @tmp = $UCA->index($str, $sub)){
	724	$match = substr($str, $tmp[0], $tmp[1]);
	725	}
	726
	727	and get C<"mu\x{00DF}"> in C<$match> since C<"mu>E<223>C<">
	728	is primary equal to C<"m>E<252>C<ss">.
	729
45394607	730	=back
	731
	732	=head2 EXPORT
	733
	734	None by default.
	735
	736	=head2 CAVEAT
	737
	738	Use of the C<normalization> parameter requires
	739	the B<Unicode::Normalize> module.
	740
	741	If you need not it (e.g. in the case when you need not
	742	handle any combining characters),
	743	assign C<normalization =E<gt> undef> explicitly.
	744
	745	=head1 AUTHOR
	746
	747	SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
	748
	749	http://homepage1.nifty.com/nomenclator/perl/
	750
	751	Copyright(C) 2001, SADAHIRO Tomoyuki. Japan. All rights reserved.
	752
	753	This program is free software; you can redistribute it and/or
	754	modify it under the same terms as Perl itself.
	755
	756	=head1 SEE ALSO
	757
	758	=over 4
	759
	760	=item L<Lingua::KO::Hangul::Util>
	761
	762	utility functions for Hangul Syllables
	763
	764	=item L<Unicode::Normalize>
	765
	766	normalized forms of Unicode text
	767
	768	=item Unicode Collation Algorithm - Unicode TR #10
	769
	770	http://www.unicode.org/unicode/reports/tr10/
	771
	772	=back
	773
	774	=cut