[p5sagit/p5-mst-13.2.git] / lib / Unicode / Collate.pm

package Unicode::Collate;

use 5.006;
use strict;
use warnings;
use Carp;
use Lingua::KO::Hangul::Util;
require Exporter;

our $VERSION = '0.07';
our $PACKAGE = __PACKAGE__;

our @ISA = qw(Exporter);

our %EXPORT_TAGS = ();
our @EXPORT_OK = ();
our @EXPORT = ();

# Directory holding the collation element tables: the path this module
# was loaded from (as recorded in %INC) with the trailing ".pm" removed.
(our $Path = $INC{'Unicode/Collate.pm'}) =~ s/\.pm$//;
# Table file read by new() when no 'table' parameter is given.
our $KeyFile = "allkeys.txt";

# Aliased by new() to %Unicode::Normalize::Combin once normalization is
# in use; stays empty otherwise.
our %Combin; # combining class from Unicode::Normalize

use constant Min2      => 0x20;   # minimum weight at level 2
use constant Min3      => 0x02;   # minimum weight at level 3
use constant UNDEFINED => 0xFF80; # special value for undefined CE

##
## constructor
##
## Unicode::Collate->new(%tailoring) blesses the tailoring pairs into the
## object, fills in the defaults, then reads the collation element table
## "$Path/<table>" (default: $KeyFile) followed by any lines passed
## through the 'entry' parameter.
##
## Croaks when the normalization form name is unknown, when
## Unicode::Normalize is required but cannot be loaded, or when the
## table file cannot be opened.
##
sub new
{
  my $class = shift;
  my $self = bless { @_ }, $class;

  # alternate: omitted => 'shifted'; explicit undef => ''
  $self->{alternate} =
     ! exists  $self->{alternate} ? 'shifted' :
     ! defined $self->{alternate} ? '' : $self->{alternate};

  # collation level: 4 for the 'shift...' alternates, otherwise 3
  $self->{level} ||= $self->{alternate} =~ /shift/ ? 4 : 3;

  # normalization form: omitted => 'D'; explicit undef => none
  $self->{normalization} = 'D' if ! exists $self->{normalization};

  # Load Unicode::Normalize only when a form is requested; its import
  # supplies the NFC/NFD/NFKC/NFKD functions referenced below.  Fail here
  # rather than hand back an object whose normalizer is an undefined sub.
  if(defined $self->{normalization}){
    eval { require Unicode::Normalize; Unicode::Normalize->import; 1 }
      or croak "$PACKAGE cannot load Unicode::Normalize: $@";
  }

  $self->{normalize} =
    ! defined $self->{normalization}        ? undef :
    $self->{normalization} =~ /^(?:NF)?C$/  ? \&NFC :
    $self->{normalization} =~ /^(?:NF)?D$/  ? \&NFD :
    $self->{normalization} =~ /^(?:NF)?KC$/ ? \&NFKC :
    $self->{normalization} =~ /^(?:NF)?KD$/ ? \&NFKD :
    croak "$PACKAGE unknown normalization form name: $self->{normalization}";

  *Combin = \%Unicode::Normalize::Combin if $self->{normalize} && ! %Combin;

  # backwards: a level number or a reference to an array of them.
  # The array is copied so that "@backwards" lines in the table are
  # never pushed onto an array owned by the caller.
  $self->{backwards} ||= [];
  $self->{backwards} = ref $self->{backwards}
    ? [ @{ $self->{backwards} } ]
    : [ $self->{backwards} ];

  # rearrange: copied for the same reason as backwards
  $self->{rearrange} ||= []; # maybe not U+0000 (an ASCII)
  $self->{rearrange} = ref $self->{rearrange}
    ? [ @{ $self->{rearrange} } ]
    : [ $self->{rearrange} ];

  # open the table file (three-argument open: the name is never parsed
  # for a mode)
  my $file = defined $self->{table} ? $self->{table} : $KeyFile;
  open my $fk, '<', "$Path/$file"
    or croak "File does not exist at $Path/$file";

  # A lexical line variable is used so that the caller's $_ survives.
  while(defined(my $line = <$fk>)){
    next if $line =~ /^\s*#/;
    if($line =~ /^\s*\@/){
       # "@..." directives; values given to new() win over @version
       # and @alternate, while @backwards and @rearrange are appended
       if($line =~ /^\@version\s*(\S*)/){
         $self->{version} ||= $1;
       }
       elsif($line =~ /^\@alternate\s+(.*)/){
         $self->{alternate} ||= $1;
       }
       elsif($line =~ /^\@backwards\s+(.*)/){
         push @{ $self->{backwards} }, $1;
       }
       elsif($line =~ /^\@rearrange\s+(.*)/){
         push @{ $self->{rearrange} }, _getHexArray($1);
       }
       next;
    }
    $self->parseEntry($line);
  }
  close $fk;

  # tailored entries are parsed after the table, so they override it
  if($self->{entry}){
    $self->parseEntry($_) foreach split /\n/, $self->{entry};
  }

  # keys of $self->{rearrangeHash} are $self->{rearrange}
  # (the values are all undef; only the keys carry information).
  $self->{rearrangeHash} = {};
  @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();

  return $self;
}

##
## get $line, parse it, and write an entry in $self
##
## $line is one line in the UCA table format:
##   "hhhh hhhh ; [.pppp.ssss.tttt.qqqq]... # name"
## Lines that do not start with a hex digit are skipped.
## Writes $self->{entries}{$element}, $self->{ignored}{$element} for
## ignored elements, and, for multi-character elements,
## $self->{maxlength}{$first_codepoint}.
##
sub parseEntry
{
  my $self = shift;
  my $line = shift;
  my($name, $ele, @key);

  return if $line !~ /^\s*[0-9A-Fa-f]/;

  # get name: the text after '#', or '' when the line has no comment
  $name = $line =~ s/#\s*(.*)// ? $1 : '';
  return if defined $self->{undefName} && $name =~ /$self->{undefName}/;

  # get element
  my($e, $k) = split /;/, $line;
  $k = '' if ! defined $k; # no ';' on the line: element without weights
  my @e = _getHexArray($e);
  $ele = pack('U*', @e);
  return if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;

  # get sort key
  if(
     defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/ ||
     defined $self->{ignoreChar} && $ele  =~ /$self->{ignoreChar}/
  )
  {
     $self->{ignored}{$ele} = 1;
     $self->{entries}{$ele} = 1; # true
  }
  else
  {
    # one collation element per "[...]" group; '*' marks it as variable
    foreach my $arr ($k =~ /\[(\S+)\]/g) {
      my $var = $arr =~ /\*/;
      push @key, $self->getCE( $var, _getHexArray($arr) );
    }
    $self->{entries}{$ele} = \@key;
  }

  # Remember the LONGEST element starting with this codepoint; a shorter
  # element parsed later must not hide a longer one parsed earlier.
  if(@e > 1){
    my $first = ord $ele;
    my $known = $self->{maxlength}{$first};
    $self->{maxlength}{$first} = scalar @e if ! $known || $known < @e;
  }
  return;
}


##
## list to collation element
##
## getCE($var, @weights) returns a reference to an array of weights,
## shaped according to $self->{alternate}.  $var is true when the
## element is variable.
##
sub getCE
{
  my($self, $var, @c) = @_;
  my $alt = $self->{alternate};

  if($alt eq 'blanked'){
    return $var ? [0,0,0] : [ @c[0..2] ];
  }
  if($alt eq 'non-ignorable'){
    return [ @c[0..2] ];
  }
  if($alt eq 'shifted'){
    # a variable element keeps only its primary weight, at level 4
    return [0,0,0,$c[0] ] if $var;
    my $fourth = $c[0]+$c[1]+$c[2] ? 0xFFFF : 0;
    return [ @c[0..2], $fourth ];
  }
  if($alt eq 'shift-trimmed'){
    return $var ? [0,0,0,$c[0] ] : [ @c[0..2], 0 ];
  }

  # any other value: the weights are used as they are
  return \@c;
}

##
## to debug
##
## Returns the sort key of the arguments as text: each 16-bit unit is
## printed as four hex digits, "0000" units are shown as '|', and the
## whole is wrapped in brackets.
##
sub viewSortKey
{
  my($self, @args) = @_;
  my $key   = $self->getSortKey(@args);
  my @units = unpack 'n*', $key;
  my $view  = join " ", map { sprintf "%04X", $_ } @units;
  $view =~ s/ ?0000 ?/|/g;
  return "[$view]";
}

##
## sort key
##
## getSortKey($string) returns the sort key of $string as a byte string.
## The string is preprocessed and normalized (when configured), mapped
## to collation elements, and the weights of each level are packed as
## 16-bit big-endian integers, with the levels joined by "\0\0".
##
sub getSortKey
{
  my $self = shift;
  my $code = $self->{preprocess};
  my $norm = $self->{normalize};
  my $ent  = $self->{entries};
  my $ign  = $self->{ignored};
  my $max  = $self->{maxlength};
  my $lev  = $self->{level};
  my $cjk  = $self->{overrideCJK};
  my $hang = $self->{overrideHangul};
  my $rear = $self->{rearrangeHash};

  my $str = ref $code ? $code->(shift) : shift;
  $str = $norm->($str) if ref $norm;

  my(@src, @buf);
  @src = unpack('U*', $str);

  # rearrangement: a rearranging character changes places with the
  # character that follows it.  Membership is tested with exists because
  # only the keys of the hash are meaningful; the last character has
  # nothing to swap with, and the character just moved is skipped.
  for(my $i = 0; $i+1 < @src; $i++)
  {
     next unless exists $rear->{ $src[$i] };
     @src[$i, $i+1] = @src[$i+1, $i];
     $i++;
  }

  for(my $i = 0; $i < @src; $i++)
  {
    my $ch;
    my $u  = $src[$i];

  # non-characters
    next if $u < 0 || 0x10FFFF < $u       # out of range
         || 0xD800 <= $u && $u <= 0xDFFF; # unpaired surrogates
    my $four = $u & 0xFFFF;
    next if $four == 0xFFFE || $four == 0xFFFF;

    if($max->{$u}) # contract
    {
      # try the longest candidate first; at length 1 $ch is $u itself
      for(my $j = $max->{$u}; $j >= 1; $j--)
      {
        next unless $i+$j-1 < @src;
        $ch = pack 'U*', @src[$i .. $i+$j-1];
        $i += $j-1, last if $ent->{$ch};
      }
    }
    else {  $ch = pack('U', $u) }

    if(%Combin && defined $ch) # with Combining Char
    {
      # absorb the first following combining character that forms a
      # known element with $ch, and remove it from the source
      for(my $j = $i+1; $j < @src && $Combin{ $src[$j] }; $j++)
      {
        my $comb = pack 'U', $src[$j];
        next if ! $ent->{ $ch.$comb };
        $ch .= $comb;
        splice(@src, $j, 1);
        last;
      }
    }

    next if !defined $ch || $ign->{$ch};   # ignored

    if($ent->{$ch}){                       # found in the table
      push @buf, @{ $ent->{$ch} };
    }
    elsif(_isHangul($u)){                  # override, or the jamo
      push @buf, $hang
        ? $hang->($u)
        : map(@{ $ent->{pack('U', $_)} }, decomposeHangul($u));
    }
    elsif(_isCJK($u)){                     # override, or by codepoint
      push @buf, $cjk
        ? $cjk->($u)
        : map($self->getCE(0,@$_), _CJK($u));
    }
    else{                                  # derived collation element
      push @buf, map($self->getCE(0,@$_), _derivCE($u));
    }
  }

  # make sort key: zero weights are dropped at every level
  my @ret = ([],[],[],[]);
  foreach my $v (0..$lev-1){
    foreach my $ce (@buf){
      push @{ $ret[$v] }, $ce->[$v] if $ce->[$v];
    }
  }
  foreach (@{ $self->{backwards} }){
    my $v = $_ - 1;
    @{ $ret[$v] } = reverse @{ $ret[$v] };
  }

  # modification of tertiary weights
  if($self->{upper_before_lower}){
    foreach (@{ $ret[2] }){
      if   (0x8 <= $_ && $_ <= 0xC){ $_ -= 6 } # lower
      elsif(0x2 <= $_ && $_ <= 0x6){ $_ += 6 } # upper
      elsif($_ == 0x1C)            { $_ += 1 } # square upper
      elsif($_ == 0x1D)            { $_ -= 1 } # square lower
    }
  }
  if($self->{katakana_before_hiragana}){
    foreach (@{ $ret[2] }){
      if   (0x0F <= $_ && $_ <= 0x13){ $_ -= 2 } # katakana
      elsif(0x0D <= $_ && $_ <= 0x0E){ $_ += 5 } # hiragana
    }
  }
  return join "\0\0", map pack('n*', @$_), @ret;
}


##
## cmp
##
## $obj->cmp($x, $y) compares the sort keys of the two strings with the
## string operator cmp and returns its result.
##
sub cmp
{
  my($obj, $left, $right) = @_;
  my $keyL = $obj->getSortKey($left);
  my $keyR = $obj->getSortKey($right);
  return $keyL cmp $keyR;
}

##
## sort
##
## $obj->sort(@strings) returns the strings ordered by their sort keys.
## Each key is computed once and paired with its string before sorting.
##
sub sort
{
  my($obj, @strings) = @_;

  my @keyed   = map { [ $obj->getSortKey($_), $_ ] } @strings;
  my @ordered = sort { $a->[0] cmp $b->[0] } @keyed;
  return map { $_->[1] } @ordered;
}

##
## Derived CE
##
## _derivCE($code) returns a list of two weight arrays for a codepoint
## that has no entry in the table: the first carries the upper bits of
## the codepoint added to UNDEFINED, the second its low 15 bits with
## bit 0x8000 set.  The codepoint itself is the fourth weight of both.
##
## NOTE(review): the literal weights 2 and 1 are used here although the
## file declares Min2 (0x20) and Min3 (0x02) -- confirm which is meant.
##
sub _derivCE
{
  my $code = shift;
  my $aaaa = UNDEFINED + ($code >> 15); # ok
  my $bbbb = ($code & 0x7FFF) | 0x8000; # ok
# my $aaaa = 0xFFC2 + ($code >> 15);    # ng
# my $bbbb = $code & 0x7FFF | 0x1000;   # ng

  # $bbbb always has bit 0x8000 set, so it is never false and both
  # elements are always returned.
  return ([$aaaa,2,1,$code], [$bbbb,0,0,$code]);
}

##
## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
##
## Every run of hex digits in the string is converted to a number;
## all other characters act as separators.
##
sub _getHexArray
{
  my($str) = @_;
  my @runs = $str =~ /([0-9a-fA-F]+)/g;
  return map { hex $_ } @runs;
}

##
##  CJK Unified Ideographs
##
## True when the codepoint lies in U+3400..U+4DB5 or U+4E00..U+9FA5.
##
sub _isCJK
{
  my($u) = @_;
  return 1 if 0x3400 <= $u && $u <= 0x4DB5;
  return 0x4E00 <= $u && $u <= 0x9FA5;
#      || 0x20000 <= $u && $u <= 0x2A6D6;
}

##
##  CJK Unified Ideographs
##
## Weights for an ideograph: above U+FFFF the derived element is used;
## otherwise the codepoint is its own primary weight, with 0x20 and
## 0x02 at levels 2 and 3 and the codepoint again at level 4.
##
sub _CJK
{
  my($u) = @_;
  return _derivCE($u) if $u > 0xFFFF;
  return [$u, 0x20, 0x02, $u];
}

##
## Hangul Syllables
##
## True when the codepoint lies in U+AC00..U+D7A3.
##
sub _isHangul
{
  my($code) = @_;
  return !( $code < 0xAC00 || 0xD7A3 < $code );
}

1;
__END__

=head1 NAME

Unicode::Collate - use UCA (Unicode Collation Algorithm)

=head1 SYNOPSIS

  use Unicode::Collate;

  #construct
  $UCA = Unicode::Collate->new(%tailoring);

  #sort
  @sorted = $UCA->sort(@not_sorted);

  #compare
  $result = $UCA->cmp($a, $b); # returns 1, 0, or -1. 

=head1 DESCRIPTION

=head2 Constructor and Tailoring

   $UCA = Unicode::Collate->new(
      alternate => $alternate,
      backwards => $levelNumber, # or \@levelNumbers
      entry => $element,
      normalization  => $normalization_form,
      ignoreName => qr/$ignoreName/,
      ignoreChar => qr/$ignoreChar/,
      katakana_before_hiragana => $bool,
      level => $collationLevel,
      overrideCJK => \&overrideCJK,
      overrideHangul => \&overrideHangul,
      preprocess => \&preprocess,
      rearrange => \@charList,
      table => $filename,
      undefName => qr/$undefName/,
      undefChar => qr/$undefChar/,
      upper_before_lower => $bool,
   );
   # if %tailoring is false (empty),
   # $UCA should do the default collation.

=over 4

=item alternate

-- see 3.2.2 Alternate Weighting, UTR #10.

   alternate => 'shifted', 'blanked', 'non-ignorable', or 'shift-trimmed'.

By default (if specification is omitted), 'shifted' is adopted.

=item backwards

-- see 3.1.2 French Accents, UTR #10.

     backwards => $levelNumber or \@levelNumbers

Weights in reverse order; ex. level 2 (diacritic ordering) in French.
If omitted, forwards at all the levels.

=item entry

-- see 3.1 Linguistic Features; 3.2.1 File Format, UTR #10.

Overrides a default order or adds a new element

  entry => <<'ENTRIES', # use the UCA file format
00E6 ; [.0861.0020.0002.00E6] [.08B1.0020.0002.00E6] # ligature <ae> as <a e>
0063 0068 ; [.0893.0020.0002.0063]      # "ch" in traditional Spanish
0043 0068 ; [.0893.0020.0008.0043]      # "Ch" in traditional Spanish
ENTRIES

=item ignoreName

=item ignoreChar

-- see Completely Ignorable, 3.2.2 Alternate Weighting, UTR #10.

Ignores the entry in the table.
If an ignored collation element appears in the string to be collated,
it is ignored as if the element had been deleted from there.

E.g. when 'a' and 'e' are ignored,
'element' is equal to 'lament' (or 'lmnt').

=item level

-- see 4.3 Form a sort key for each string, UTR #10.

Set the maximum level.
Any higher levels than the specified one are ignored.

  Level 1: alphabetic ordering
  Level 2: diacritic ordering
  Level 3: case ordering
  Level 4: tie-breaking (e.g. in the case when alternate is 'shifted')

  ex.level => 2,

=item normalization

-- see 4.1 Normalize each input string, UTR #10.

If specified, strings are normalized before the sort keys are prepared
(the normalization is executed after preprocessing).

As a form name, one of the following names must be used.

  'C'  or 'NFC'  for Normalization Form C
  'D'  or 'NFD'  for Normalization Form D
  'KC' or 'NFKC' for Normalization Form KC
  'KD' or 'NFKD' for Normalization Form KD

If omitted, the string is put into Normalization Form D.

If undefined explicitly (as C<normalization =E<gt> undef>),
no normalization is carried out (this may make tailoring easier
if normalization is not desired).

see B<CAVEAT>.

=item overrideCJK

=item overrideHangul

-- see 7.1 Derived Collation Elements, UTR #10.

By default, mapping of CJK Unified Ideographs
uses the Unicode codepoint order
and Hangul Syllables are decomposed into Hangul Jamo.

The mapping of CJK Unified Ideographs
or Hangul Syllables may be overridden.

ex. CJK Unified Ideographs in the JIS codepoint order.

  overrideCJK => sub {
    my $u = shift;               # get unicode codepoint
    my $b = pack('n', $u);       # to UTF-16BE
    my $s = your_unicode_to_sjis_converter($b); # convert
    my $n = unpack('n', $s);     # convert sjis to short
    [ $n, 1, 1 ];                # return collation element
  },

If you want to override the mapping of Hangul Syllables,
the Normalization Forms D and KD are not appropriate
(the syllables will be decomposed before the override is applied).

=item preprocess

-- see 5.1 Preprocessing, UTR #10.

If specified, the coderef is used to preprocess
before the formation of sort keys.

ex. dropping English articles, such as "a" or "the". 
Then, "the pen" is before "a pencil".

     preprocess => sub {
           my $str = shift;
           $str =~ s/\b(?:an?|the)\s+//g;
           $str;
        },

=item rearrange

-- see 3.1.3 Rearrangement, UTR #10.

Characters that are not coded in logical order and are to be rearranged.
By default, 

    rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],

=item table

-- see 3.2 Default Unicode Collation Element Table, UTR #10.

You can use another element table if desired.
The table file must be in your C<lib/Unicode/Collate> directory.

By default, the file C<lib/Unicode/Collate/allkeys.txt> is used.

=item undefName

=item undefChar

-- see 6.3.4 Reducing the Repertoire, UTR #10.

Undefines the collation element as if it were unassigned in the table.
This reduces the size of the table.
If an unassigned character appears in the string to be collated,
the sort key is made from its codepoint
as a single-character collation element,
as it is greater than any other assigned collation elements
(in the codepoint order among the unassigned characters).
But, it'd be better to ignore characters
unfamiliar to you and maybe never used.

=item katakana_before_hiragana

=item upper_before_lower

-- see 6.6 Case Comparisons; 7.3.1 Tertiary Weight Table, UTR #10.

By default, lowercase is before uppercase
and hiragana is before katakana.

If the parameter is true, this is reversed.

=back

=head2 Other methods

=over 4

=item C<@sorted = $UCA-E<gt>sort(@not_sorted)>

Sorts a list of strings.

=item C<$result = $UCA-E<gt>cmp($a, $b)>

Returns 1 (when C<$a> is greater than C<$b>)
or 0 (when C<$a> is equal to C<$b>)
or -1 (when C<$a> is less than C<$b>).

=item C<$sortKey = $UCA-E<gt>getSortKey($string)>

-- see 4.3 Form a sort key for each string, UTR #10.

Returns a sort key.

You compare the sort keys using a binary comparison
and get the result of the comparison of the strings using UCA.

   $UCA->getSortKey($a) cmp $UCA->getSortKey($b)

      is equivalent to

   $UCA->cmp($a, $b)

=back

=head2 EXPORT

None by default.

=head2 CAVEAT

Use of the C<normalization> parameter requires
the B<Unicode::Normalize> module.

If you do not need it (e.g. when you do not need to
handle any combining characters),
assign C<normalization =E<gt> undef> explicitly.

=head1 AUTHOR

SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>

  http://homepage1.nifty.com/nomenclator/perl/

  Copyright(C) 2001, SADAHIRO Tomoyuki. Japan. All rights reserved.

  This program is free software; you can redistribute it and/or 
  modify it under the same terms as Perl itself.

=head1 SEE ALSO

=over 4

=item L<Lingua::KO::Hangul::Util>

utility functions for Hangul Syllables

=item L<Unicode::Normalize>

normalized forms of Unicode text

=item Unicode Collation Algorithm - Unicode TR #10

http://www.unicode.org/unicode/reports/tr10/

=back

=cut
Commit	Line	Data
45394607	1	package Unicode::Collate;
	2
	3	use 5.006;
	4	use strict;
	5	use warnings;
	6	use Carp;
	7	use Lingua::KO::Hangul::Util;
	8	require Exporter;
	9
	10	our $VERSION = '0.07';
	11	our $PACKAGE = __PACKAGE__;
	12
	13	our @ISA = qw(Exporter);
	14
	15	our %EXPORT_TAGS = ();
	16	our @EXPORT_OK = ();
	17	our @EXPORT = ();
	18
	19	(our $Path = $INC{'Unicode/Collate.pm'}) =~ s/\.pm$//;
	20	our $KeyFile = "allkeys.txt";
	21
	22	our %Combin; # combining class from Unicode::Normalize
	23
	24	use constant Min2 => 0x20; # minimum weight at level 2
	25	use constant Min3 => 0x02; # minimum weight at level 3
	26	use constant UNDEFINED => 0xFF80; # special value for undefined CE
	27
	28	##
	29	## constructor
	30	##
	31	sub new
	32	{
	33	my $class = shift;
	34	my $self = bless { @_ }, $class;
	35
	36	# alternate
	37	$self->{alternate} =
	38	! exists $self->{alternate} ? 'shifted' :
	39	! defined $self->{alternate} ? '' : $self->{alternate};
	40
	41	# collation level
	42	$self->{level} \|\|= $self->{alternate} =~ /shift/ ? 4 : 3;
	43
	44	# normalization form
	45	$self->{normalization} = 'D' if ! exists $self->{normalization};
	46
	47	eval "use Unicode::Normalize;" if defined $self->{normalization};
	48
	49	$self->{normalize} =
	50	! defined $self->{normalization} ? undef :
	51	$self->{normalization} =~ /^(?:NF)?C$/ ? \&NFC :
	52	$self->{normalization} =~ /^(?:NF)?D$/ ? \&NFD :
	53	$self->{normalization} =~ /^(?:NF)?KC$/ ? \&NFKC :
	54	$self->{normalization} =~ /^(?:NF)?KD$/ ? \&NFKD :
	55	croak "$PACKAGE unknown normalization form name: $self->{normalization}";
	56
	57	*Combin = \%Unicode::Normalize::Combin if $self->{normalize} && ! %Combin;
	58
	59	# backwards
	60	$self->{backwards} \|\|= [];
	61	$self->{backwards} = [ $self->{backwards} ] if ! ref $self->{backwards};
	62
	63	# rearrange
	64	$self->{rearrange} \|\|= []; # maybe not U+0000 (an ASCII)
65	$self->{rearrange} = [ $self->{rearrange} ] if ! ref $self->{rearrange};
66
67	# open the table file
68	my $file = defined $self->{table} ? $self->{table} : $KeyFile;
69	open my $fk, "<$Path/$file" or croak "File does not exist at $Path/$file";
70
71	while(<$fk>){
72	next if /^\s*#/;
73	if(/^\s*\@/){
74	if(/^\@version\s(\S)/){
75	$self->{version} \|\|= $1;
76	}
77	elsif(/^\@alternate\s+(.*)/){
78	$self->{alternate} \|\|= $1;
79	}
80	elsif(/^\@backwards\s+(.*)/){
81	push @{ $self->{backwards} }, $1;
82	}
83	elsif(/^\@rearrange\s+(.*)/){
84	push @{ $self->{rearrange} }, _getHexArray($1);
85	}
86	next;
87	}
88	$self->parseEntry($_);
89	}
90	close $fk;
91	if($self->{entry}){
92	$self->parseEntry($_) foreach split /\n/, $self->{entry};
93	}
94
95	# keys of $self->{rearrangeHash} are $self->{rearrange}.
96	$self->{rearrangeHash} = {};
97	@{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
98
99	return $self;
100	}
101
102	##
103	## get $line, parse it, and write an entry in $self
104	##
105	sub parseEntry
106	{
107	my $self = shift;
108	my $line = shift;
109	my($name, $ele, @key);
110
111	return if $line !~ /^\s*[0-9A-Fa-f]/;
112
113	# get name
114	$name = $1 if $line =~ s/#\s(.)//;
115	return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
116
117	# get element
118	my($e, $k) = split /;/, $line;
119	my @e = _getHexArray($e);
120	$ele = pack('U*', @e);
121	return if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
122
123	# get sort key
124	if(
125	defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/ \|\|
126	defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/
127	)
128	{
129	$self->{ignored}{$ele} = 1;
130	$self->{entries}{$ele} = 1; # true
131	}
132	else
133	{
134	foreach my $arr ($k =~ /\[(\S+)\]/g) {
135	my $var = $arr =~ /\*/;
136	push @key, $self->getCE( $var, _getHexArray($arr) );
137	}
138	$self->{entries}{$ele} = \@key;
139	}
140	$self->{maxlength}{ord $ele} = scalar @e if @e > 1;
141	}
142
143
144	##
145	## list to collation element
146	##
147	sub getCE
148	{
149	my $self = shift;
150	my $var = shift;
151	my @c = @_;
152
153	$self->{alternate} eq 'blanked' ?
154	$var ? [0,0,0] : [ @c[0..2] ] :
155	$self->{alternate} eq 'non-ignorable' ? [ @c[0..2] ] :
156	$self->{alternate} eq 'shifted' ?
157	$var ? [0,0,0,$c[0] ] : [ @c[0..2], $c[0]+$c[1]+$c[2] ? 0xFFFF : 0 ] :
158	$self->{alternate} eq 'shift-trimmed' ?
159	$var ? [0,0,0,$c[0] ] : [ @c[0..2], 0 ] :
160	\@c;
161	}
162
163	##
164	## to debug
165	##
166	sub viewSortKey
167	{
168	my $self = shift;
169	my $key = $self->getSortKey(@_);
170	my $view = join " ", map sprintf("%04X", $_), unpack 'n*', $key;
171	$view =~ s/ ?0000 ?/\|/g;
172	"[$view]";
173	}
174
175	##
176	## sort key
177	##
178	sub getSortKey
179	{
180	my $self = shift;
181	my $code = $self->{preprocess};
182	my $norm = $self->{normalize};
183	my $ent = $self->{entries};
184	my $ign = $self->{ignored};
185	my $max = $self->{maxlength};
186	my $lev = $self->{level};
187	my $cjk = $self->{overrideCJK};
188	my $hang = $self->{overrideHangul};
189	my $rear = $self->{rearrangeHash};
190
191	my $str = ref $code ? &$code(shift) : shift;
192	$str = &$norm($str) if ref $norm;
193
194	my(@src, @buf);
195	@src = unpack('U*', $str);
196
197	# rearrangement
198	for(my $i = 0; $i < @src; $i++)
199	{
200	($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i])
201	if $rear->{ $src[$i] };
202	$i++;
203	}
204
205	for(my $i = 0; $i < @src; $i++)
206	{
207	my $ch;
208	my $u = $src[$i];
209
210	# non-characters
211	next if $u < 0 \|\| 0x10FFFF < $u # out of range
212	\|\| 0xD800 < $u && $u < 0xDFFF; # unpaired surrogates
213	my $four = $u & 0xFFFF;
214	next if $four == 0xFFFE \|\| $four == 0xFFFF;
215
216	if($max->{$u}) # contract
217	{
218	for(my $j = $max->{$u}; $j >= 1; $j--)
219	{
220	next unless $i+$j-1 < @src;
221	$ch = pack 'U*', @src[$i .. $i+$j-1];
222	$i += $j-1, last if $ent->{$ch};
223	}
224	}
225	else { $ch = pack('U', $u) }
226
227	if(%Combin && defined $ch) # with Combining Char
228	{
229	for(my $j = $i+1; $j < @src && $Combin{ $src[$j] }; $j++)
230	{
231	my $comb = pack 'U', $src[$j];
232	next if ! $ent->{ $ch.$comb };
233	$ch .= $comb;
234	splice(@src, $j, 1);
235	last;
236	}
237	}
238
239	next if !defined $ch \|\| $ign->{$ch}; # ignored
240
241	push @buf,
242	$ent->{$ch}
243	? @{ $ent->{$ch} }
244	: _isHangul($u)
245	? $hang
246	? &$hang($u)
247	: map(@{ $ent->{pack('U', $_)} }, decomposeHangul($u))
248	: _isCJK($u)
249	? $cjk ? &$cjk($u) : map($self->getCE(0,@$_), _CJK($u))
250	: map($self->getCE(0,@$_), _derivCE($u));
251	}
252
253	# make sort key
254	my @ret = ([],[],[],[]);
255	foreach my $v (0..$lev-1){
256	foreach my $b (@buf){
257	push @{ $ret[$v] }, $b->[$v] if $b->[$v];
258	}
259	}
260	foreach (@{ $self->{backwards} }){
261	my $v = $_ - 1;
262	@{ $ret[$v] } = reverse @{ $ret[$v] };
263	}
264
265	# modification of tertiary weights
266	if($self->{upper_before_lower}){
267	foreach (@{ $ret[2] }){
268	if (0x8 <= $_ && $_ <= 0xC){ $_ -= 6 } # lower
269	elsif(0x2 <= $_ && $_ <= 0x6){ $_ += 6 } # upper
270	elsif($_ == 0x1C) { $_ += 1 } # square upper
271	elsif($_ == 0x1D) { $_ -= 1 } # square lower
272	}
273	}
274	if($self->{katakana_before_hiragana}){
275	foreach (@{ $ret[2] }){
276	if (0x0F <= $_ && $_ <= 0x13){ $_ -= 2 } # katakana
277	elsif(0x0D <= $_ && $_ <= 0x0E){ $_ += 5 } # hiragana
278	}
279	}
280	join "\0\0", map pack('n*', @$_), @ret;
281	}
282
283
284	##
285	## cmp
286	##
287	sub cmp
288	{
289	my $obj = shift;
290	my $a = shift;
291	my $b = shift;
292	$obj->getSortKey($a) cmp $obj->getSortKey($b);
293	}
294
295	##
296	## sort
297	##
298	sub sort
299	{
300	my $obj = shift;
301
302	map { $_->[1] }
303	sort{ $a->[0] cmp $b->[0] }
304	map [ $obj->getSortKey($_), $_ ], @_;
305	}
306
307	##
308	## Derived CE
309	##
310	sub _derivCE
311	{
312	my $code = shift;
313	my $a = UNDEFINED + ($code >> 15); # ok
314	my $b = ($code & 0x7FFF) \| 0x8000; # ok
315	# my $a = 0xFFC2 + ($code >> 15); # ng
316	# my $b = $code & 0x7FFF \| 0x1000; # ng
317	$b ? ([$a,2,1,$code],[$b,0,0,$code]) : [$a,2,1,$code];
318	}
319
320	##
321	## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
322	##
323	sub _getHexArray
324	{
325	my $str = shift;
326	map hex(), $str =~ /([0-9a-fA-F]+)/g;
327	}
328
329	##
330	## CJK Unified Ideographs
331	##
332	sub _isCJK
333	{
334	my $u = shift;
335	return 0x3400 <= $u && $u <= 0x4DB5
336	\|\| 0x4E00 <= $u && $u <= 0x9FA5
337	# \|\| 0x20000 <= $u && $u <= 0x2A6D6;
338	}
339
340	##
341	## CJK Unified Ideographs
342	##
343	sub _CJK
344	{
345	my $u = shift;
346	$u > 0xFFFF ? _derivCE($u) : [$u,0x20,0x02,$u];
347	}
348
349	##
350	## Hangul Syllables
351	##
352	sub _isHangul
353	{
354	my $code = shift;
355	return 0xAC00 <= $code && $code <= 0xD7A3;
356	}
357
358	1;
359	__END__
360
361	=head1 NAME
362
363	Unicode::Collate - use UCA (Unicode Collation Algorithm)
364
365	=head1 SYNOPSIS
366
367	use Unicode::Collate;
368
369	#construct
370	$UCA = Unicode::Collate->new(%tailoring);
371
372	#sort
373	@sorted = $UCA->sort(@not_sorted);
374
375	#compare
376	$result = $UCA->cmp($a, $b); # returns 1, 0, or -1.
377
378	=head1 DESCRIPTION
379
380	=head2 Constructor and Tailoring
381
382	$UCA = Unicode::Collate->new(
383	alternate => $alternate,
384	backwards => $levelNumber, # or \@levelNumbers
385	entry => $element,
386	normalization => $normalization_form,
387	ignoreName => qr/$ignoreName/,
388	ignoreChar => qr/$ignoreChar/,
389	katakana_before_hiragana => $bool,
390	level => $collationLevel,
391	overrideCJK => \&overrideCJK,
392	overrideHangul => \&overrideHangul,
393	preprocess => \&preprocess,
394	rearrange => \@charList,
395	table => $filename,
396	undefName => qr/$undefName/,
397	undefChar => qr/$undefChar/,
398	upper_before_lower => $bool,
399	);
400	# if %tailoring is false (empty),
401	# $UCA should do the default collation.
402
403	=over 4
404
405	=item alternate
406
407	-- see 3.2.2 Alternate Weighting, UTR #10.
408
409	alternate => 'shifted', 'blanked', 'non-ignorable', or 'shift-trimmed'.
410
411	By default (if specification is omitted), 'shifted' is adopted.
412
413	=item backwards
414
415	-- see 3.1.2 French Accents, UTR #10.
416
417	backwards => $levelNumber or \@levelNumbers
418
419	Weights in reverse order; ex. level 2 (diacritic ordering) in French.
420	If omitted, forwards at all the levels.
421
422	=item entry
423
424	-- see 3.1 Linguistic Features; 3.2.1 File Format, UTR #10.
425
426	Overrides a default order or adds a new element
427
428	entry => <<'ENTRIES', # use the UCA file format
429	00E6 ; [.0861.0020.0002.00E6] [.08B1.0020.0002.00E6] # ligature <ae> as <a e>
430	0063 0068 ; [.0893.0020.0002.0063] # "ch" in traditional Spanish
431	0043 0068 ; [.0893.0020.0008.0043] # "Ch" in traditional Spanish
432	ENTRIES
433
434	=item ignoreName
435
436	=item ignoreChar
437
438	-- see Completely Ignorable, 3.2.2 Alternate Weighting, UTR #10.
439
440	Ignores the entry in the table.
441	If an ignored collation element appears in the string to be collated,
442	it is ignored as if the element had been deleted from there.
443
444	E.g. when 'a' and 'e' are ignored,
445	'element' is equal to 'lament' (or 'lmnt').
446
447	=item level
448
449	-- see 4.3 Form a sort key for each string, UTR #10.
450
451	Set the maximum level.
452	Any higher levels than the specified one are ignored.
453
454	Level 1: alphabetic ordering
455	Level 2: diacritic ordering
456	Level 3: case ordering
457	Level 4: tie-breaking (e.g. in the case when alternate is 'shifted')
458
459	ex.level => 2,
460
461	=item normalization
462
463	-- see 4.1 Normalize each input string, UTR #10.
464
465	If specified, strings are normalized before preparation sort keys
466	(the normalization is executed after preprocess).
467
468	As a form name, one of the following names must be used.
469
470	'C' or 'NFC' for Normalization Form C
471	'D' or 'NFD' for Normalization Form D
472	'KC' or 'NFKC' for Normalization Form KC
473	'KD' or 'NFKD' for Normalization Form KD
474
475	If omitted, the string is put into Normalization Form D.
476
477	If undefined explicitly (as C<normalization =E<gt> undef>),
478	any normalization is not carried out (this may make tailoring easier
479	if any normalization is not desired).
480
481	see B<CAVEAT>.
482
483	=item overrideCJK
484
485	=item overrideHangul
486
487	-- see 7.1 Derived Collation Elements, UTR #10.
488
489	By default, mapping of CJK Unified Ideographs
490	uses the Unicode codepoint order
491	and Hangul Syllables are decomposed into Hangul Jamo.
492
493	The mapping of CJK Unified Ideographs
494	or Hangul Syllables may be overrided.
495
496	ex. CJK Unified Ideographs in the JIS codepoint order.
497
498	overrideCJK => sub {
499	my $u = shift; # get unicode codepoint
500	my $b = pack('n', $u); # to UTF-16BE
501	my $s = your_unicode_to_sjis_converter($b); # convert
502	my $n = unpack('n', $s); # convert sjis to short
503	[ $n, 1, 1 ]; # return collation element
504	},
505
506	If you want to override the mapping of Hangul Syllables,
507	the Normalization Forms D and KD are not appropriate
508	(they will be decomposed before overriding).
509
510	=item preprocess
511
512	-- see 5.1 Preprocessing, UTR #10.
513
514	If specified, the coderef is used to preprocess
515	before the formation of sort keys.
516
517	ex. dropping English articles, such as "a" or "the".
518	Then, "the pen" is before "a pencil".
519
520	preprocess => sub {
521	my $str = shift;
522	$str =~ s/\b(?:an?\|the)\s+//g;
523	$str;
524	},
525
526	=item rearrange
527
528	-- see 3.1.3 Rearrangement, UTR #10.
529
530	Characters that are not coded in logical order and to be rearranged.
531	By default,
532
533	rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
534
535	=item table
536
537	-- see 3.2 Default Unicode Collation Element Table, UTR #10.
538
539	You can use another element table if desired.
540	The table file must be in your C<lib/Unicode/Collate> directory.
541
542	By default, the file C<lib/Unicode/Collate/allkeys.txt> is used.
543
544	=item undefName
545
546	=item undefChar
547
548	-- see 6.3.4 Reducing the Repertoire, UTR #10.
549
550	Undefines the collation element as if it were unassigned in the table.
551	This reduces the size of the table.
552	If an unassigned character appears in the string to be collated,
553	the sort key is made from its codepoint
554	as a single-character collation element,
555	as it is greater than any other assigned collation elements
556	(in the codepoint order among the unassigned characters).
557	But, it'd be better to ignore characters
558	unfamiliar to you and maybe never used.
559
560	=item katakana_before_hiragana
561
562	=item upper_before_lower
563
564	-- see 6.6 Case Comparisons; 7.3.1 Tertiary Weight Table, UTR #10.
565
566	By default, lowercase is before uppercase
567	and hiragana is before katakana.
568
569	If the parameter is true, this is reversed.
570
571	=back
572
573	=head2 Other methods
574
575	=over 4
576
577	=item C<@sorted = $UCA-E<gt>sort(@not_sorted)>
578
579	Sorts a list of strings.
580
581	=item C<$result = $UCA-E<gt>cmp($a, $b)>
582
583	Returns 1 (when C<$a> is greater than C<$b>)
584	or 0 (when C<$a> is equal to C<$b>)
585	or -1 (when C<$a> is lesser than C<$b>).
586
587	=item C<$sortKey = $UCA-E<gt>getSortKey($string)>
588
589	-- see 4.3 Form a sort key for each string, UTR #10.
590
591	Returns a sort key.
592
593	You compare the sort keys using a binary comparison
594	and get the result of the comparison of the strings using UCA.
595
596	$UCA->getSortKey($a) cmp $UCA->getSortKey($b)
597
598	is equivalent to
599
600	$UCA->cmp($a, $b)
601
602	=back
603
604	=head2 EXPORT
605
606	None by default.
607
608	=head2 CAVEAT
609
610	Use of the C<normalization> parameter requires
611	the B<Unicode::Normalize> module.
612
613	If you need not it (e.g. in the case when you need not
614	handle any combining characters),
615	assign C<normalization =E<gt> undef> explicitly.
616
617	=head1 AUTHOR
618
619	SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
620
621	http://homepage1.nifty.com/nomenclator/perl/
622
623	Copyright(C) 2001, SADAHIRO Tomoyuki. Japan. All rights reserved.
624
625	This program is free software; you can redistribute it and/or
626	modify it under the same terms as Perl itself.
627
628	=head1 SEE ALSO
629
630	=over 4
631
632	=item L<Lingua::KO::Hangul::Util>
633
634	utility functions for Hangul Syllables
635
636	=item L<Unicode::Normalize>
637
638	normalized forms of Unicode text
639
640	=item Unicode Collation Algorithm - Unicode TR #10
641
642	http://www.unicode.org/unicode/reports/tr10/
643
644	=back
645
646	=cut