[p5sagit/p5-mst-13.2.git] / lib / Unicode / Collate.pm

package Unicode::Collate;

use 5.006;
use strict;
use warnings;
use Carp;
require Exporter;

# Module version, and the package name used as a prefix in error messages.
our $VERSION = '0.08';
our $PACKAGE = __PACKAGE__;

our @ISA = qw(Exporter);

# Nothing is exported: all three export lists are empty.
our %EXPORT_TAGS = ();
our @EXPORT_OK = ();
our @EXPORT = ();

# Directory of the collation tables: the path this module was loaded from
# (as recorded in %INC) with the trailing ".pm" removed.
(our $Path = $INC{'Unicode/Collate.pm'}) =~ s/\.pm$//;
# Table file that new() reads when no "table" parameter is given.
our $KeyFile = "allkeys.txt";

# Lingua::KO::Hangul::Util not part of the standard distribution
# but it will be used if available.

# Try to load the optional module once, at load time; $hasHangulUtil records
# whether that worked and is consulted by getWt() for Hangul syllables.
eval { require Lingua::KO::Hangul::Util };
my $hasHangulUtil = ! $@;
if ($hasHangulUtil) {
    Lingua::KO::Hangul::Util->import();
}

# Aliased by new() to %Unicode::Normalize::Combin once normalization is in use.
our %Combin; # combining class from Unicode::Normalize

use constant Min2      => 0x20;   # minimum weight at level 2
use constant Min3      => 0x02;   # minimum weight at level 3
use constant UNDEFINED => 0xFF80; # special value for undefined CE

##
## constructor
##
## collator = Unicode::Collate->new(%tailoring)
##
## Blesses the tailoring parameters into an object, fills in the defaults
## (alternate, level, normalization, backwards, rearrange), reads the
## collation element table "$Path/<table>" and finally applies the extra
## entries given through the "entry" parameter.
##
## Croaks when the normalization form name is unknown, when normalization
## is requested but Unicode::Normalize cannot be loaded, or when the table
## file cannot be opened.
##
sub new
{
  my $class = shift;
  my $self = bless { @_ }, $class;

  # alternate: omitted => 'shifted'; explicitly undef => '' (an @alternate
  # line of the table may then supply the value, see below)
  $self->{alternate} =
     ! exists  $self->{alternate} ? 'shifted' :
     ! defined $self->{alternate} ? '' : $self->{alternate};

  # collation level
  $self->{level} ||= ($self->{alternate} =~ /shift/ ? 4 : 3);

  # normalization form
  $self->{normalization} = 'D' if ! exists $self->{normalization};

  if (defined $self->{normalization}) {
    # Unicode::Normalize stays optional, so it is loaded at run time; but a
    # failure is reported here rather than as an "Undefined subroutine"
    # error at the first call of a collation method.
    eval { require Unicode::Normalize; Unicode::Normalize->import(); 1 }
      or croak "$PACKAGE requires Unicode::Normalize for normalization: $@";
  }

  $self->{normalize} =
    ! defined $self->{normalization}        ? undef :
    $self->{normalization} =~ /^(?:NF)?C$/  ? \&NFC :
    $self->{normalization} =~ /^(?:NF)?D$/  ? \&NFD :
    $self->{normalization} =~ /^(?:NF)?KC$/ ? \&NFKC :
    $self->{normalization} =~ /^(?:NF)?KD$/ ? \&NFKD :
    croak "$PACKAGE unknown normalization form name: $self->{normalization}";

  *Combin = \%Unicode::Normalize::Combin if $self->{normalize} && ! %Combin;

  # backwards: a single level number is accepted as well as an arrayref
  $self->{backwards} ||= [];
  $self->{backwards} = [ $self->{backwards} ] if ! ref $self->{backwards};

  # rearrange
  $self->{rearrange} ||= []; # maybe not U+0000 (an ASCII)
  $self->{rearrange} = [ $self->{rearrange} ] if ! ref $self->{rearrange};

  # open the table file (three-argument open: the name is never parsed
  # for a mode or a pipe)
  my $file = defined $self->{table} ? $self->{table} : $KeyFile;
  open my $fk, '<', "$Path/$file"
    or croak "$PACKAGE: cannot open table file $Path/$file: $!";

  # a lexical line variable keeps the caller's $_ intact
  while (my $line = <$fk>) {
    next if $line =~ /^\s*#/;
    if ($line =~ /^\s*\@/) {
       # directives only fill in what the caller did not specify,
       # except for backwards/rearrange, which accumulate
       if ($line =~ /^\@version\s*(\S*)/) {
         $self->{version} ||= $1;
       }
       elsif ($line =~ /^\@alternate\s+(.*)/) {
         $self->{alternate} ||= $1;
       }
       elsif ($line =~ /^\@backwards\s+(.*)/) {
         push @{ $self->{backwards} }, $1;
       }
       elsif ($line =~ /^\@rearrange\s+(.*)/) {
         push @{ $self->{rearrange} }, _getHexArray($1);
       }
       next;
    }
    $self->parseEntry($line);
  }
  close $fk;

  # entries supplied by the caller are parsed last and so override the table
  if($self->{entry}){
    $self->parseEntry($_) foreach split /\n/, $self->{entry};
  }

  # keys of $self->{rearrangeHash} are $self->{rearrange}.
  $self->{rearrangeHash} = {};
  @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();

  return $self;
}

##
## get $line, parse it, and write an entry in $self
##
## $line is one line of a collation element table:
##   "hhhh [hhhh ...] ; [.pppp.ssss.tttt.qqqq]... # name"
## Lines that do not start with a hexadecimal digit are skipped.
## The entry is dropped when its name or its characters match
## undefName / undefChar, and recorded as ignored when they match
## ignoreName / ignoreChar.  Otherwise its collation elements are stored
## in $self->{entries}.  For a contraction (more than one character) the
## length is recorded in $self->{maxlength}, keyed by the first codepoint.
##
sub parseEntry
{
  my $self = shift;
  my $line = shift;
  my @key;

  return if $line !~ /^\s*[0-9A-Fa-f]/;

  # get name: the trailing comment, or '' when the line has none, so that
  # the name patterns below never see an undefined value
  my $name = $line =~ s/#\s*(.*)// ? $1 : '';
  return if defined $self->{undefName} && $name =~ /$self->{undefName}/;

  # get element
  my($e, $k) = split /;/, $line;
  my @e = _getHexArray($e);
  my $ele = pack('U*', @e);
  return if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;

  # get sort key
  if(
     defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/ ||
     defined $self->{ignoreChar} && $ele  =~ /$self->{ignoreChar}/
  )
  {
     $self->{entries}{$ele} = $self->{ignored}{$ele} = 1;
  }
  else
  {
    $k = '' if ! defined $k; # no ';' on the line: no collation elements

    # one bracketed group per collation element; the character class keeps
    # adjacent groups such as "[...][...]" apart
    foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) {
      my $var = $arr =~ /\*/; # '*' marks a variable collation element
      push @key, $self->altCE( $var, _getHexArray($arr) );
    }
    $self->{entries}{$ele} = \@key;
  }

  # remember the LONGEST contraction starting with this codepoint: a shorter
  # contraction parsed later must not shrink the value
  if (@e > 1) {
    my $first = ord $ele;
    my $known = $self->{maxlength}{$first};
    $self->{maxlength}{$first} = scalar @e if ! $known || $known < @e;
  }
}


##
## arrayref CE = altCE(bool variable?, list[num] weights)
##
## Builds one collation element from the weights of a table entry,
## depending on the "alternate" setting of the collator:
##   blanked        variable: [0,0,0]          otherwise: first 3 weights
##   non-ignorable  always the first 3 weights
##   shifted        variable: [0,0,0,primary]  otherwise: first 3 weights
##                  plus 0xFFFF (or 0 when those three weights sum to 0)
##   shift-trimmed  variable: [0,0,0,primary]  otherwise: first 3 weights
##                  plus 0
##   anything else  all the weights, as given
##
sub altCE
{
  my ($self, $var, @wt) = @_;

  if ($self->{alternate} eq 'blanked') {
    return $var ? [0,0,0] : [ @wt[0..2] ];
  }
  elsif ($self->{alternate} eq 'non-ignorable') {
    return [ @wt[0..2] ];
  }
  elsif ($self->{alternate} eq 'shifted') {
    return [0,0,0,$wt[0] ] if $var;
    my $fourth = $wt[0]+$wt[1]+$wt[2] ? 0xFFFF : 0;
    return [ @wt[0..2], $fourth ];
  }
  elsif ($self->{alternate} eq 'shift-trimmed') {
    return $var ? [0,0,0,$wt[0] ] : [ @wt[0..2], 0 ];
  }

  return \@wt;
}

##
## string hex_sortkey = viewSortKey(string arg)
##
## Returns the sort key of the argument in a readable form: every 16-bit
## unit as four uppercase hex digits, units separated by a space, each
## 0000 unit (with the spaces around it) shown as "|", the whole enclosed
## in square brackets.
##
sub viewSortKey
{
  my $self  = shift;
  my $key   = $self->getSortKey(@_);
  my @units = unpack 'n*', $key;
  my $view  = join " ", map { sprintf "%04X", $_ } @units;
  $view =~ s/ ?0000 ?/|/g;
  return "[$view]";
}


##
## list[strings] elements = splitCE(string arg)
##
## Splits a string into collation elements, each a string of one or more
## characters.  Steps, in this order:
##   1. apply the "preprocess" coderef, if one was given;
##   2. apply the normalization function, if any;
##   3. swap every character listed in "rearrange" with its successor;
##   4. drop out-of-range codepoints, surrogates and the non-characters
##      U+xxFFFE / U+xxFFFF;
##   5. take the longest contraction that has a table entry, then attach
##      the first following combining character for which the table has
##      an entry of the combination.
## Returns the list of elements in list context, an arrayref otherwise.
##
sub splitCE
{
  my $self = shift;
  my $code = $self->{preprocess};
  my $norm = $self->{normalize};
  my $ent  = $self->{entries};
  my $max  = $self->{maxlength};
  my $rear = $self->{rearrangeHash};

  my $str = ref $code ? $code->(shift) : shift;
  $str = $norm->($str) if ref $norm;

  my(@src, @buf);
  @src = unpack('U*', $str);

  # rearrangement
  # The hash is used as a set (its values are undef), hence "exists".
  # The loop stops one short of the end: a final character has no
  # successor to be swapped with.
  for(my $i = 0; $i + 1 < @src; $i++)
  {
     next unless exists $rear->{ $src[$i] };
     @src[$i, $i+1] = @src[$i+1, $i];
     $i++; # the character just moved forward is not examined again
  }

  for(my $i = 0; $i < @src; $i++)
  {
    my $ch;
    my $u  = $src[$i];

  # non-characters
    next if $u < 0 || 0x10FFFF < $u       # out of range
         || 0xD800 <= $u && $u <= 0xDFFF; # surrogates (both ends inclusive)
    my $four = $u & 0xFFFF;
    next if $four == 0xFFFE || $four == 0xFFFF;

    if($max->{$u}) # contract
    {
      # try the longest candidate first; at $j == 1 the single character
      # itself is left in $ch when no contraction matched
      for(my $j = $max->{$u}; $j >= 1; $j--)
      {
        next unless $i+$j-1 < @src;
        $ch = pack 'U*', @src[$i .. $i+$j-1];
        $i += $j-1, last if $ent->{$ch};
      }
    }
    else {  $ch = pack('U', $u) }

    if(%Combin && defined $ch) # with Combining Char
    {
      for(my $j = $i+1; $j < @src && $Combin{ $src[$j] }; $j++)
      {
        my $comb = pack 'U', $src[$j];
        next if ! $ent->{ $ch.$comb };
        $ch .= $comb;
        splice(@src, $j, 1); # consumed: must not be seen again
        last;
      }
    }
    push @buf, $ch;
  }
  return wantarray ? @buf : \@buf;
}


##
## list[arrayrefs] weight = getWt(string element)
##
## Returns the collation elements (arrayrefs of weights) of one element
## produced by splitCE:
##   - nothing for an undefined or an ignored element;
##   - the table entry when there is one;
##   - for a Hangul syllable: the result of the overrideHangul coderef if
##     given, otherwise the table entries of its decomposed jamo (this
##     needs Lingua::KO::Hangul::Util);
##   - for a CJK unified ideograph: the result of the overrideCJK coderef
##     if given, otherwise weights derived from the codepoint;
##   - for anything else: derived weights of an undefined character.
## Croaks when a Hangul syllable has to be decomposed but
## Lingua::KO::Hangul::Util is not available.
##
sub getWt
{
  my $self = shift;
  my $ch   = shift;
  my $ent  = $self->{entries};
  my $ign  = $self->{ignored};
  my $cjk  = $self->{overrideCJK};
  my $hang = $self->{overrideHangul};

  return if !defined $ch || $ign->{$ch};   # ignored
  return @{ $ent->{$ch} } if $ent->{$ch};

  my $u = unpack('U', $ch);

  if (_isHangul($u)) {
    return $hang->($u) if $hang;

    # Without the helper module there is no way to decompose the syllable;
    # say so instead of returning a value that is not a collation element.
    croak __PACKAGE__ . ": Lingua::KO::Hangul::Util is required to collate "
        . "Hangul syllables (or supply overrideHangul)"
      if ! $hasHangulUtil;

    return map(@{ $ent->{pack('U', $_)} }, decomposeHangul($u));
  }

  if (_isCJK($u)) {
    return $cjk ? $cjk->($u) : map($self->altCE(0,@$_), _CJK($u));
  }

  return map($self->altCE(0,@$_), _derivCE($u));
}

##
## int = index(string, substring)
##
## Searches string for a run of collation elements equal, up to the
## collator's level, to those of substring.
## Scalar context: position of the first match, or -1.
## List context: (position, length) of the match, or an empty list.
## Position and length are counted in characters of the string as split
## by splitCE.  An empty substring, or one consisting only of elements
## that carry no weight at the compared levels, matches at position 0
## with length 0.
##
sub index
{
  my $self = shift;
  my $lev  = $self->{level};
  my $str  = $self->splitCE(shift);
  my $sub  = $self->splitCE(shift);

  return wantarray ? (0,0) : 0 if ! @$sub;
  return wantarray ?  ()  : -1 if ! @$str;

  # keep only the collation elements that have a weight at every level
  my @subWt = grep _ignorableAtLevel($_,$lev),
              map $self->getWt($_), @$sub;

  # nothing left to look for: same result as for an empty substring
  # (otherwise the window below would "match" before anything was read)
  return wantarray ? (0,0) : 0 if ! @subWt;

  # @strWt is a sliding window over the weights of the string;
  # @strPt holds, for each weight, the position of its element
  my(@strWt,@strPt);
  my $count = 0;
  for my $e (@$str){
    my @tmp = grep _ignorableAtLevel($_,$lev), $self->getWt($e);
    push @strWt, @tmp;
    push @strPt, ($count) x @tmp;
    $count += length $e;
    while(@strWt >= @subWt){
      if(_eqArray(\@strWt, \@subWt, $lev)){
        my $pos = $strPt[0];
        return wantarray ? ($pos, $count-$pos) : $pos;
      }
      shift @strWt;
      shift @strPt;
    }
  }
  return wantarray ? () : -1;
}

##
## bool _eqArray(arrayref, arrayref, level)
##
## Both arguments are arrayrefs of collation elements (arrayrefs of
## weights); the first must be at least as long as the second.
## Returns 1 when, at each of the first "level" weight positions, the
## leading elements of the first array equal those of the second;
## returns nothing otherwise.
##
## No prototype: the sub is called before it is defined, so a prototype
## would not be applied and would only cause a
## "called too early to check prototype" warning.
##
sub _eqArray
{
  my ($long, $short, $lev) = @_; # length $long >= length $short
  for my $v (0..$lev-1){
    for my $c (0..@$short-1){
      return if $long->[$c][$v] != $short->[$c][$v];
    }
  }
  return 1;
}


##
## bool _ignorableAtLevel(CE, level)
##
## Despite its name this is the test used to KEEP a collation element:
## it returns true when the CE has a non-zero weight at every one of the
## first "level" positions, and false when any of them is zero or missing.
## Returns nothing for an undefined CE.
##
## No prototype: the sub is called before it is defined, so a prototype
## would not be applied and would only cause a
## "called too early to check prototype" warning.
##
sub _ignorableAtLevel
{
  my ($ce, $lv) = @_;
  return if ! defined $ce;
  for my $v (0..$lv-1){
    return !1 if ! $ce->[$v];
  }
  return !0;
}


##
## string sortkey = getSortKey(string arg)
##
## Builds the binary sort key of a string: for each level up to the
## collator's level, the non-zero weights of all collation elements are
## collected (reversed for the levels listed in "backwards"), packed as
## big-endian 16-bit units, and the four levels are joined with a
## two-byte zero separator.  The upper_before_lower and
## katakana_before_hiragana options remap the level 3 weights first.
##
sub getSortKey
{
  my $self = shift;
  my $lev  = $self->{level};
  my $rCE  = $self->splitCE(shift); # get an arrayref

  # weight arrays
  my @ces = grep { defined } map { $self->getWt($_) } @$rCE;

  # make sort key: one list of weights per level, zero weights left out
  my @level = ([],[],[],[]);
  for my $ce (@ces){
    for my $v (0..$lev-1){
      my $wt = $ce->[$v];
      push @{ $level[$v] }, $wt if $wt;
    }
  }
  for my $n (@{ $self->{backwards} }){
    my $v = $n - 1; # level numbers are 1-based
    @{ $level[$v] } = reverse @{ $level[$v] };
  }

  # modification of tertiary weights (done in place through the loop alias)
  if($self->{upper_before_lower}){
    for my $wt (@{ $level[2] }){
      if   (0x8 <= $wt && $wt <= 0xC){ $wt -= 6 } # 0x08..0x0C => 0x02..0x06
      elsif(0x2 <= $wt && $wt <= 0x6){ $wt += 6 } # 0x02..0x06 => 0x08..0x0C
      elsif($wt == 0x1C)             { $wt += 1 } # square upper
      elsif($wt == 0x1D)             { $wt -= 1 } # square lower
    }
  }
  if($self->{katakana_before_hiragana}){
    for my $wt (@{ $level[2] }){
      if   (0x0F <= $wt && $wt <= 0x13){ $wt -= 2 } # katakana
      elsif(0x0D <= $wt && $wt <= 0x0E){ $wt += 5 } # hiragana
    }
  }

  return join "\0\0", map { pack('n*', @$_) } @level;
}


##
## int compare = cmp(string a, string b)
##
## Compares two strings by comparing their sort keys bytewise;
## returns -1, 0 or 1 like the built-in cmp operator.
##
sub cmp
{
  my ($obj, $left, $right) = @_;
  my $leftKey  = $obj->getSortKey($left);
  my $rightKey = $obj->getSortKey($right);
  return $leftKey cmp $rightKey;
}

##
## list[strings] sorted = sort(list[strings] arg)
##
## Sorts the strings by their sort keys.  Each key is computed once and
## paired with its string, the pairs are ordered by key, and the strings
## are taken back out of the pairs.
##
sub sort
{
  my $obj = shift;

  my @keyed   = map { [ $obj->getSortKey($_), $_ ] } @_;
  my @ordered = sort { $a->[0] cmp $b->[0] } @keyed;
  return map { $_->[1] } @ordered;
}

##
## list[arrayrefs] CE = _derivCE(int codepoint)
##
## Derives the two collation elements of a codepoint without a table entry:
##   [ UNDEFINED + (code >> 15), 2, 1, code ]
##   [ (code & 0x7FFF) | 0x8000, 0, 0, code ]
## The second primary weight always has bit 15 set and so is never zero;
## both elements are therefore always returned.
## (An earlier variant with 0xFFC2 and 0x1000 was marked as not good by
## the original author.)
## NOTE(review): the weights 2 and 1 differ from the Min2 / Min3 constants
## declared in this file -- confirm which values are intended.
##
sub _derivCE
{
  my $code  = shift;
  my $upper = UNDEFINED + ($code >> 15);
  my $lower = ($code & 0x7FFF) | 0x8000;
  return ([$upper,2,1,$code], [$lower,0,0,$code]);
}

##
## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
##
## Returns the numeric value of every run of hexadecimal digits found in
## the string, in order; any other character acts as a separator.
##
sub _getHexArray
{
  my ($str) = @_;
  my @digits = $str =~ /([0-9a-fA-F]+)/g;
  return map { hex($_) } @digits;
}

##
## bool is_a_CJK_Unified_Ideograph = _isCJK(int codepoint)
##
## True for U+3400..U+4DB5 and for U+4E00..U+9FA5.
## The range U+20000..U+2A6D6 is deliberately left out (it was commented
## out in the original test).
##
sub _isCJK
{
  my ($u) = @_;
  my $inFirstRange  = $u >= 0x3400 && $u <= 0x4DB5;
  my $inSecondRange = $u >= 0x4E00 && $u <= 0x9FA5;
# my $inThirdRange  = $u >= 0x20000 && $u <= 0x2A6D6;
  return $inFirstRange || $inSecondRange;
}

##
## list[arrayref] CE = _CJK(int codepoint_of_CJK)
##
## For a codepoint up to U+FFFF, returns a single collation element whose
## primary weight is the codepoint itself: [code, 0x20, 0x02, code].
## Codepoints above U+FFFF are handed over to _derivCE.
##
sub _CJK
{
  my ($u) = @_;
  return _derivCE($u) if $u > 0xFFFF;
  return [$u,0x20,0x02,$u];
}

##
## bool is_a_Hangul_Syllable = _isHangul(int codepoint)
##
## True for codepoints in U+AC00..U+D7A3, both ends included.
##
sub _isHangul
{
  my ($code) = @_;
  return $code >= 0xAC00 && $code <= 0xD7A3;
}

1;
__END__

=head1 NAME

Unicode::Collate - use UCA (Unicode Collation Algorithm)

=head1 SYNOPSIS

  use Unicode::Collate;

  #construct
  $UCA = Unicode::Collate->new(%tailoring);

  #sort
  @sorted = $UCA->sort(@not_sorted);

  #compare
  $result = $UCA->cmp($a, $b); # returns 1, 0, or -1. 

=head1 DESCRIPTION

=head2 Constructor and Tailoring

The C<new> method returns a collator object.

   $UCA = Unicode::Collate->new(
      alternate => $alternate,
      backwards => $levelNumber, # or \@levelNumbers
      entry => $element,
      normalization  => $normalization_form,
      ignoreName => qr/$ignoreName/,
      ignoreChar => qr/$ignoreChar/,
      katakana_before_hiragana => $bool,
      level => $collationLevel,
      overrideCJK => \&overrideCJK,
      overrideHangul => \&overrideHangul,
      preprocess => \&preprocess,
      rearrange => \@charList,
      table => $filename,
      undefName => qr/$undefName/,
      undefChar => qr/$undefChar/,
      upper_before_lower => $bool,
   );
   # if %tailoring is false (empty),
   # $UCA should do the default collation.

=over 4

=item alternate

-- see 3.2.2 Alternate Weighting, UTR #10.

   alternate => 'shifted', 'blanked', 'non-ignorable', or 'shift-trimmed'.

By default (if specification is omitted), 'shifted' is adopted.

=item backwards

-- see 3.1.2 French Accents, UTR #10.

     backwards => $levelNumber or \@levelNumbers

Weights in reverse order; ex. level 2 (diacritic ordering) in French.
If omitted, forwards at all the levels.

=item entry

-- see 3.1 Linguistic Features; 3.2.1 File Format, UTR #10.

Overrides a default order or adds a new element

  entry => <<'ENTRIES', # use the UCA file format
00E6 ; [.0861.0020.0002.00E6] [.08B1.0020.0002.00E6] # ligature <ae> as <a e>
0063 0068 ; [.0893.0020.0002.0063]      # "ch" in traditional Spanish
0043 0068 ; [.0893.0020.0008.0043]      # "Ch" in traditional Spanish
ENTRIES

=item ignoreName

=item ignoreChar

-- see Completely Ignorable, 3.2.2 Alternate Weighting, UTR #10.

Ignores the entry in the table.
If an ignored collation element appears in the string to be collated,
it is ignored as if the element had been deleted from there.

E.g. when 'a' and 'e' are ignored,
'element' is equal to 'lament' (or 'lmnt').

=item level

-- see 4.3 Form a sort key for each string, UTR #10.

Set the maximum level.
Any higher levels than the specified one are ignored.

  Level 1: alphabetic ordering
  Level 2: diacritic ordering
  Level 3: case ordering
  Level 4: tie-breaking (e.g. in the case when alternate is 'shifted')

  ex.level => 2,

=item normalization

-- see 4.1 Normalize each input string, UTR #10.

If specified, strings are normalized before their sort keys are prepared
(normalization is carried out after preprocessing).

As a form name, one of the following names must be used.

  'C'  or 'NFC'  for Normalization Form C
  'D'  or 'NFD'  for Normalization Form D
  'KC' or 'NFKC' for Normalization Form KC
  'KD' or 'NFKD' for Normalization Form KD

If omitted, the string is put into Normalization Form D.

If undefined explicitly (as C<normalization =E<gt> undef>),
no normalization is carried out (this may make tailoring easier
if normalization is not desired).

see B<CAVEAT>.

=item overrideCJK

=item overrideHangul

-- see 7.1 Derived Collation Elements, UTR #10.

By default, mapping of CJK Unified Ideographs
uses the Unicode codepoint order
and Hangul Syllables are decomposed into Hangul Jamo.

The mapping of CJK Unified Ideographs
or Hangul Syllables may be overridden.

ex. CJK Unified Ideographs in the JIS codepoint order.

  overrideCJK => sub {
    my $u = shift;               # get unicode codepoint
    my $b = pack('n', $u);       # to UTF-16BE
    my $s = your_unicode_to_sjis_converter($b); # convert
    my $n = unpack('n', $s);     # convert sjis to short
    [ $n, 1, 1 ];                # return collation element
  },

If you want to override the mapping of Hangul Syllables,
the Normalization Forms D and KD are not appropriate
(they will be decomposed before overriding).

=item preprocess

-- see 5.1 Preprocessing, UTR #10.

If specified, the coderef is used to preprocess
before the formation of sort keys.

ex. dropping English articles, such as "a" or "the". 
Then, "the pen" is before "a pencil".

     preprocess => sub {
           my $str = shift;
           $str =~ s/\b(?:an?|the)\s+//g;
           $str;
        },

=item rearrange

-- see 3.1.3 Rearrangement, UTR #10.

Characters that are not coded in logical order and to be rearranged.
By default, 

    rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],

=item table

-- see 3.2 Default Unicode Collation Element Table, UTR #10.

You can use another element table if desired.
The table file must be in your C<lib/Unicode/Collate> directory.

By default, the file C<lib/Unicode/Collate/allkeys.txt> is used.

=item undefName

=item undefChar

-- see 6.3.4 Reducing the Repertoire, UTR #10.

Undefines the collation element as if it were unassigned in the table.
This reduces the size of the table.
If an unassigned character appears in the string to be collated,
the sort key is made from its codepoint
as a single-character collation element,
as it is greater than any other assigned collation elements
(in the codepoint order among the unassigned characters).
But, it'd be better to ignore characters
unfamiliar to you and maybe never used.

=item katakana_before_hiragana

=item upper_before_lower

-- see 6.6 Case Comparisons; 7.3.1 Tertiary Weight Table, UTR #10.

By default, lowercase is before uppercase
and hiragana is before katakana.

If the parameter is true, this is reversed.

=back

=head2 Other methods

=over 4

=item C<@sorted = $UCA-E<gt>sort(@not_sorted)>

Sorts a list of strings.

=item C<$result = $UCA-E<gt>cmp($a, $b)>

Returns 1 (when C<$a> is greater than C<$b>)
or 0 (when C<$a> is equal to C<$b>)
or -1 (when C<$a> is less than C<$b>).

=item C<$sortKey = $UCA-E<gt>getSortKey($string)>

-- see 4.3 Form a sort key for each string, UTR #10.

Returns a sort key.

You compare the sort keys using a binary comparison
and get the result of the comparison of the strings using UCA.

   $UCA->getSortKey($a) cmp $UCA->getSortKey($b)

      is equivalent to

   $UCA->cmp($a, $b)

=item C<$position = $UCA-E<gt>index($string, $substring)>

=item C<($position, $length) = $UCA-E<gt>index($string, $substring)>

-- see 6.8 Searching, UTR #10.

If C<$substring> matches a part of C<$string>, returns
the position of the first occurrence of the matching part in scalar context;
in list context, returns a two-element list of
the position and the length of the matching part.

B<Notice> that the length of the matching part may differ from
the length of C<$substring>.

B<Note> that the position and the length are counted on the string
after the process of preprocess, normalization, and rearrangement.
Therefore, in case the specified string is not binary equal to
the preprocessed/normalized/rearranged string, the position and the length
may differ from those on the specified string. But it is guaranteed
that, if matched, it returns a non-negative value as C<$position>.

If C<$substring> does not match any part of C<$string>,
returns C<-1> in scalar context and
an empty list in list context.

e.g. you say

  my $UCA = Unicode::Collate->new( normalization => undef, level => 1 );
  my $str = "Ich mu\x{00DF} studieren.";
  my $sub = "m\x{00FC}ss";
  my $match;
  if(my @tmp = $UCA->index($str, $sub)){
    $match = substr($str, $tmp[0], $tmp[1]);
  }

and get C<"mu\x{00DF}"> in C<$match> since C<"mu>E<223>C<">
is primary equal to C<"m>E<252>C<ss">. 

=back

=head2 EXPORT

None by default.

=head2 CAVEAT

Use of the C<normalization> parameter requires
the B<Unicode::Normalize> module.

If you do not need it (e.g. when you do not need to
handle any combining characters),
assign C<normalization =E<gt> undef> explicitly.

=head1 AUTHOR

SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>

  http://homepage1.nifty.com/nomenclator/perl/

  Copyright(C) 2001, SADAHIRO Tomoyuki. Japan. All rights reserved.

  This program is free software; you can redistribute it and/or 
  modify it under the same terms as Perl itself.

=head1 SEE ALSO

=over 4

=item L<Lingua::KO::Hangul::Util>

utility functions for Hangul Syllables

=item L<Unicode::Normalize>

normalized forms of Unicode text

=item Unicode Collation Algorithm - Unicode TR #10

http://www.unicode.org/unicode/reports/tr10/

=back

=cut
Commit	Line	Data
45394607	1	package Unicode::Collate;
	2
	3	use 5.006;
	4	use strict;
	5	use warnings;
	6	use Carp;
45394607	7	require Exporter;
45394607	8
d16e9e3d	9	our $VERSION = '0.08';
45394607	10	our $PACKAGE = __PACKAGE__;
	11
	12	our @ISA = qw(Exporter);
	13
	14	our %EXPORT_TAGS = ();
	15	our @EXPORT_OK = ();
	16	our @EXPORT = ();
	17
	18	(our $Path = $INC{'Unicode/Collate.pm'}) =~ s/\.pm$//;
	19	our $KeyFile = "allkeys.txt";
	20
ac5ea531	21	# Lingua::KO::Hangul::Util not part of the standard distribution
	22	# but it will be used if available.
	23
	24	eval { require Lingua::KO::Hangul::Util };
	25	my $hasHangulUtil = ! $@;
	26	if ($hasHangulUtil) {
	27	Lingua::KO::Hangul::Util->import();
	28	}
	29
45394607	30	our %Combin; # combining class from Unicode::Normalize
	31
	32	use constant Min2 => 0x20; # minimum weight at level 2
	33	use constant Min3 => 0x02; # minimum weight at level 3
	34	use constant UNDEFINED => 0xFF80; # special value for undefined CE
	35
	36	##
	37	## constructor
	38	##
	39	sub new
	40	{
	41	my $class = shift;
	42	my $self = bless { @_ }, $class;
	43
	44	# alternate
	45	$self->{alternate} =
	46	! exists $self->{alternate} ? 'shifted' :
	47	! defined $self->{alternate} ? '' : $self->{alternate};
	48
	49	# collation level
d16e9e3d	50	$self->{level} \|\|= ($self->{alternate} =~ /shift/ ? 4 : 3);
45394607	51
	52	# normalization form
	53	$self->{normalization} = 'D' if ! exists $self->{normalization};
	54
	55	eval "use Unicode::Normalize;" if defined $self->{normalization};
	56
	57	$self->{normalize} =
	58	! defined $self->{normalization} ? undef :
	59	$self->{normalization} =~ /^(?:NF)?C$/ ? \&NFC :
	60	$self->{normalization} =~ /^(?:NF)?D$/ ? \&NFD :
	61	$self->{normalization} =~ /^(?:NF)?KC$/ ? \&NFKC :
	62	$self->{normalization} =~ /^(?:NF)?KD$/ ? \&NFKD :
	63	croak "$PACKAGE unknown normalization form name: $self->{normalization}";
	64
	65	*Combin = \%Unicode::Normalize::Combin if $self->{normalize} && ! %Combin;
	66
	67	# backwards
	68	$self->{backwards} \|\|= [];
	69	$self->{backwards} = [ $self->{backwards} ] if ! ref $self->{backwards};
	70
	71	# rearrange
	72	$self->{rearrange} \|\|= []; # maybe not U+0000 (an ASCII)
	73	$self->{rearrange} = [ $self->{rearrange} ] if ! ref $self->{rearrange};
	74
	75	# open the table file
	76	my $file = defined $self->{table} ? $self->{table} : $KeyFile;
	77	open my $fk, "<$Path/$file" or croak "File does not exist at $Path/$file";
	78
	79	while(<$fk>){
	80	next if /^\s*#/;
	81	if(/^\s*\@/){
	82	if(/^\@version\s(\S)/){
	83	$self->{version} \|\|= $1;
	84	}
	85	elsif(/^\@alternate\s+(.*)/){
	86	$self->{alternate} \|\|= $1;
	87	}
	88	elsif(/^\@backwards\s+(.*)/){
	89	push @{ $self->{backwards} }, $1;
	90	}
	91	elsif(/^\@rearrange\s+(.*)/){
	92	push @{ $self->{rearrange} }, _getHexArray($1);
	93	}
	94	next;
	95	}
	96	$self->parseEntry($_);
	97	}
	98	close $fk;
	99	if($self->{entry}){
	100	$self->parseEntry($_) foreach split /\n/, $self->{entry};
	101	}
	102
	103	# keys of $self->{rearrangeHash} are $self->{rearrange}.
	104	$self->{rearrangeHash} = {};
	105	@{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
	106
	107	return $self;
	108	}
	109
	110	##
	111	## get $line, parse it, and write an entry in $self
	112	##
	113	sub parseEntry
	114	{
115	my $self = shift;
116	my $line = shift;
117	my($name, $ele, @key);
118
119	return if $line !~ /^\s*[0-9A-Fa-f]/;
120
121	# get name
122	$name = $1 if $line =~ s/#\s(.)//;
123	return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
124
125	# get element
126	my($e, $k) = split /;/, $line;
127	my @e = _getHexArray($e);
128	$ele = pack('U*', @e);
129	return if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
130
131	# get sort key
132	if(
133	defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/ \|\|
134	defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/
135	)
136	{
d16e9e3d	137	$self->{entries}{$ele} = $self->{ignored}{$ele} = 1;
45394607	138	}
	139	else
	140	{
	141	foreach my $arr ($k =~ /\[(\S+)\]/g) {
	142	my $var = $arr =~ /\*/;
d16e9e3d	143	push @key, $self->altCE( $var, _getHexArray($arr) );
45394607	144	}
	145	$self->{entries}{$ele} = \@key;
	146	}
	147	$self->{maxlength}{ord $ele} = scalar @e if @e > 1;
	148	}
	149
	150
	151	##
d16e9e3d	152	## arrayref CE = altCE(bool variable?, list[num] weights)
45394607	153	##
d16e9e3d	154	sub altCE
45394607	155	{
	156	my $self = shift;
	157	my $var = shift;
	158	my @c = @_;
	159
	160	$self->{alternate} eq 'blanked' ?
d16e9e3d	161	$var ? [0,0,0] : [ @c[0..2] ] :
	162	$self->{alternate} eq 'non-ignorable' ?
	163	[ @c[0..2] ] :
45394607	164	$self->{alternate} eq 'shifted' ?
	165	$var ? [0,0,0,$c[0] ] : [ @c[0..2], $c[0]+$c[1]+$c[2] ? 0xFFFF : 0 ] :
	166	$self->{alternate} eq 'shift-trimmed' ?
	167	$var ? [0,0,0,$c[0] ] : [ @c[0..2], 0 ] :
	168	\@c;
	169	}
	170
	171	##
d16e9e3d	172	## string hex_sortkey = splitCE(string arg)
45394607	173	##
	174	sub viewSortKey
	175	{
	176	my $self = shift;
	177	my $key = $self->getSortKey(@_);
	178	my $view = join " ", map sprintf("%04X", $_), unpack 'n*', $key;
	179	$view =~ s/ ?0000 ?/\|/g;
	180	"[$view]";
	181	}
	182
d16e9e3d	183
45394607	184	##
d16e9e3d	185	## list[strings] elements = splitCE(string arg)
45394607	186	##
d16e9e3d	187	sub splitCE
45394607	188	{
	189	my $self = shift;
	190	my $code = $self->{preprocess};
	191	my $norm = $self->{normalize};
	192	my $ent = $self->{entries};
45394607	193	my $max = $self->{maxlength};
45394607	194	my $rear = $self->{rearrangeHash};
	195
	196	my $str = ref $code ? &$code(shift) : shift;
	197	$str = &$norm($str) if ref $norm;
	198
	199	my(@src, @buf);
	200	@src = unpack('U*', $str);
	201
	202	# rearrangement
	203	for(my $i = 0; $i < @src; $i++)
	204	{
	205	($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i])
	206	if $rear->{ $src[$i] };
	207	$i++;
	208	}
	209
	210	for(my $i = 0; $i < @src; $i++)
	211	{
	212	my $ch;
	213	my $u = $src[$i];
	214
	215	# non-characters
	216	next if $u < 0 \|\| 0x10FFFF < $u # out of range
	217	\|\| 0xD800 < $u && $u < 0xDFFF; # unpaired surrogates
	218	my $four = $u & 0xFFFF;
	219	next if $four == 0xFFFE \|\| $four == 0xFFFF;
	220
	221	if($max->{$u}) # contract
	222	{
	223	for(my $j = $max->{$u}; $j >= 1; $j--)
	224	{
	225	next unless $i+$j-1 < @src;
	226	$ch = pack 'U*', @src[$i .. $i+$j-1];
	227	$i += $j-1, last if $ent->{$ch};
	228	}
	229	}
	230	else { $ch = pack('U', $u) }
	231
	232	if(%Combin && defined $ch) # with Combining Char
	233	{
	234	for(my $j = $i+1; $j < @src && $Combin{ $src[$j] }; $j++)
	235	{
	236	my $comb = pack 'U', $src[$j];
	237	next if ! $ent->{ $ch.$comb };
	238	$ch .= $comb;
	239	splice(@src, $j, 1);
	240	last;
	241	}
	242	}
d16e9e3d	243	push @buf, $ch;
	244	}
	245	wantarray ? @buf : \@buf;
	246	}
45394607	247
d16e9e3d	248
	249	##
	250	## list[arrayrefs] weight = getWt(string element)
	251	##
	252	sub getWt
	253	{
	254	my $self = shift;
	255	my $ch = shift;
	256	my $ent = $self->{entries};
	257	my $ign = $self->{ignored};
	258	my $cjk = $self->{overrideCJK};
	259	my $hang = $self->{overrideHangul};
	260	return if !defined $ch \|\| $ign->{$ch}; # ignored
	261	return @{ $ent->{$ch} } if $ent->{$ch};
	262	my $u = unpack('U', $ch);
	263	return
	264	_isHangul($u)
	265	? $hang
	266	? &$hang($u)
ac5ea531	267	: ($hasHangulUtil ?
	268	map(@{ $ent->{pack('U', $_)} }, decomposeHangul($u)) :
	269	# runtime compile error...
	270	(eval 'use Lingua::KO::Hangul::Util', print $@))
d16e9e3d	271	: _isCJK($u)
	272	? $cjk ? &$cjk($u) : map($self->altCE(0,@$_), _CJK($u))
	273	: map($self->altCE(0,@$_), _derivCE($u));
	274	}
	275
	276	##
	277	## int = index(string, substring)
	278	##
	279	sub index
	280	{
	281	my $self = shift;
	282	my $lev = $self->{level};
	283	my $str = $self->splitCE(shift);
	284	my $sub = $self->splitCE(shift);
	285
	286	return wantarray ? (0,0) : 0 if ! @$sub;
	287	return wantarray ? () : -1 if ! @$str;
	288
	289	my @subWt = grep _ignorableAtLevel($_,$lev),
	290	map $self->getWt($_), @$sub;
	291
	292	my(@strWt,@strPt);
	293	my $count = 0;
	294	for my $e (@$str){
	295	my @tmp = grep _ignorableAtLevel($_,$lev), $self->getWt($e);
	296	push @strWt, @tmp;
	297	push @strPt, ($count) x @tmp;
	298	$count += length $e;
	299	while(@strWt >= @subWt){
	300	if(_eqArray(\@strWt, \@subWt, $lev)){
	301	my $pos = $strPt[0];
	302	return wantarray ? ($pos, $count-$pos) : $pos;
	303	}
	304	shift @strWt;
	305	shift @strPt;
	306	}
	307	}
	308	return wantarray ? () : -1;
	309	}
	310
	311	##
	312	## bool _eqArray(arrayref, arrayref, level)
	313	##
	314	sub _eqArray($$$)
	315	{
	316	my $a = shift; # length $a >= length $b;
	317	my $b = shift;
	318	my $lev = shift;
	319	for my $v (0..$lev-1){
	320	for my $c (0..@$b-1){
	321	return if $a->[$c][$v] != $b->[$c][$v];
	322	}
45394607	323	}
d16e9e3d	324	return 1;
	325	}
	326
	327
	328	##
	329	## bool _ignorableAtLevel(CE, level)
	330	##
	331	sub _ignorableAtLevel($$)
	332	{
	333	my $ce = shift;
	334	return if ! defined $ce;
	335	my $lv = shift;
	336	! grep { ! $ce->[$_] } 0..$lv-1;
	337	}
	338
	339
	340	##
	341	## string sortkey = getSortKey(string arg)
	342	##
	343	sub getSortKey
	344	{
	345	my $self = shift;
	346	my $lev = $self->{level};
	347	my $rCE = $self->splitCE(shift); # get an arrayref
	348
	349	# weight arrays
	350	my @buf = grep defined(), map $self->getWt($_), @$rCE;
45394607	351
	352	# make sort key
	353	my @ret = ([],[],[],[]);
	354	foreach my $v (0..$lev-1){
	355	foreach my $b (@buf){
	356	push @{ $ret[$v] }, $b->[$v] if $b->[$v];
	357	}
	358	}
	359	foreach (@{ $self->{backwards} }){
	360	my $v = $_ - 1;
	361	@{ $ret[$v] } = reverse @{ $ret[$v] };
	362	}
	363
	364	# modification of tertiary weights
	365	if($self->{upper_before_lower}){
	366	foreach (@{ $ret[2] }){
	367	if (0x8 <= $_ && $_ <= 0xC){ $_ -= 6 } # lower
	368	elsif(0x2 <= $_ && $_ <= 0x6){ $_ += 6 } # upper
	369	elsif($_ == 0x1C) { $_ += 1 } # square upper
	370	elsif($_ == 0x1D) { $_ -= 1 } # square lower
	371	}
	372	}
	373	if($self->{katakana_before_hiragana}){
	374	foreach (@{ $ret[2] }){
	375	if (0x0F <= $_ && $_ <= 0x13){ $_ -= 2 } # katakana
	376	elsif(0x0D <= $_ && $_ <= 0x0E){ $_ += 5 } # hiragana
	377	}
	378	}
	379	join "\0\0", map pack('n*', @$_), @ret;
	380	}
	381
	382
	383	##
d16e9e3d	384	## int compare = cmp(string a, string b)
45394607	385	##
	386	sub cmp
	387	{
	388	my $obj = shift;
	389	my $a = shift;
	390	my $b = shift;
	391	$obj->getSortKey($a) cmp $obj->getSortKey($b);
	392	}
	393
	394	##
d16e9e3d	395	## list[strings] sorted = sort(list[strings] arg)
45394607	396	##
	397	sub sort
	398	{
	399	my $obj = shift;
	400
	401	map { $_->[1] }
	402	sort{ $a->[0] cmp $b->[0] }
	403	map [ $obj->getSortKey($_), $_ ], @_;
	404	}
	405
	406	##
d16e9e3d	407	## list[arrayrefs] CE = _derivCE(int codepoint)
45394607	408	##
	409	sub _derivCE
	410	{
	411	my $code = shift;
	412	my $a = UNDEFINED + ($code >> 15); # ok
	413	my $b = ($code & 0x7FFF) \| 0x8000; # ok
	414	# my $a = 0xFFC2 + ($code >> 15); # ng
	415	# my $b = $code & 0x7FFF \| 0x1000; # ng
	416	$b ? ([$a,2,1,$code],[$b,0,0,$code]) : [$a,2,1,$code];
	417	}
	418
	419	##
	420	## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
	421	##
	422	sub _getHexArray
	423	{
	424	my $str = shift;
	425	map hex(), $str =~ /([0-9a-fA-F]+)/g;
	426	}
	427
	428	##
d16e9e3d	429	## bool is_a_CJK_Unified_Ideograph = _isCJK(int codepoint)
45394607	430	##
	431	sub _isCJK
	432	{
	433	my $u = shift;
	434	return 0x3400 <= $u && $u <= 0x4DB5
	435	\|\| 0x4E00 <= $u && $u <= 0x9FA5
	436	# \|\| 0x20000 <= $u && $u <= 0x2A6D6;
	437	}
	438
	439	##
d16e9e3d	440	## list[arrayref] CE = _CJK(int codepoint_of_CJK)
45394607	441	##
	442	sub _CJK
	443	{
	444	my $u = shift;
	445	$u > 0xFFFF ? _derivCE($u) : [$u,0x20,0x02,$u];
	446	}
	447
	448	##
d16e9e3d	449	## bool is_a_Hangul_Syllable = _isHangul(int codepoint)
45394607	450	##
	451	sub _isHangul
	452	{
	453	my $code = shift;
	454	return 0xAC00 <= $code && $code <= 0xD7A3;
	455	}
	456
	457	1;
	458	__END__
	459
	460	=head1 NAME
	461
	462	Unicode::Collate - use UCA (Unicode Collation Algorithm)
	463
	464	=head1 SYNOPSIS
	465
	466	use Unicode::Collate;
	467
	468	#construct
	469	$UCA = Unicode::Collate->new(%tailoring);
	470
	471	#sort
	472	@sorted = $UCA->sort(@not_sorted);
	473
	474	#compare
	475	$result = $UCA->cmp($a, $b); # returns 1, 0, or -1.
	476
	477	=head1 DESCRIPTION
	478
	479	=head2 Constructor and Tailoring
	480
d16e9e3d	481	The C<new> method returns a collator object.
d16e9e3d	482
45394607	483	$UCA = Unicode::Collate->new(
	484	alternate => $alternate,
	485	backwards => $levelNumber, # or \@levelNumbers
	486	entry => $element,
	487	normalization => $normalization_form,
	488	ignoreName => qr/$ignoreName/,
	489	ignoreChar => qr/$ignoreChar/,
	490	katakana_before_hiragana => $bool,
	491	level => $collationLevel,
	492	overrideCJK => \&overrideCJK,
	493	overrideHangul => \&overrideHangul,
	494	preprocess => \&preprocess,
	495	rearrange => \@charList,
	496	table => $filename,
	497	undefName => qr/$undefName/,
	498	undefChar => qr/$undefChar/,
	499	upper_before_lower => $bool,
	500	);
	501	# if %tailoring is false (empty),
	502	# $UCA should do the default collation.
	503
	504	=over 4
	505
	506	=item alternate
	507
	508	-- see 3.2.2 Alternate Weighting, UTR #10.
	509
	510	alternate => 'shifted', 'blanked', 'non-ignorable', or 'shift-trimmed'.
	511
	512	By default (if specification is omitted), 'shifted' is adopted.
	513
	514	=item backwards
	515
	516	-- see 3.1.2 French Accents, UTR #10.
	517
	518	backwards => $levelNumber or \@levelNumbers
	519
	520	Weights in reverse order; ex. level 2 (diacritic ordering) in French.
	521	If omitted, forwards at all the levels.
	522
	523	=item entry
	524
	525	-- see 3.1 Linguistic Features; 3.2.1 File Format, UTR #10.
	526
	527	Overrides a default order or adds a new element
	528
	529	entry => <<'ENTRIES', # use the UCA file format
	530	00E6 ; [.0861.0020.0002.00E6] [.08B1.0020.0002.00E6] # ligature <ae> as <a e>
	531	0063 0068 ; [.0893.0020.0002.0063] # "ch" in traditional Spanish
	532	0043 0068 ; [.0893.0020.0008.0043] # "Ch" in traditional Spanish
	533	ENTRIES
	534
	535	=item ignoreName
	536
	537	=item ignoreChar
	538
	539	-- see Completely Ignorable, 3.2.2 Alternate Weighting, UTR #10.
	540
	541	Ignores the entry in the table.
	542	If an ignored collation element appears in the string to be collated,
	543	it is ignored as if the element had been deleted from there.
	544
	545	E.g. when 'a' and 'e' are ignored,
	546	'element' is equal to 'lament' (or 'lmnt').
547
548	=item level
549
550	-- see 4.3 Form a sort key for each string, UTR #10.
551
552	Set the maximum level.
553	Any higher levels than the specified one are ignored.
554
555	Level 1: alphabetic ordering
556	Level 2: diacritic ordering
557	Level 3: case ordering
558	Level 4: tie-breaking (e.g. in the case when alternate is 'shifted')
559
560	ex. level => 2,
561
562	=item normalization
563
564	-- see 4.1 Normalize each input string, UTR #10.
565
566	If specified, strings are normalized before preparation of sort keys
567	(the normalization is executed after preprocess).
568
569	As a form name, one of the following names must be used.
570
571	'C' or 'NFC' for Normalization Form C
572	'D' or 'NFD' for Normalization Form D
573	'KC' or 'NFKC' for Normalization Form KC
574	'KD' or 'NFKD' for Normalization Form KD
575
576	If omitted, the string is put into Normalization Form D.
577
578	If undefined explicitly (as C<normalization =E<gt> undef>),
579	no normalization is carried out (this may make tailoring easier
580	if normalization is not desired).
581
582	see B<CAVEAT>.
583
584	=item overrideCJK
585
586	=item overrideHangul
587
588	-- see 7.1 Derived Collation Elements, UTR #10.
589
590	By default, mapping of CJK Unified Ideographs
591	uses the Unicode codepoint order
592	and Hangul Syllables are decomposed into Hangul Jamo.
593
594	The mapping of CJK Unified Ideographs
595	or Hangul Syllables may be overridden.
596
597	ex. CJK Unified Ideographs in the JIS codepoint order.
598
599	overrideCJK => sub {
600	my $u = shift; # get unicode codepoint
601	my $b = pack('n', $u); # to UTF-16BE
602	my $s = your_unicode_to_sjis_converter($b); # convert
603	my $n = unpack('n', $s); # convert sjis to short
604	[ $n, 1, 1 ]; # return collation element
605	},
606
607	If you want to override the mapping of Hangul Syllables,
608	the Normalization Forms D and KD are not appropriate
609	(they will be decomposed before overriding).
610
611	=item preprocess
612
613	-- see 5.1 Preprocessing, UTR #10.
614
615	If specified, the coderef is used to preprocess
616	before the formation of sort keys.
617
618	ex. dropping English articles, such as "a" or "the".
619	Then, "the pen" is before "a pencil".
620
621	preprocess => sub {
622	my $str = shift;
623	$str =~ s/\b(?:an?|the)\s+//g;
624	$str;
625	},
626
627	=item rearrange
628
629	-- see 3.1.3 Rearrangement, UTR #10.
630
631	Characters that are not coded in logical order and are to be rearranged.
632	By default,
633
634	rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
635
636	=item table
637
638	-- see 3.2 Default Unicode Collation Element Table, UTR #10.
639
640	You can use another element table if desired.
641	The table file must be in your C<lib/Unicode/Collate> directory.
642
643	By default, the file C<lib/Unicode/Collate/allkeys.txt> is used.
644
645	=item undefName
646
647	=item undefChar
648
649	-- see 6.3.4 Reducing the Repertoire, UTR #10.
650
651	Undefines the collation element as if it were unassigned in the table.
652	This reduces the size of the table.
653	If an unassigned character appears in the string to be collated,
654	the sort key is made from its codepoint
655	as a single-character collation element,
656	as it is greater than any other assigned collation elements
657	(in the codepoint order among the unassigned characters).
658	But, it'd be better to ignore characters
659	unfamiliar to you and maybe never used.
660
661	=item katakana_before_hiragana
662
663	=item upper_before_lower
664
665	-- see 6.6 Case Comparisons; 7.3.1 Tertiary Weight Table, UTR #10.
666
667	By default, lowercase is before uppercase
668	and hiragana is before katakana.
669
670	If the parameter is true, this is reversed.
671
672	=back
673
674	=head2 Other methods
675
676	=over 4
677
678	=item C<@sorted = $UCA-E<gt>sort(@not_sorted)>
679
680	Sorts a list of strings.
681
682	=item C<$result = $UCA-E<gt>cmp($a, $b)>
683
684	Returns 1 (when C<$a> is greater than C<$b>)
685	or 0 (when C<$a> is equal to C<$b>)
686	or -1 (when C<$a> is less than C<$b>).
687
688	=item C<$sortKey = $UCA-E<gt>getSortKey($string)>
689
690	-- see 4.3 Form a sort key for each string, UTR #10.
691
692	Returns a sort key.
693
694	You compare the sort keys using a binary comparison
695	and get the result of the comparison of the strings using UCA.
696
697	$UCA->getSortKey($a) cmp $UCA->getSortKey($b)
698
699	is equivalent to
700
701	$UCA->cmp($a, $b)
702
d16e9e3d	703	=item C<$position = $UCA-E<gt>index($string, $substring)>
	704
	705	=item C<($position, $length) = $UCA-E<gt>index($string, $substring)>
	706
	707	-- see 6.8 Searching, UTR #10.
	708
	709	If C<$substring> matches a part of C<$string>, returns
	710	the position of the first occurrence of the matching part in scalar context;
	711	in list context, returns a two-element list of
	712	the position and the length of the matching part.
	713
	714	B<Notice> that the length of the matching part may differ from
	715	the length of C<$substring>.
	716
	717	B<Note> that the position and the length are counted on the string
	718	after the process of preprocess, normalization, and rearrangement.
	719	Therefore, in case the specified string is not binary equal to
	720	the preprocessed/normalized/rearranged string, the position and the length
	721	may differ from those on the specified string. But it is guaranteed
	722	that, if matched, it returns a non-negative value as C<$position>.
	723
	724	If C<$substring> does not match any part of C<$string>,
	725	returns C<-1> in scalar context and
	726	an empty list in list context.
	727
	728	e.g. you say
	729
	730	my $UCA = Unicode::Collate->new( normalization => undef, level => 1 );
	731	my $str = "Ich mu\x{00DF} studieren.";
	732	my $sub = "m\x{00FC}ss";
	733	my $match;
	734	if(my @tmp = $UCA->index($str, $sub)){
	735	$match = substr($str, $tmp[0], $tmp[1]);
	736	}
	737
	738	and get C<"mu\x{00DF}"> in C<$match> since C<"mu>E<223>C<">
	739	is primary equal to C<"m>E<252>C<ss">.
	740
45394607	741	=back
	742
	743	=head2 EXPORT
	744
	745	None by default.
	746
	747	=head2 CAVEAT
	748
	749	Use of the C<normalization> parameter requires
	750	the B<Unicode::Normalize> module.
	751
	752	If you do not need it (e.g. in the case when you need not
	753	handle any combining characters),
	754	assign C<normalization =E<gt> undef> explicitly.
	755
	756	=head1 AUTHOR
	757
	758	SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
	759
	760	http://homepage1.nifty.com/nomenclator/perl/
	761
	762	Copyright(C) 2001, SADAHIRO Tomoyuki. Japan. All rights reserved.
	763
	764	This program is free software; you can redistribute it and/or
	765	modify it under the same terms as Perl itself.
	766
	767	=head1 SEE ALSO
	768
	769	=over 4
	770
	771	=item L<Lingua::KO::Hangul::Util>
	772
	773	utility functions for Hangul Syllables
	774
	775	=item L<Unicode::Normalize>
	776
	777	normalized forms of Unicode text
	778
	779	=item Unicode Collation Algorithm - Unicode TR #10
	780
	781	http://www.unicode.org/unicode/reports/tr10/
	782
	783	=back
	784
	785	=cut