1 package Unicode::Collate;
7 use Lingua::KO::Hangul::Util;
10 our $VERSION = '0.07';
11 our $PACKAGE = __PACKAGE__;
13 our @ISA = qw(Exporter);
15 our %EXPORT_TAGS = ();
19 (our $Path = $INC{'Unicode/Collate.pm'}) =~ s/\.pm$//;
20 our $KeyFile = "allkeys.txt";
22 our %Combin; # combining class from Unicode::Normalize
24 use constant Min2 => 0x20; # minimum weight at level 2
25 use constant Min3 => 0x02; # minimum weight at level 3
26 use constant UNDEFINED => 0xFF80; # special value for undefined CE
34 my $self = bless { @_ }, $class;
38 ! exists $self->{alternate} ? 'shifted' :
39 ! defined $self->{alternate} ? '' : $self->{alternate};
42 $self->{level} ||= $self->{alternate} =~ /shift/ ? 4 : 3;
45 $self->{normalization} = 'D' if ! exists $self->{normalization};
47 eval "use Unicode::Normalize;" if defined $self->{normalization};
50 ! defined $self->{normalization} ? undef :
51 $self->{normalization} =~ /^(?:NF)?C$/ ? \&NFC :
52 $self->{normalization} =~ /^(?:NF)?D$/ ? \&NFD :
53 $self->{normalization} =~ /^(?:NF)?KC$/ ? \&NFKC :
54 $self->{normalization} =~ /^(?:NF)?KD$/ ? \&NFKD :
55 croak "$PACKAGE unknown normalization form name: $self->{normalization}";
57 *Combin = \%Unicode::Normalize::Combin if $self->{normalize} && ! %Combin;
60 $self->{backwards} ||= [];
61 $self->{backwards} = [ $self->{backwards} ] if ! ref $self->{backwards};
64 $self->{rearrange} ||= []; # maybe not U+0000 (an ASCII)
65 $self->{rearrange} = [ $self->{rearrange} ] if ! ref $self->{rearrange};
68 my $file = defined $self->{table} ? $self->{table} : $KeyFile;
69 open my $fk, "<$Path/$file" or croak "File does not exist at $Path/$file";
74 if(/^\@version\s*(\S*)/){
75 $self->{version} ||= $1;
77 elsif(/^\@alternate\s+(.*)/){
78 $self->{alternate} ||= $1;
80 elsif(/^\@backwards\s+(.*)/){
81 push @{ $self->{backwards} }, $1;
83 elsif(/^\@rearrange\s+(.*)/){
84 push @{ $self->{rearrange} }, _getHexArray($1);
88 $self->parseEntry($_);
92 $self->parseEntry($_) foreach split /\n/, $self->{entry};
95 # keys of $self->{rearrangeHash} are $self->{rearrange}.
96 $self->{rearrangeHash} = {};
97 @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
103 ## get $line, parse it, and write an entry in $self
109 my($name, $ele, @key);
111 return if $line !~ /^\s*[0-9A-Fa-f]/;
114 $name = $1 if $line =~ s/#\s*(.*)//;
115 return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
118 my($e, $k) = split /;/, $line;
119 my @e = _getHexArray($e);
120 $ele = pack('U*', @e);
121 return if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
125 defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/ ||
126 defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/
129 $self->{ignored}{$ele} = 1;
130 $self->{entries}{$ele} = 1; # true
134 foreach my $arr ($k =~ /\[(\S+)\]/g) {
135 my $var = $arr =~ /\*/;
136 push @key, $self->getCE( $var, _getHexArray($arr) );
138 $self->{entries}{$ele} = \@key;
140 $self->{maxlength}{ord $ele} = scalar @e if @e > 1;
145 ## list to collation element
153 $self->{alternate} eq 'blanked' ?
154 $var ? [0,0,0] : [ @c[0..2] ] :
155 $self->{alternate} eq 'non-ignorable' ? [ @c[0..2] ] :
156 $self->{alternate} eq 'shifted' ?
157 $var ? [0,0,0,$c[0] ] : [ @c[0..2], $c[0]+$c[1]+$c[2] ? 0xFFFF : 0 ] :
158 $self->{alternate} eq 'shift-trimmed' ?
159 $var ? [0,0,0,$c[0] ] : [ @c[0..2], 0 ] :
169 my $key = $self->getSortKey(@_);
170 my $view = join " ", map sprintf("%04X", $_), unpack 'n*', $key;
171 $view =~ s/ ?0000 ?/|/g;
181 my $code = $self->{preprocess};
182 my $norm = $self->{normalize};
183 my $ent = $self->{entries};
184 my $ign = $self->{ignored};
185 my $max = $self->{maxlength};
186 my $lev = $self->{level};
187 my $cjk = $self->{overrideCJK};
188 my $hang = $self->{overrideHangul};
189 my $rear = $self->{rearrangeHash};
191 my $str = ref $code ? &$code(shift) : shift;
192 $str = &$norm($str) if ref $norm;
195 @src = unpack('U*', $str);
198 for(my $i = 0; $i < @src; $i++)
200 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i])
201 if $rear->{ $src[$i] };
205 for(my $i = 0; $i < @src; $i++)
211 next if $u < 0 || 0x10FFFF < $u # out of range
212 || 0xD800 < $u && $u < 0xDFFF; # unpaired surrogates
213 my $four = $u & 0xFFFF;
214 next if $four == 0xFFFE || $four == 0xFFFF;
216 if($max->{$u}) # contract
218 for(my $j = $max->{$u}; $j >= 1; $j--)
220 next unless $i+$j-1 < @src;
221 $ch = pack 'U*', @src[$i .. $i+$j-1];
222 $i += $j-1, last if $ent->{$ch};
225 else { $ch = pack('U', $u) }
227 if(%Combin && defined $ch) # with Combining Char
229 for(my $j = $i+1; $j < @src && $Combin{ $src[$j] }; $j++)
231 my $comb = pack 'U', $src[$j];
232 next if ! $ent->{ $ch.$comb };
239 next if !defined $ch || $ign->{$ch}; # ignored
247 : map(@{ $ent->{pack('U', $_)} }, decomposeHangul($u))
249 ? $cjk ? &$cjk($u) : map($self->getCE(0,@$_), _CJK($u))
250 : map($self->getCE(0,@$_), _derivCE($u));
254 my @ret = ([],[],[],[]);
255 foreach my $v (0..$lev-1){
256 foreach my $b (@buf){
257 push @{ $ret[$v] }, $b->[$v] if $b->[$v];
260 foreach (@{ $self->{backwards} }){
262 @{ $ret[$v] } = reverse @{ $ret[$v] };
265 # modification of tertiary weights
266 if($self->{upper_before_lower}){
267 foreach (@{ $ret[2] }){
268 if (0x8 <= $_ && $_ <= 0xC){ $_ -= 6 } # lower
269 elsif(0x2 <= $_ && $_ <= 0x6){ $_ += 6 } # upper
270 elsif($_ == 0x1C) { $_ += 1 } # square upper
271 elsif($_ == 0x1D) { $_ -= 1 } # square lower
274 if($self->{katakana_before_hiragana}){
275 foreach (@{ $ret[2] }){
276 if (0x0F <= $_ && $_ <= 0x13){ $_ -= 2 } # katakana
277 elsif(0x0D <= $_ && $_ <= 0x0E){ $_ += 5 } # hiragana
280 join "\0\0", map pack('n*', @$_), @ret;
292 $obj->getSortKey($a) cmp $obj->getSortKey($b);
303 sort{ $a->[0] cmp $b->[0] }
304 map [ $obj->getSortKey($_), $_ ], @_;
313 my $a = UNDEFINED + ($code >> 15); # ok
314 my $b = ($code & 0x7FFF) | 0x8000; # ok
315 # my $a = 0xFFC2 + ($code >> 15); # ng
316 # my $b = $code & 0x7FFF | 0x1000; # ng
317 $b ? ([$a,2,1,$code],[$b,0,0,$code]) : [$a,2,1,$code];
321 ## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
326 map hex(), $str =~ /([0-9a-fA-F]+)/g;
330 ## CJK Unified Ideographs
335 return 0x3400 <= $u && $u <= 0x4DB5
336 || 0x4E00 <= $u && $u <= 0x9FA5
337 # || 0x20000 <= $u && $u <= 0x2A6D6;
341 ## CJK Unified Ideographs
346 $u > 0xFFFF ? _derivCE($u) : [$u,0x20,0x02,$u];
355 return 0xAC00 <= $code && $code <= 0xD7A3;
363 Unicode::Collate - use UCA (Unicode Collation Algorithm)
367 use Unicode::Collate;
370 $UCA = Unicode::Collate->new(%tailoring);
373 @sorted = $UCA->sort(@not_sorted);
376 $result = $UCA->cmp($a, $b); # returns 1, 0, or -1.
380 =head2 Constructor and Tailoring
382 $UCA = Unicode::Collate->new(
383 alternate => $alternate,
384 backwards => $levelNumber, # or \@levelNumbers
386 normalization => $normalization_form,
387 ignoreName => qr/$ignoreName/,
388 ignoreChar => qr/$ignoreChar/,
389 katakana_before_hiragana => $bool,
390 level => $collationLevel,
391 overrideCJK => \&overrideCJK,
392 overrideHangul => \&overrideHangul,
393 preprocess => \&preprocess,
394 rearrange => \@charList,
396 undefName => qr/$undefName/,
397 undefChar => qr/$undefChar/,
398 upper_before_lower => $bool,
400 # if %tailoring is false (empty),
401 # $UCA should do the default collation.
407 -- see 3.2.2 Alternate Weighting, UTR #10.
409 alternate => 'shifted', 'blanked', 'non-ignorable', or 'shift-trimmed'.
411 By default (if specification is omitted), 'shifted' is adopted.
415 -- see 3.1.2 French Accents, UTR #10.
417 backwards => $levelNumber or \@levelNumbers
419 Weights in reverse order; ex. level 2 (diacritic ordering) in French.
420 If omitted, forwards at all the levels.
424 -- see 3.1 Linguistic Features; 3.2.1 File Format, UTR #10.
426 Overrides a default order or adds a new element
428 entry => <<'ENTRIES', # use the UCA file format
429 00E6 ; [.0861.0020.0002.00E6] [.08B1.0020.0002.00E6] # ligature <ae> as <a e>
430 0063 0068 ; [.0893.0020.0002.0063] # "ch" in traditional Spanish
431 0043 0068 ; [.0893.0020.0008.0043] # "Ch" in traditional Spanish
438 -- see Completely Ignorable, 3.2.2 Alternate Weighting, UTR #10.
440 Ignores the entry in the table.
441 If an ignored collation element appears in the string to be collated,
442 it is ignored as if the element had been deleted from there.
444 E.g. when 'a' and 'e' are ignored,
445 'element' is equal to 'lament' (or 'lmnt').
449 -- see 4.3 Form a sort key for each string, UTR #10.
451 Set the maximum level.
452 Any higher levels than the specified one are ignored.
454 Level 1: alphabetic ordering
455 Level 2: diacritic ordering
456 Level 3: case ordering
457 Level 4: tie-breaking (e.g. in the case when alternate is 'shifted')
463 -- see 4.1 Normalize each input string, UTR #10.
465 If specified, strings are normalized before preparation sort keys
466 (the normalization is executed after preprocess).
468 As a form name, one of the following names must be used.
470 'C' or 'NFC' for Normalization Form C
471 'D' or 'NFD' for Normalization Form D
472 'KC' or 'NFKC' for Normalization Form KC
473 'KD' or 'NFKD' for Normalization Form KD
475 If omitted, the string is put into Normalization Form D.
477 If undefined explicitly (as C<normalization =E<gt> undef>),
478 any normalization is not carried out (this may make tailoring easier
479 if any normalization is not desired).
487 -- see 7.1 Derived Collation Elements, UTR #10.
489 By default, mapping of CJK Unified Ideographs
490 uses the Unicode codepoint order
491 and Hangul Syllables are decomposed into Hangul Jamo.
493 The mapping of CJK Unified Ideographs
494 or Hangul Syllables may be overrided.
496 ex. CJK Unified Ideographs in the JIS codepoint order.
499 my $u = shift; # get unicode codepoint
500 my $b = pack('n', $u); # to UTF-16BE
501 my $s = your_unicode_to_sjis_converter($b); # convert
502 my $n = unpack('n', $s); # convert sjis to short
503 [ $n, 1, 1 ]; # return collation element
506 If you want to override the mapping of Hangul Syllables,
507 the Normalization Forms D and KD are not appropriate
508 (they will be decomposed before overriding).
512 -- see 5.1 Preprocessing, UTR #10.
514 If specified, the coderef is used to preprocess
515 before the formation of sort keys.
517 ex. dropping English articles, such as "a" or "the".
518 Then, "the pen" is before "a pencil".
522 $str =~ s/\b(?:an?|the)\s+//g;
528 -- see 3.1.3 Rearrangement, UTR #10.
530 Characters that are not coded in logical order and to be rearranged.
533 rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
537 -- see 3.2 Default Unicode Collation Element Table, UTR #10.
539 You can use another element table if desired.
540 The table file must be in your C<lib/Unicode/Collate> directory.
542 By default, the file C<lib/Unicode/Collate/allkeys.txt> is used.
548 -- see 6.3.4 Reducing the Repertoire, UTR #10.
550 Undefines the collation element as if it were unassigned in the table.
551 This reduces the size of the table.
552 If an unassigned character appears in the string to be collated,
553 the sort key is made from its codepoint
554 as a single-character collation element,
555 as it is greater than any other assigned collation elements
556 (in the codepoint order among the unassigned characters).
557 But, it'd be better to ignore characters
558 unfamiliar to you and maybe never used.
560 =item katakana_before_hiragana
562 =item upper_before_lower
564 -- see 6.6 Case Comparisons; 7.3.1 Tertiary Weight Table, UTR #10.
566 By default, lowercase is before uppercase
567 and hiragana is before katakana.
569 If the parameter is true, this is reversed.
577 =item C<@sorted = $UCA-E<gt>sort(@not_sorted)>
579 Sorts a list of strings.
581 =item C<$result = $UCA-E<gt>cmp($a, $b)>
583 Returns 1 (when C<$a> is greater than C<$b>)
584 or 0 (when C<$a> is equal to C<$b>)
585 or -1 (when C<$a> is lesser than C<$b>).
587 =item C<$sortKey = $UCA-E<gt>getSortKey($string)>
589 -- see 4.3 Form a sort key for each string, UTR #10.
593 You compare the sort keys using a binary comparison
594 and get the result of the comparison of the strings using UCA.
596 $UCA->getSortKey($a) cmp $UCA->getSortKey($b)
610 Use of the C<normalization> parameter requires
611 the B<Unicode::Normalize> module.
613 If you need not it (e.g. in the case when you need not
614 handle any combining characters),
615 assign C<normalization =E<gt> undef> explicitly.
619 SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
621 http://homepage1.nifty.com/nomenclator/perl/
623 Copyright(C) 2001, SADAHIRO Tomoyuki. Japan. All rights reserved.
625 This program is free software; you can redistribute it and/or
626 modify it under the same terms as Perl itself.
632 =item L<Lingua::KO::Hangul::Util>
634 utility functions for Hangul Syllables
636 =item L<Unicode::Normalize>
638 normalized forms of Unicode text
640 =item Unicode Collation Algorithm - Unicode TR #10
642 http://www.unicode.org/unicode/reports/tr10/