[Unicode::Collate] UCA Version number
[p5sagit/p5-mst-13.2.git] / lib / Unicode / Collate.pm
CommitLineData
45394607 1package Unicode::Collate;
2
4a2e806c 3BEGIN {
4 if (ord("A") == 193) {
5 die "Unicode::Collate not ported to EBCDIC\n";
6 }
7}
8
45394607 9use 5.006;
10use strict;
11use warnings;
12use Carp;
5398038e 13
45394607 14require Exporter;
15
a7fbee98 16our $VERSION = '0.11';
45394607 17our $PACKAGE = __PACKAGE__;
18
19our @ISA = qw(Exporter);
20
21our %EXPORT_TAGS = ();
22our @EXPORT_OK = ();
23our @EXPORT = ();
24
25(our $Path = $INC{'Unicode/Collate.pm'}) =~ s/\.pm$//;
26our $KeyFile = "allkeys.txt";
27
3164dd77 28our $UNICODE_VERSION;
29
30{
31 my($f, $fh);
32 foreach my $d (@INC) {
33 use File::Spec;
34 $f = File::Spec->catfile($d, "unicore", "version");
35 if (open($fh, $f)) {
36 chomp($UNICODE_VERSION = <$fh>);
37 close $fh;
38 last;
39 }
40
41 # XXX, Perl 5.6.1
42 $f = File::Spec->catfile($d, "unicode", "Unicode.301");
43 if (open($fh, $f)) {
44 $UNICODE_VERSION = "3.0.1";
45 close $fh;
46 last;
47 }
48 }
49}
50
5398038e 51our $getCombinClass; # coderef for combining class from Unicode::Normalize
45394607 52
53use constant Min2 => 0x20; # minimum weight at level 2
54use constant Min3 => 0x02; # minimum weight at level 3
a7fbee98 55use constant UNDEFINED => 0xFF80; # special value for undefined CE's
56
57our $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
45394607 58
3164dd77 59sub UCA_Version { "8.0" }
60
61sub Base_Unicode_Version { $UNICODE_VERSION }
62
45394607 63##
64## constructor
65##
66sub new
67{
a7fbee98 68 my $class = shift;
69 my $self = bless { @_ }, $class;
70
71 # alternate lowercased
72 $self->{alternate} =
73 ! exists $self->{alternate} ? 'shifted' : lc($self->{alternate});
74
75 croak "$PACKAGE unknown alternate tag name: $self->{alternate}"
76 unless $self->{alternate} eq 'blanked'
77 || $self->{alternate} eq 'non-ignorable'
78 || $self->{alternate} eq 'shifted'
79 || $self->{alternate} eq 'shift-trimmed';
80
81 # collation level
82 $self->{level} ||= 4;
83
84 croak "Illegal level lower than 1 (passed $self->{level})."
85 if $self->{level} < 1;
86 croak "A level higher than 4 (passed $self->{level}) is not supported."
87 if 4 < $self->{level};
88
89 # overrideHangul and -CJK
90 # If true: CODEREF used; '': default; undef: derived elements
91 $self->{overrideHangul} = ''
92 if ! exists $self->{overrideHangul};
93 $self->{overrideCJK} = ''
94 if ! exists $self->{overrideCJK};
95
96 # normalization form
97 $self->{normalization} = 'D'
98 if ! exists $self->{normalization};
99 $self->{UNF} = undef;
100
101 if (defined $self->{normalization}) {
102 eval { require Unicode::Normalize };
103 croak "Unicode/Normalize.pm is required to normalize strings: $@"
104 if $@;
105
106 Unicode::Normalize->import();
107 $getCombinClass = \&Unicode::Normalize::getCombinClass
108 if ! $getCombinClass;
109
110 $self->{UNF} =
111 $self->{normalization} =~ /^(?:NF)?C$/ ? \&NFC :
112 $self->{normalization} =~ /^(?:NF)?D$/ ? \&NFD :
113 $self->{normalization} =~ /^(?:NF)?KC$/ ? \&NFKC :
114 $self->{normalization} =~ /^(?:NF)?KD$/ ? \&NFKD :
115 croak "$PACKAGE unknown normalization form name: "
116 . $self->{normalization};
117 }
45394607 118
a7fbee98 119 # Open a table file.
120 # If undef is passed explicitly, no file is read.
121 $self->{table} = $KeyFile
122 if ! exists $self->{table};
123 $self->read_table
124 if defined $self->{table};
905aa9f0 125
a7fbee98 126 if ($self->{entry}) {
127 $self->parseEntry($_) foreach split /\n/, $self->{entry};
128 }
905aa9f0 129
a7fbee98 130 # backwards
131 $self->{backwards} ||= [ ];
132 $self->{backwards} = [ $self->{backwards} ]
133 if ! ref $self->{backwards};
905aa9f0 134
a7fbee98 135 # rearrange
136 $self->{rearrange} = $DefaultRearrange
137 if ! exists $self->{rearrange};
138 $self->{rearrange} = []
139 if ! defined $self->{rearrange};
140 croak "$PACKAGE: A list for rearrangement must be store in an ARRAYREF"
141 if ! ref $self->{rearrange};
142
143 # keys of $self->{rearrangeHash} are $self->{rearrange}.
144 $self->{rearrangeHash} = undef;
145
146 if (@{ $self->{rearrange} }) {
147 @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
148 }
149
150 return $self;
151}
905aa9f0 152
153sub read_table {
a7fbee98 154 my $self = shift;
155 my $file = $self->{table} ne '' ? $self->{table} : $KeyFile;
156
157 open my $fk, "<$Path/$file"
158 or croak "File does not exist at $Path/$file";
159
160 while (<$fk>) {
161 next if /^\s*#/;
162 if (/^\s*\@/) {
163 if (/^\@version\s*(\S*)/) {
164 $self->{version} ||= $1;
165 }
166 elsif (/^\@alternate\s+(.*)/) {
167 $self->{alternate} ||= $1;
168 }
169 elsif (/^\@backwards\s+(.*)/) {
170 push @{ $self->{backwards} }, $1;
171 }
172 elsif (/^\@rearrange\s+(.*)/) {
173 push @{ $self->{rearrange} }, _getHexArray($1);
174 }
175 next;
176 }
177 $self->parseEntry($_);
45394607 178 }
a7fbee98 179 close $fk;
45394607 180}
181
905aa9f0 182
45394607 183##
184## get $line, parse it, and write an entry in $self
185##
186sub parseEntry
187{
a7fbee98 188 my $self = shift;
189 my $line = shift;
190 my($name, $ele, @key);
191
192 return if $line !~ /^\s*[0-9A-Fa-f]/;
193
194 # removes comment and gets name
195 $name = $1
196 if $line =~ s/[#%]\s*(.*)//;
197 return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
198
199 # gets element
200 my($e, $k) = split /;/, $line;
201 croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
202 if ! $k;
203
204 my @e = _getHexArray($e);
205 $ele = pack('U*', @e);
206 return if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
207
208 # get sort key
209 if (defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/ ||
210 defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/)
211 {
212 $self->{entries}{$ele} = $self->{ignored}{$ele} = 1;
45394607 213 }
a7fbee98 214 else {
215 my $combining = 1; # primary = 0, secondary != 0;
216
217 foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
218 my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
219 push @key, $self->altCE($var, _getHexArray($arr));
220 $combining = 0 unless $key[-1][0] == 0 && $key[-1][1] != 0;
221 }
222 $self->{entries}{$ele} = \@key;
223 $self->{combining}{$ele} = 1 if $combining;
224 }
225 $self->{maxlength}{ord $ele} = scalar @e if @e > 1;
45394607 226}
227
228
229##
d16e9e3d 230## arrayref CE = altCE(bool variable?, list[num] weights)
45394607 231##
d16e9e3d 232sub altCE
45394607 233{
a7fbee98 234 my $self = shift;
235 my $var = shift;
236 my @c = @_;
237
238 $self->{alternate} eq 'blanked' ?
239 $var ? [0,0,0,$c[3]] : \@c :
240 $self->{alternate} eq 'non-ignorable' ?
241 \@c :
242 $self->{alternate} eq 'shifted' ?
243 $var ? [0,0,0,$c[0] ] : [ @c[0..2], $c[0]+$c[1]+$c[2] ? 0xFFFF : 0 ] :
244 $self->{alternate} eq 'shift-trimmed' ?
245 $var ? [0,0,0,$c[0] ] : [ @c[0..2], 0 ] :
246 croak "$PACKAGE unknown alternate name: $self->{alternate}";
45394607 247}
248
249##
d16e9e3d 250## string hex_sortkey = splitCE(string arg)
45394607 251##
252sub viewSortKey
253{
a7fbee98 254 my $self = shift;
255 my $key = $self->getSortKey(@_);
256 my $view = join " ", map sprintf("%04X", $_), unpack 'n*', $key;
257 $view =~ s/ ?0000 ?/|/g;
258 return "[$view]";
45394607 259}
260
d16e9e3d 261
45394607 262##
d16e9e3d 263## list[strings] elements = splitCE(string arg)
45394607 264##
d16e9e3d 265sub splitCE
45394607 266{
a7fbee98 267 my $self = shift;
268 my $code = $self->{preprocess};
269 my $norm = $self->{UNF};
270 my $ent = $self->{entries};
271 my $max = $self->{maxlength};
272 my $reH = $self->{rearrangeHash};
273
274 my $str = ref $code ? &$code(shift) : shift;
275 $str = &$norm($str) if ref $norm;
276
277 my @src = unpack('U*', $str);
278 my @buf;
279
280 # rearrangement
281 if ($reH) {
282 for (my $i = 0; $i < @src; $i++) {
283 if (exists $reH->{ $src[$i] } && $i + 1 < @src) {
284 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]);
285 $i++;
286 }
287 }
45394607 288 }
45394607 289
a7fbee98 290 for (my $i = 0; $i < @src; $i++) {
291 my $ch;
292 my $u = $src[$i];
293
294 # non-characters
295 next unless defined $u;
296 next if $u < 0 || 0x10FFFF < $u # out of range
297 || (0xD800 <= $u && $u <= 0xDFFF); # unpaired surrogates
298 my $four = $u & 0xFFFF;
299 next if $four == 0xFFFE || $four == 0xFFFF;
300
301 if ($max->{$u}) { # contract
302 for (my $j = $max->{$u}; $j >= 1; $j--) {
303 next unless $i+$j-1 < @src;
304 $ch = pack 'U*', @src[$i .. $i+$j-1];
305 $i += $j-1, last if $ent->{$ch};
306 }
307 } else {
308 $ch = pack('U', $u);
309 }
310
311 # with Combining Char (UTS#10, 4.2.1), here requires Unicode::Normalize.
312 if ($getCombinClass && defined $ch) {
313 for (my $j = $i+1; $j < @src; $j++) {
314 next unless defined $src[$j];
315 last unless $getCombinClass->( $src[$j] );
316 my $comb = pack 'U', $src[$j];
317 next if ! $ent->{ $ch.$comb };
318 $ch .= $comb;
319 $src[$j] = undef;
320 }
321 }
322 push @buf, $ch;
45394607 323 }
a7fbee98 324 wantarray ? @buf : \@buf;
d16e9e3d 325}
45394607 326
d16e9e3d 327
328##
329## list[arrayrefs] weight = getWt(string element)
330##
331sub getWt
332{
a7fbee98 333 my $self = shift;
334 my $ch = shift;
335 my $ent = $self->{entries};
336 my $ign = $self->{ignored};
337 my $cjk = $self->{overrideCJK};
338 my $hang = $self->{overrideHangul};
339
340 return if !defined $ch || $ign->{$ch}; # ignored
341 return @{ $ent->{$ch} } if $ent->{$ch};
342 my $u = unpack('U', $ch);
343
344 if (0xAC00 <= $u && $u <= 0xD7A3) { # is_Hangul
345 return $hang
346 ? &$hang($u)
347 : defined $hang
348 ? map({
349 my $v = $_;
350 my $ar = $ent->{pack('U', $v)};
351 $ar ? @$ar : map($self->altCE(0,@$_), _derivCE($v));
352 } _decompHangul($u))
353 : map($self->altCE(0,@$_), _derivCE($u));
354 }
355 elsif (0x3400 <= $u && $u <= 0x4DB5 ||
356 0x4E00 <= $u && $u <= 0x9FA5 ||
357 0x20000 <= $u && $u <= 0x2A6D6) { # is_CJK
358 return $cjk
359 ? &$cjk($u)
360 : defined $cjk && $u <= 0xFFFF
361 ? $self->altCE(0, ($u, 0x20, 0x02, $u))
362 : map($self->altCE(0,@$_), _derivCE($u));
363 }
364 else {
365 return map($self->altCE(0,@$_), _derivCE($u));
366 }
d16e9e3d 367}
368
369##
370## int = index(string, substring)
371##
372sub index
373{
a7fbee98 374 my $self = shift;
375 my $lev = $self->{level};
376 my $comb = $self->{combining};
377 my $str = $self->splitCE(shift);
378 my $sub = $self->splitCE(shift);
379
380 return wantarray ? (0,0) : 0 if ! @$sub;
381 return wantarray ? () : -1 if ! @$str;
382
383 my @subWt = grep _ignorableAtLevel($_,$lev),
384 map $self->getWt($_), @$sub;
385
386 my(@strWt,@strPt);
387 my $count = 0;
388 for (my $i = 0; $i < @$str; $i++) {
389 my $go_ahead = 0;
390
391 my @tmp = grep _ignorableAtLevel($_,$lev), $self->getWt($str->[$i]);
392 $go_ahead += length $str->[$i];
393
394 # /*XXX*/ still broken.
395 # index("e\x{300}", "e") should be 'no match' at level 2 or higher
396 # as "e\x{300}" is a *single* grapheme cluster and not equal to "e".
397
398 # go ahead as far as we find a combining character;
399 while ($i + 1 < @$str &&
400 (! defined $str->[$i+1] || $comb->{ $str->[$i+1] }) ) {
401 $i++;
402 $go_ahead += length $str->[$i];
403 next if ! defined $str->[$i];
404 push @tmp,
405 grep _ignorableAtLevel($_,$lev), $self->getWt($str->[$i]);
406 }
407
408 push @strWt, @tmp;
409 push @strPt, ($count) x @tmp;
410 $count += $go_ahead;
411
412 while (@strWt >= @subWt) {
413 if (_eqArray(\@strWt, \@subWt, $lev)) {
414 my $pos = $strPt[0];
415 return wantarray ? ($pos, $count-$pos) : $pos;
416 }
417 shift @strWt;
418 shift @strPt;
419 }
d16e9e3d 420 }
a7fbee98 421 return wantarray ? () : -1;
d16e9e3d 422}
423
424##
425## bool _eqArray(arrayref, arrayref, level)
426##
427sub _eqArray($$$)
428{
a7fbee98 429 my $a = shift; # length $a >= length $b;
430 my $b = shift;
431 my $lev = shift;
432 for my $v (0..$lev-1) {
433 for my $c (0..@$b-1){
434 return if $a->[$c][$v] != $b->[$c][$v];
435 }
d16e9e3d 436 }
a7fbee98 437 return 1;
d16e9e3d 438}
439
440
441##
442## bool _ignorableAtLevel(CE, level)
443##
444sub _ignorableAtLevel($$)
445{
a7fbee98 446 my $ce = shift;
447 return unless defined $ce;
448 my $lv = shift;
449 return ! grep { ! $ce->[$_] } 0..$lv-1;
d16e9e3d 450}
451
452
453##
454## string sortkey = getSortKey(string arg)
455##
456sub getSortKey
457{
a7fbee98 458 my $self = shift;
459 my $lev = $self->{level};
460 my $rCE = $self->splitCE(shift); # get an arrayref
461
462 # weight arrays
463 my @buf = grep defined(), map $self->getWt($_), @$rCE;
464
465 # make sort key
466 my @ret = ([],[],[],[]);
467 foreach my $v (0..$lev-1) {
468 foreach my $b (@buf) {
469 push @{ $ret[$v] }, $b->[$v] if $b->[$v];
470 }
471 }
472 foreach (@{ $self->{backwards} }) {
473 my $v = $_ - 1;
474 @{ $ret[$v] } = reverse @{ $ret[$v] };
45394607 475 }
45394607 476
a7fbee98 477 # modification of tertiary weights
478 if ($self->{upper_before_lower}) {
479 foreach (@{ $ret[2] }) {
480 if (0x8 <= $_ && $_ <= 0xC) { $_ -= 6 } # lower
481 elsif (0x2 <= $_ && $_ <= 0x6) { $_ += 6 } # upper
482 elsif ($_ == 0x1C) { $_ += 1 } # square upper
483 elsif ($_ == 0x1D) { $_ -= 1 } # square lower
484 }
45394607 485 }
a7fbee98 486 if ($self->{katakana_before_hiragana}) {
487 foreach (@{ $ret[2] }) {
488 if (0x0F <= $_ && $_ <= 0x13) { $_ -= 2 } # katakana
489 elsif (0x0D <= $_ && $_ <= 0x0E) { $_ += 5 } # hiragana
490 }
45394607 491 }
a7fbee98 492 join "\0\0", map pack('n*', @$_), @ret;
45394607 493}
494
495
496##
d16e9e3d 497## int compare = cmp(string a, string b)
45394607 498##
5398038e 499sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
500sub eq { $_[0]->getSortKey($_[1]) eq $_[0]->getSortKey($_[2]) }
501sub ne { $_[0]->getSortKey($_[1]) ne $_[0]->getSortKey($_[2]) }
502sub lt { $_[0]->getSortKey($_[1]) lt $_[0]->getSortKey($_[2]) }
503sub le { $_[0]->getSortKey($_[1]) le $_[0]->getSortKey($_[2]) }
504sub gt { $_[0]->getSortKey($_[1]) gt $_[0]->getSortKey($_[2]) }
505sub ge { $_[0]->getSortKey($_[1]) ge $_[0]->getSortKey($_[2]) }
45394607 506
507##
d16e9e3d 508## list[strings] sorted = sort(list[strings] arg)
45394607 509##
a7fbee98 510sub sort {
511 my $obj = shift;
512 return
513 map { $_->[1] }
514 sort{ $a->[0] cmp $b->[0] }
515 map [ $obj->getSortKey($_), $_ ], @_;
45394607 516}
517
518##
d16e9e3d 519## list[arrayrefs] CE = _derivCE(int codepoint)
45394607 520##
a7fbee98 521sub _derivCE {
522 my $code = shift;
523 my $a = UNDEFINED + ($code >> 15); # ok
524 my $b = ($code & 0x7FFF) | 0x8000; # ok
525# my $a = 0xFFC2 + ($code >> 15); # ng
526# my $b = $code & 0x7FFF | 0x1000; # ng
527 $b ? ([$a,2,1,$code],[$b,0,0,$code]) : [$a,2,1,$code];
45394607 528}
529
530##
531## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
532##
a7fbee98 533sub _getHexArray { map hex, $_[0] =~ /([0-9a-fA-F]+)/g }
5398038e 534
a7fbee98 535#
536# $code must be in Hangul syllable.
537# Check it before you enter here.
538#
5398038e 539sub _decompHangul {
540 my $code = shift;
5398038e 541 my $SIndex = $code - 0xAC00;
542 my $LIndex = int( $SIndex / 588);
543 my $VIndex = int(($SIndex % 588) / 28);
544 my $TIndex = $SIndex % 28;
545 return (
a7fbee98 546 0x1100 + $LIndex,
547 0x1161 + $VIndex,
548 $TIndex ? (0x11A7 + $TIndex) : (),
5398038e 549 );
45394607 550}
551
5521;
553__END__
554
555=head1 NAME
556
a7fbee98 557Unicode::Collate - Unicode Collation Algorithm
45394607 558
559=head1 SYNOPSIS
560
561 use Unicode::Collate;
562
563 #construct
5398038e 564 $Collator = Unicode::Collate->new(%tailoring);
45394607 565
566 #sort
5398038e 567 @sorted = $Collator->sort(@not_sorted);
45394607 568
569 #compare
a7fbee98 570 $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
45394607 571
572=head1 DESCRIPTION
573
574=head2 Constructor and Tailoring
575
d16e9e3d 576The C<new> method returns a collator object.
577
5398038e 578 $Collator = Unicode::Collate->new(
45394607 579 alternate => $alternate,
580 backwards => $levelNumber, # or \@levelNumbers
581 entry => $element,
582 normalization => $normalization_form,
583 ignoreName => qr/$ignoreName/,
584 ignoreChar => qr/$ignoreChar/,
585 katakana_before_hiragana => $bool,
586 level => $collationLevel,
587 overrideCJK => \&overrideCJK,
588 overrideHangul => \&overrideHangul,
589 preprocess => \&preprocess,
590 rearrange => \@charList,
591 table => $filename,
592 undefName => qr/$undefName/,
593 undefChar => qr/$undefChar/,
594 upper_before_lower => $bool,
595 );
a7fbee98 596 # if %tailoring is false (i.e. empty),
5398038e 597 # $Collator should do the default collation.
45394607 598
599=over 4
600
601=item alternate
602
603-- see 3.2.2 Alternate Weighting, UTR #10.
604
a7fbee98 605This key allows to alternate weighting for variable collation elements,
606which are marked with an ASTERISK in the table
607(NOTE: Many punction marks and symbols are variable in F<allkeys.txt>).
608
609 alternate => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
45394607 610
a7fbee98 611These names are case-insensitive.
45394607 612By default (if specification is omitted), 'shifted' is adopted.
613
a7fbee98 614 'Blanked' Variable elements are ignorable at levels 1 through 3;
615 considered at the 4th level.
616
617 'Non-ignorable' Variable elements are not reset to ignorable.
618
619 'Shifted' Variable elements are ignorable at levels 1 through 3
620 their level 4 weight is replaced by the old level 1 weight.
621 Level 4 weight for Non-Variable elements is 0xFFFF.
622
623 'Shift-Trimmed' Same as 'shifted', but all FFFF's at the 4th level
624 are trimmed.
625
45394607 626=item backwards
627
628-- see 3.1.2 French Accents, UTR #10.
629
630 backwards => $levelNumber or \@levelNumbers
631
632Weights in reverse order; ex. level 2 (diacritic ordering) in French.
633If omitted, forwards at all the levels.
634
635=item entry
636
637-- see 3.1 Linguistic Features; 3.2.1 File Format, UTR #10.
638
a7fbee98 639Overrides a default order or defines additional collation elements
45394607 640
641 entry => <<'ENTRIES', # use the UCA file format
a7fbee98 64200E6 ; [.0861.0020.0002.00E6] [.08B1.0020.0002.00E6] # ligature <ae> as <a><e>
45394607 6430063 0068 ; [.0893.0020.0002.0063] # "ch" in traditional Spanish
6440043 0068 ; [.0893.0020.0008.0043] # "Ch" in traditional Spanish
645ENTRIES
646
647=item ignoreName
648
649=item ignoreChar
650
651-- see Completely Ignorable, 3.2.2 Alternate Weighting, UTR #10.
652
a7fbee98 653Makes the entry in the table ignorable.
654If a collation element is ignorable,
45394607 655it is ignored as if the element had been deleted from there.
656
a7fbee98 657E.g. when 'a' and 'e' are ignorable,
45394607 658'element' is equal to 'lament' (or 'lmnt').
659
660=item level
661
662-- see 4.3 Form a sort key for each string, UTR #10.
663
664Set the maximum level.
665Any higher levels than the specified one are ignored.
666
667 Level 1: alphabetic ordering
668 Level 2: diacritic ordering
669 Level 3: case ordering
670 Level 4: tie-breaking (e.g. in the case when alternate is 'shifted')
671
672 ex.level => 2,
673
a7fbee98 674If omitted, the maximum is the 4th.
675
45394607 676=item normalization
677
678-- see 4.1 Normalize each input string, UTR #10.
679
905aa9f0 680If specified, strings are normalized before preparation of sort keys
45394607 681(the normalization is executed after preprocess).
682
683As a form name, one of the following names must be used.
684
685 'C' or 'NFC' for Normalization Form C
686 'D' or 'NFD' for Normalization Form D
687 'KC' or 'NFKC' for Normalization Form KC
688 'KD' or 'NFKD' for Normalization Form KD
689
690If omitted, the string is put into Normalization Form D.
691
a7fbee98 692If C<undef> is passed explicitly as the value for this key,
45394607 693any normalization is not carried out (this may make tailoring easier
694if any normalization is not desired).
695
696see B<CAVEAT>.
697
698=item overrideCJK
699
45394607 700-- see 7.1 Derived Collation Elements, UTR #10.
701
702By default, mapping of CJK Unified Ideographs
a7fbee98 703uses the Unicode codepoint order.
704But the mapping of CJK Unified Ideographs may be overrided.
45394607 705
a7fbee98 706ex. CJK Unified Ideographs in the JIS code point order.
45394607 707
708 overrideCJK => sub {
a7fbee98 709 my $u = shift; # get a Unicode codepoint
710 my $b = pack('n', $u); # to UTF-16BE
711 my $s = your_unicode_to_sjis_converter($b); # convert
712 my $n = unpack('n', $s); # convert sjis to short
713 [ $n, 0x20, 0x2, $u ]; # return the collation element
45394607 714 },
715
a7fbee98 716ex. ignores all CJK Unified Ideographs.
717
718 overrideCJK => sub {()}, # CODEREF returning empty list
719
720 # where ->eq("Pe\x{4E00}rl", "Perl") is true
721 # as U+4E00 is a CJK Unified Ideograph and to be ignorable.
722
723If C<undef> is passed explicitly as the value for this key,
724weights for CJK Unified Ideographs are treated as undefined.
725But assignment of weight for CJK Unified Ideographs
726in table or L<entry> is still valid.
727
728=item overrideHangul
729
730-- see 7.1 Derived Collation Elements, UTR #10.
731
732By default, Hangul Syllables are decomposed into Hangul Jamo.
733But the mapping of Hangul Syllables may be overrided.
734
735This tag works like L<overrideCJK>, so see there for examples.
736
45394607 737If you want to override the mapping of Hangul Syllables,
738the Normalization Forms D and KD are not appropriate
739(they will be decomposed before overriding).
740
a7fbee98 741If C<undef> is passed explicitly as the value for this key,
742weight for Hangul Syllables is treated as undefined
743without decomposition into Hangul Jamo.
744But definition of weight for Hangul Syllables
745in table or L<entry> is still valid.
746
45394607 747=item preprocess
748
749-- see 5.1 Preprocessing, UTR #10.
750
751If specified, the coderef is used to preprocess
752before the formation of sort keys.
753
a7fbee98 754ex. dropping English articles, such as "a" or "the".
45394607 755Then, "the pen" is before "a pencil".
756
757 preprocess => sub {
758 my $str = shift;
a7fbee98 759 $str =~ s/\b(?:an?|the)\s+//gi;
45394607 760 $str;
761 },
762
763=item rearrange
764
765-- see 3.1.3 Rearrangement, UTR #10.
766
767Characters that are not coded in logical order and to be rearranged.
a7fbee98 768By default,
45394607 769
770 rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
771
a7fbee98 772If you want to disallow any rearrangement,
773pass C<undef> or C<[]> (a reference to an empty list)
774as the value for this key.
775
45394607 776=item table
777
778-- see 3.2 Default Unicode Collation Element Table, UTR #10.
779
780You can use another element table if desired.
781The table file must be in your C<lib/Unicode/Collate> directory.
782
783By default, the file C<lib/Unicode/Collate/allkeys.txt> is used.
784
a7fbee98 785If C<undef> is passed explicitly as the value for this key,
786no file is read (but you can define collation elements via L<entry>).
787
788A typical way to define a collation element table
789without any file of table:
790
791 $onlyABC = Unicode::Collate->new(
792 table => undef,
793 entry => << 'ENTRIES',
7940061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A
7950041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A
7960062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B
7970042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B
7980063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C
7990043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C
800ENTRIES
801 );
905aa9f0 802
45394607 803=item undefName
804
805=item undefChar
806
807-- see 6.3.4 Reducing the Repertoire, UTR #10.
808
809Undefines the collation element as if it were unassigned in the table.
810This reduces the size of the table.
811If an unassigned character appears in the string to be collated,
812the sort key is made from its codepoint
813as a single-character collation element,
814as it is greater than any other assigned collation elements
815(in the codepoint order among the unassigned characters).
816But, it'd be better to ignore characters
817unfamiliar to you and maybe never used.
818
819=item katakana_before_hiragana
820
821=item upper_before_lower
822
823-- see 6.6 Case Comparisons; 7.3.1 Tertiary Weight Table, UTR #10.
824
825By default, lowercase is before uppercase
826and hiragana is before katakana.
827
a7fbee98 828If the tag is made true, this is reversed.
829
830B<NOTE>: These tags simplemindedly assume
831any lowercase/uppercase or hiragana/katakana distinctions
832should occur in level 3, and their weights at level 3
833should be same as those mentioned in 7.3.1, UTR #10.
834If you define your collation elements which violates this,
835these tags doesn't work validly.
45394607 836
837=back
838
3164dd77 839=head2 Methods for Collation
45394607 840
841=over 4
842
5398038e 843=item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
45394607 844
845Sorts a list of strings.
846
5398038e 847=item C<$result = $Collator-E<gt>cmp($a, $b)>
45394607 848
849Returns 1 (when C<$a> is greater than C<$b>)
850or 0 (when C<$a> is equal to C<$b>)
851or -1 (when C<$a> is lesser than C<$b>).
852
5398038e 853=item C<$result = $Collator-E<gt>eq($a, $b)>
854
855=item C<$result = $Collator-E<gt>ne($a, $b)>
856
857=item C<$result = $Collator-E<gt>lt($a, $b)>
858
859=item C<$result = $Collator-E<gt>le($a, $b)>
860
861=item C<$result = $Collator-E<gt>gt($a, $b)>
862
863=item C<$result = $Collator-E<gt>ge($a, $b)>
864
a7fbee98 865They works like the same name operators as theirs.
5398038e 866
867 eq : whether $a is equal to $b.
868 ne : whether $a is not equal to $b.
869 lt : whether $a is lesser than $b.
870 le : whether $a is lesser than $b or equal to $b.
871 gt : whether $a is greater than $b.
872 ge : whether $a is greater than $b or equal to $b.
873
874=item C<$sortKey = $Collator-E<gt>getSortKey($string)>
45394607 875
876-- see 4.3 Form a sort key for each string, UTR #10.
877
878Returns a sort key.
879
880You compare the sort keys using a binary comparison
881and get the result of the comparison of the strings using UCA.
882
5398038e 883 $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
45394607 884
885 is equivalent to
886
5398038e 887 $Collator->cmp($a, $b)
45394607 888
a7fbee98 889=item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)>
890
891Returns a string formalized to display a sort key.
892Weights are enclosed with C<'['> and C<']'>
893and level boundaries are denoted by C<'|'>.
894
895 use Unicode::Collate;
896 my $c = Unicode::Collate->new();
897 print $c->viewSortKey("Perl"),"\n";
898
899 # output:
900 # [09B3 08B1 09CB 094F|0020 0020 0020 0020|0008 0002 0002 0002|FFFF FFFF FFFF FFFF]
901 # Level 1 Level 2 Level 3 Level 4
902
5398038e 903=item C<$position = $Collator-E<gt>index($string, $substring)>
d16e9e3d 904
5398038e 905=item C<($position, $length) = $Collator-E<gt>index($string, $substring)>
d16e9e3d 906
907-- see 6.8 Searching, UTR #10.
908
909If C<$substring> matches a part of C<$string>, returns
910the position of the first occurrence of the matching part in scalar context;
911in list context, returns a two-element list of
912the position and the length of the matching part.
913
914B<Notice> that the length of the matching part may differ from
915the length of C<$substring>.
916
917B<Note> that the position and the length are counted on the string
918after the process of preprocess, normalization, and rearrangement.
919Therefore, in case the specified string is not binary equal to
920the preprocessed/normalized/rearranged string, the position and the length
a7fbee98 921may differ form those on the specified string. But it is guaranteed
d16e9e3d 922that, if matched, it returns a non-negative value as C<$position>.
923
924If C<$substring> does not match any part of C<$string>,
925returns C<-1> in scalar context and
926an empty list in list context.
927
928e.g. you say
929
5398038e 930 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
d16e9e3d 931 my $str = "Ich mu\x{00DF} studieren.";
932 my $sub = "m\x{00FC}ss";
933 my $match;
a7fbee98 934 if (my($pos,$len) = $Collator->index($str, $sub)) {
5398038e 935 $match = substr($str, $pos, $len);
d16e9e3d 936 }
937
938and get C<"mu\x{00DF}"> in C<$match> since C<"mu>E<223>C<">
939is primary equal to C<"m>E<252>C<ss">.
940
45394607 941=back
942
3164dd77 943=head2 Other Methods
944
945=over 4
946
947=item UCA_Version
948
949Returns the version number of Unicode Technical Standard 10
950this module consults.
951
952=item Base_Unicode_Version
953
954Returns the version number of the Unicode Standard
955this module is based on.
956
957=back
958
45394607 959=head2 EXPORT
960
961None by default.
962
4a2e806c 963=head2 TODO
964
965Unicode::Collate has not been ported to EBCDIC. The code mostly would
966work just fine but a decision needs to be made: how the module should
967work in EBCDIC? Should the low 256 characters be understood as
968Unicode or as EBCDIC code points? Should one be chosen or should
969there be a way to do either? Or should such translation be left
970outside the module for the user to do, for example by using
971Encode::from_to()?
a7fbee98 972(or utf8::unicode_to_native()/utf8::native_to_unicode()?)
4a2e806c 973
45394607 974=head2 CAVEAT
975
976Use of the C<normalization> parameter requires
977the B<Unicode::Normalize> module.
978
5398038e 979If you need not it (say, in the case when you need not
45394607 980handle any combining characters),
981assign C<normalization =E<gt> undef> explicitly.
982
5398038e 983-- see 6.5 Avoiding Normalization, UTR #10.
984
a7fbee98 985=head2 BUGS
986
987C<index()> is an experimental method and
988its return value may be unreliable.
989The correct implementation for C<index()> must be based
990on Locale-Sensitive Support: Level 3 in UTR #18,
991F<Unicode Regular Expression Guidelines>.
992
993See also 4.2 Locale-Dependent Graphemes in UTR #18.
994
45394607 995=head1 AUTHOR
996
997SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
998
999 http://homepage1.nifty.com/nomenclator/perl/
1000
a7fbee98 1001 Copyright(C) 2001-2002, SADAHIRO Tomoyuki. Japan. All rights reserved.
45394607 1002
a7fbee98 1003 This library is free software; you can redistribute it
1004 and/or modify it under the same terms as Perl itself.
45394607 1005
1006=head1 SEE ALSO
1007
1008=over 4
1009
a7fbee98 1010=item http://www.unicode.org/unicode/reports/tr10/
45394607 1011
a7fbee98 1012Unicode Collation Algorithm - UTR #10
45394607 1013
a7fbee98 1014=item http://www.unicode.org/unicode/reports/tr10/allkeys.txt
1015
1016The Default Unicode Collation Element Table
45394607 1017
a7fbee98 1018=item http://www.unicode.org/unicode/reports/tr15/
1019
1020Unicode Normalization Forms - UAX #15
1021
1022=item http://www.unicode.org/unicode/reports/tr18
1023
1024Unicode Regular Expression Guidelines - UTR #18
1025
1026=item L<Unicode::Normalize>
45394607 1027
45394607 1028=back
1029
1030=cut