Re: [ PATCH ] module test fest
[p5sagit/p5-mst-13.2.git] / lib / Unicode / Collate.pm
CommitLineData
45394607 1package Unicode::Collate;
2
4a2e806c 3BEGIN {
4 if (ord("A") == 193) {
5 die "Unicode::Collate not ported to EBCDIC\n";
6 }
7}
8
45394607 9use 5.006;
10use strict;
11use warnings;
12use Carp;
5398038e 13
45394607 14require Exporter;
15
327745dc 16our $VERSION = '0.12';
45394607 17our $PACKAGE = __PACKAGE__;
18
19our @ISA = qw(Exporter);
20
21our %EXPORT_TAGS = ();
22our @EXPORT_OK = ();
23our @EXPORT = ();
24
25(our $Path = $INC{'Unicode/Collate.pm'}) =~ s/\.pm$//;
26our $KeyFile = "allkeys.txt";
27
3164dd77 28our $UNICODE_VERSION;
29
327745dc 30eval { require Unicode::UCD };
31
32unless ($@) {
33 $UNICODE_VERSION = Unicode::UCD::UnicodeVersion();
34}
35else { # XXX, Perl 5.6.1
3164dd77 36 my($f, $fh);
37 foreach my $d (@INC) {
38 use File::Spec;
3164dd77 39 $f = File::Spec->catfile($d, "unicode", "Unicode.301");
40 if (open($fh, $f)) {
327745dc 41 $UNICODE_VERSION = '3.0.1';
3164dd77 42 close $fh;
43 last;
44 }
45 }
46}
47
5398038e 48our $getCombinClass; # coderef for combining class from Unicode::Normalize
45394607 49
50use constant Min2 => 0x20; # minimum weight at level 2
51use constant Min3 => 0x02; # minimum weight at level 3
a7fbee98 52use constant UNDEFINED => 0xFF80; # special value for undefined CE's
53
54our $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
45394607 55
3164dd77 56sub UCA_Version { "8.0" }
57
327745dc 58sub Base_Unicode_Version { $UNICODE_VERSION || 'unknown' }
3164dd77 59
45394607 60##
61## constructor
62##
63sub new
64{
a7fbee98 65 my $class = shift;
66 my $self = bless { @_ }, $class;
67
68 # alternate lowercased
69 $self->{alternate} =
70 ! exists $self->{alternate} ? 'shifted' : lc($self->{alternate});
71
72 croak "$PACKAGE unknown alternate tag name: $self->{alternate}"
73 unless $self->{alternate} eq 'blanked'
74 || $self->{alternate} eq 'non-ignorable'
75 || $self->{alternate} eq 'shifted'
76 || $self->{alternate} eq 'shift-trimmed';
77
78 # collation level
79 $self->{level} ||= 4;
80
81 croak "Illegal level lower than 1 (passed $self->{level})."
82 if $self->{level} < 1;
83 croak "A level higher than 4 (passed $self->{level}) is not supported."
84 if 4 < $self->{level};
85
86 # overrideHangul and -CJK
87 # If true: CODEREF used; '': default; undef: derived elements
88 $self->{overrideHangul} = ''
89 if ! exists $self->{overrideHangul};
90 $self->{overrideCJK} = ''
91 if ! exists $self->{overrideCJK};
92
93 # normalization form
94 $self->{normalization} = 'D'
95 if ! exists $self->{normalization};
96 $self->{UNF} = undef;
97
98 if (defined $self->{normalization}) {
99 eval { require Unicode::Normalize };
100 croak "Unicode/Normalize.pm is required to normalize strings: $@"
101 if $@;
102
103 Unicode::Normalize->import();
104 $getCombinClass = \&Unicode::Normalize::getCombinClass
105 if ! $getCombinClass;
106
107 $self->{UNF} =
108 $self->{normalization} =~ /^(?:NF)?C$/ ? \&NFC :
109 $self->{normalization} =~ /^(?:NF)?D$/ ? \&NFD :
110 $self->{normalization} =~ /^(?:NF)?KC$/ ? \&NFKC :
111 $self->{normalization} =~ /^(?:NF)?KD$/ ? \&NFKD :
112 croak "$PACKAGE unknown normalization form name: "
113 . $self->{normalization};
114 }
45394607 115
a7fbee98 116 # Open a table file.
117 # If undef is passed explicitly, no file is read.
118 $self->{table} = $KeyFile
119 if ! exists $self->{table};
120 $self->read_table
121 if defined $self->{table};
905aa9f0 122
a7fbee98 123 if ($self->{entry}) {
124 $self->parseEntry($_) foreach split /\n/, $self->{entry};
125 }
905aa9f0 126
a7fbee98 127 # backwards
128 $self->{backwards} ||= [ ];
129 $self->{backwards} = [ $self->{backwards} ]
130 if ! ref $self->{backwards};
905aa9f0 131
a7fbee98 132 # rearrange
133 $self->{rearrange} = $DefaultRearrange
134 if ! exists $self->{rearrange};
135 $self->{rearrange} = []
136 if ! defined $self->{rearrange};
137 croak "$PACKAGE: A list for rearrangement must be store in an ARRAYREF"
138 if ! ref $self->{rearrange};
139
140 # keys of $self->{rearrangeHash} are $self->{rearrange}.
141 $self->{rearrangeHash} = undef;
142
143 if (@{ $self->{rearrange} }) {
144 @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
145 }
146
147 return $self;
148}
905aa9f0 149
150sub read_table {
a7fbee98 151 my $self = shift;
152 my $file = $self->{table} ne '' ? $self->{table} : $KeyFile;
153
154 open my $fk, "<$Path/$file"
155 or croak "File does not exist at $Path/$file";
156
157 while (<$fk>) {
158 next if /^\s*#/;
159 if (/^\s*\@/) {
160 if (/^\@version\s*(\S*)/) {
161 $self->{version} ||= $1;
162 }
163 elsif (/^\@alternate\s+(.*)/) {
164 $self->{alternate} ||= $1;
165 }
166 elsif (/^\@backwards\s+(.*)/) {
167 push @{ $self->{backwards} }, $1;
168 }
169 elsif (/^\@rearrange\s+(.*)/) {
170 push @{ $self->{rearrange} }, _getHexArray($1);
171 }
172 next;
173 }
174 $self->parseEntry($_);
45394607 175 }
a7fbee98 176 close $fk;
45394607 177}
178
905aa9f0 179
45394607 180##
181## get $line, parse it, and write an entry in $self
182##
183sub parseEntry
184{
a7fbee98 185 my $self = shift;
186 my $line = shift;
187 my($name, $ele, @key);
188
189 return if $line !~ /^\s*[0-9A-Fa-f]/;
190
191 # removes comment and gets name
192 $name = $1
193 if $line =~ s/[#%]\s*(.*)//;
194 return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
195
196 # gets element
197 my($e, $k) = split /;/, $line;
198 croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
199 if ! $k;
200
201 my @e = _getHexArray($e);
202 $ele = pack('U*', @e);
203 return if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
204
205 # get sort key
206 if (defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/ ||
207 defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/)
208 {
209 $self->{entries}{$ele} = $self->{ignored}{$ele} = 1;
45394607 210 }
a7fbee98 211 else {
212 my $combining = 1; # primary = 0, secondary != 0;
213
214 foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
215 my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
216 push @key, $self->altCE($var, _getHexArray($arr));
217 $combining = 0 unless $key[-1][0] == 0 && $key[-1][1] != 0;
218 }
219 $self->{entries}{$ele} = \@key;
220 $self->{combining}{$ele} = 1 if $combining;
221 }
222 $self->{maxlength}{ord $ele} = scalar @e if @e > 1;
45394607 223}
224
225
226##
d16e9e3d 227## arrayref CE = altCE(bool variable?, list[num] weights)
45394607 228##
d16e9e3d 229sub altCE
45394607 230{
a7fbee98 231 my $self = shift;
232 my $var = shift;
233 my @c = @_;
234
235 $self->{alternate} eq 'blanked' ?
236 $var ? [0,0,0,$c[3]] : \@c :
237 $self->{alternate} eq 'non-ignorable' ?
238 \@c :
239 $self->{alternate} eq 'shifted' ?
240 $var ? [0,0,0,$c[0] ] : [ @c[0..2], $c[0]+$c[1]+$c[2] ? 0xFFFF : 0 ] :
241 $self->{alternate} eq 'shift-trimmed' ?
242 $var ? [0,0,0,$c[0] ] : [ @c[0..2], 0 ] :
243 croak "$PACKAGE unknown alternate name: $self->{alternate}";
45394607 244}
245
246##
d16e9e3d 247## string hex_sortkey = splitCE(string arg)
45394607 248##
249sub viewSortKey
250{
a7fbee98 251 my $self = shift;
252 my $key = $self->getSortKey(@_);
253 my $view = join " ", map sprintf("%04X", $_), unpack 'n*', $key;
254 $view =~ s/ ?0000 ?/|/g;
255 return "[$view]";
45394607 256}
257
d16e9e3d 258
45394607 259##
d16e9e3d 260## list[strings] elements = splitCE(string arg)
45394607 261##
d16e9e3d 262sub splitCE
45394607 263{
a7fbee98 264 my $self = shift;
265 my $code = $self->{preprocess};
266 my $norm = $self->{UNF};
267 my $ent = $self->{entries};
268 my $max = $self->{maxlength};
269 my $reH = $self->{rearrangeHash};
270
271 my $str = ref $code ? &$code(shift) : shift;
272 $str = &$norm($str) if ref $norm;
273
274 my @src = unpack('U*', $str);
275 my @buf;
276
277 # rearrangement
278 if ($reH) {
279 for (my $i = 0; $i < @src; $i++) {
280 if (exists $reH->{ $src[$i] } && $i + 1 < @src) {
281 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]);
282 $i++;
283 }
284 }
45394607 285 }
45394607 286
a7fbee98 287 for (my $i = 0; $i < @src; $i++) {
288 my $ch;
289 my $u = $src[$i];
290
291 # non-characters
292 next unless defined $u;
293 next if $u < 0 || 0x10FFFF < $u # out of range
294 || (0xD800 <= $u && $u <= 0xDFFF); # unpaired surrogates
295 my $four = $u & 0xFFFF;
296 next if $four == 0xFFFE || $four == 0xFFFF;
297
298 if ($max->{$u}) { # contract
299 for (my $j = $max->{$u}; $j >= 1; $j--) {
300 next unless $i+$j-1 < @src;
301 $ch = pack 'U*', @src[$i .. $i+$j-1];
302 $i += $j-1, last if $ent->{$ch};
303 }
304 } else {
305 $ch = pack('U', $u);
306 }
307
308 # with Combining Char (UTS#10, 4.2.1), here requires Unicode::Normalize.
309 if ($getCombinClass && defined $ch) {
310 for (my $j = $i+1; $j < @src; $j++) {
311 next unless defined $src[$j];
312 last unless $getCombinClass->( $src[$j] );
313 my $comb = pack 'U', $src[$j];
314 next if ! $ent->{ $ch.$comb };
315 $ch .= $comb;
316 $src[$j] = undef;
317 }
318 }
319 push @buf, $ch;
45394607 320 }
a7fbee98 321 wantarray ? @buf : \@buf;
d16e9e3d 322}
45394607 323
d16e9e3d 324
325##
326## list[arrayrefs] weight = getWt(string element)
327##
328sub getWt
329{
a7fbee98 330 my $self = shift;
331 my $ch = shift;
332 my $ent = $self->{entries};
333 my $ign = $self->{ignored};
334 my $cjk = $self->{overrideCJK};
335 my $hang = $self->{overrideHangul};
336
337 return if !defined $ch || $ign->{$ch}; # ignored
338 return @{ $ent->{$ch} } if $ent->{$ch};
339 my $u = unpack('U', $ch);
340
341 if (0xAC00 <= $u && $u <= 0xD7A3) { # is_Hangul
342 return $hang
343 ? &$hang($u)
344 : defined $hang
345 ? map({
346 my $v = $_;
347 my $ar = $ent->{pack('U', $v)};
348 $ar ? @$ar : map($self->altCE(0,@$_), _derivCE($v));
349 } _decompHangul($u))
350 : map($self->altCE(0,@$_), _derivCE($u));
351 }
352 elsif (0x3400 <= $u && $u <= 0x4DB5 ||
353 0x4E00 <= $u && $u <= 0x9FA5 ||
354 0x20000 <= $u && $u <= 0x2A6D6) { # is_CJK
355 return $cjk
356 ? &$cjk($u)
357 : defined $cjk && $u <= 0xFFFF
358 ? $self->altCE(0, ($u, 0x20, 0x02, $u))
359 : map($self->altCE(0,@$_), _derivCE($u));
360 }
361 else {
362 return map($self->altCE(0,@$_), _derivCE($u));
363 }
d16e9e3d 364}
365
366##
367## int = index(string, substring)
368##
369sub index
370{
a7fbee98 371 my $self = shift;
372 my $lev = $self->{level};
373 my $comb = $self->{combining};
374 my $str = $self->splitCE(shift);
375 my $sub = $self->splitCE(shift);
376
377 return wantarray ? (0,0) : 0 if ! @$sub;
378 return wantarray ? () : -1 if ! @$str;
379
380 my @subWt = grep _ignorableAtLevel($_,$lev),
381 map $self->getWt($_), @$sub;
382
383 my(@strWt,@strPt);
384 my $count = 0;
385 for (my $i = 0; $i < @$str; $i++) {
386 my $go_ahead = 0;
387
388 my @tmp = grep _ignorableAtLevel($_,$lev), $self->getWt($str->[$i]);
389 $go_ahead += length $str->[$i];
390
391 # /*XXX*/ still broken.
392 # index("e\x{300}", "e") should be 'no match' at level 2 or higher
393 # as "e\x{300}" is a *single* grapheme cluster and not equal to "e".
394
395 # go ahead as far as we find a combining character;
396 while ($i + 1 < @$str &&
397 (! defined $str->[$i+1] || $comb->{ $str->[$i+1] }) ) {
398 $i++;
399 $go_ahead += length $str->[$i];
400 next if ! defined $str->[$i];
401 push @tmp,
402 grep _ignorableAtLevel($_,$lev), $self->getWt($str->[$i]);
403 }
404
405 push @strWt, @tmp;
406 push @strPt, ($count) x @tmp;
407 $count += $go_ahead;
408
409 while (@strWt >= @subWt) {
410 if (_eqArray(\@strWt, \@subWt, $lev)) {
411 my $pos = $strPt[0];
412 return wantarray ? ($pos, $count-$pos) : $pos;
413 }
414 shift @strWt;
415 shift @strPt;
416 }
d16e9e3d 417 }
a7fbee98 418 return wantarray ? () : -1;
d16e9e3d 419}
420
421##
422## bool _eqArray(arrayref, arrayref, level)
423##
424sub _eqArray($$$)
425{
a7fbee98 426 my $a = shift; # length $a >= length $b;
427 my $b = shift;
428 my $lev = shift;
429 for my $v (0..$lev-1) {
430 for my $c (0..@$b-1){
431 return if $a->[$c][$v] != $b->[$c][$v];
432 }
d16e9e3d 433 }
a7fbee98 434 return 1;
d16e9e3d 435}
436
437
438##
439## bool _ignorableAtLevel(CE, level)
440##
441sub _ignorableAtLevel($$)
442{
a7fbee98 443 my $ce = shift;
444 return unless defined $ce;
445 my $lv = shift;
446 return ! grep { ! $ce->[$_] } 0..$lv-1;
d16e9e3d 447}
448
449
450##
451## string sortkey = getSortKey(string arg)
452##
453sub getSortKey
454{
a7fbee98 455 my $self = shift;
456 my $lev = $self->{level};
457 my $rCE = $self->splitCE(shift); # get an arrayref
458
459 # weight arrays
460 my @buf = grep defined(), map $self->getWt($_), @$rCE;
461
462 # make sort key
463 my @ret = ([],[],[],[]);
464 foreach my $v (0..$lev-1) {
465 foreach my $b (@buf) {
466 push @{ $ret[$v] }, $b->[$v] if $b->[$v];
467 }
468 }
469 foreach (@{ $self->{backwards} }) {
470 my $v = $_ - 1;
471 @{ $ret[$v] } = reverse @{ $ret[$v] };
45394607 472 }
45394607 473
a7fbee98 474 # modification of tertiary weights
475 if ($self->{upper_before_lower}) {
476 foreach (@{ $ret[2] }) {
477 if (0x8 <= $_ && $_ <= 0xC) { $_ -= 6 } # lower
478 elsif (0x2 <= $_ && $_ <= 0x6) { $_ += 6 } # upper
479 elsif ($_ == 0x1C) { $_ += 1 } # square upper
480 elsif ($_ == 0x1D) { $_ -= 1 } # square lower
481 }
45394607 482 }
a7fbee98 483 if ($self->{katakana_before_hiragana}) {
484 foreach (@{ $ret[2] }) {
485 if (0x0F <= $_ && $_ <= 0x13) { $_ -= 2 } # katakana
486 elsif (0x0D <= $_ && $_ <= 0x0E) { $_ += 5 } # hiragana
487 }
45394607 488 }
a7fbee98 489 join "\0\0", map pack('n*', @$_), @ret;
45394607 490}
491
492
493##
d16e9e3d 494## int compare = cmp(string a, string b)
45394607 495##
5398038e 496sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
497sub eq { $_[0]->getSortKey($_[1]) eq $_[0]->getSortKey($_[2]) }
498sub ne { $_[0]->getSortKey($_[1]) ne $_[0]->getSortKey($_[2]) }
499sub lt { $_[0]->getSortKey($_[1]) lt $_[0]->getSortKey($_[2]) }
500sub le { $_[0]->getSortKey($_[1]) le $_[0]->getSortKey($_[2]) }
501sub gt { $_[0]->getSortKey($_[1]) gt $_[0]->getSortKey($_[2]) }
502sub ge { $_[0]->getSortKey($_[1]) ge $_[0]->getSortKey($_[2]) }
45394607 503
504##
d16e9e3d 505## list[strings] sorted = sort(list[strings] arg)
45394607 506##
a7fbee98 507sub sort {
508 my $obj = shift;
509 return
510 map { $_->[1] }
511 sort{ $a->[0] cmp $b->[0] }
512 map [ $obj->getSortKey($_), $_ ], @_;
45394607 513}
514
515##
d16e9e3d 516## list[arrayrefs] CE = _derivCE(int codepoint)
45394607 517##
a7fbee98 518sub _derivCE {
519 my $code = shift;
520 my $a = UNDEFINED + ($code >> 15); # ok
521 my $b = ($code & 0x7FFF) | 0x8000; # ok
522# my $a = 0xFFC2 + ($code >> 15); # ng
523# my $b = $code & 0x7FFF | 0x1000; # ng
524 $b ? ([$a,2,1,$code],[$b,0,0,$code]) : [$a,2,1,$code];
45394607 525}
526
527##
528## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
529##
a7fbee98 530sub _getHexArray { map hex, $_[0] =~ /([0-9a-fA-F]+)/g }
5398038e 531
a7fbee98 532#
533# $code must be in Hangul syllable.
534# Check it before you enter here.
535#
5398038e 536sub _decompHangul {
537 my $code = shift;
5398038e 538 my $SIndex = $code - 0xAC00;
539 my $LIndex = int( $SIndex / 588);
540 my $VIndex = int(($SIndex % 588) / 28);
541 my $TIndex = $SIndex % 28;
542 return (
a7fbee98 543 0x1100 + $LIndex,
544 0x1161 + $VIndex,
545 $TIndex ? (0x11A7 + $TIndex) : (),
5398038e 546 );
45394607 547}
548
5491;
550__END__
551
552=head1 NAME
553
a7fbee98 554Unicode::Collate - Unicode Collation Algorithm
45394607 555
556=head1 SYNOPSIS
557
558 use Unicode::Collate;
559
560 #construct
5398038e 561 $Collator = Unicode::Collate->new(%tailoring);
45394607 562
563 #sort
5398038e 564 @sorted = $Collator->sort(@not_sorted);
45394607 565
566 #compare
a7fbee98 567 $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
45394607 568
569=head1 DESCRIPTION
570
571=head2 Constructor and Tailoring
572
d16e9e3d 573The C<new> method returns a collator object.
574
5398038e 575 $Collator = Unicode::Collate->new(
45394607 576 alternate => $alternate,
577 backwards => $levelNumber, # or \@levelNumbers
578 entry => $element,
579 normalization => $normalization_form,
580 ignoreName => qr/$ignoreName/,
581 ignoreChar => qr/$ignoreChar/,
582 katakana_before_hiragana => $bool,
583 level => $collationLevel,
584 overrideCJK => \&overrideCJK,
585 overrideHangul => \&overrideHangul,
586 preprocess => \&preprocess,
587 rearrange => \@charList,
588 table => $filename,
589 undefName => qr/$undefName/,
590 undefChar => qr/$undefChar/,
591 upper_before_lower => $bool,
592 );
a7fbee98 593 # if %tailoring is false (i.e. empty),
5398038e 594 # $Collator should do the default collation.
45394607 595
596=over 4
597
598=item alternate
599
600-- see 3.2.2 Alternate Weighting, UTR #10.
601
a7fbee98 602This key allows to alternate weighting for variable collation elements,
603which are marked with an ASTERISK in the table
604(NOTE: Many punction marks and symbols are variable in F<allkeys.txt>).
605
606 alternate => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
45394607 607
a7fbee98 608These names are case-insensitive.
45394607 609By default (if specification is omitted), 'shifted' is adopted.
610
a7fbee98 611 'Blanked' Variable elements are ignorable at levels 1 through 3;
612 considered at the 4th level.
613
614 'Non-ignorable' Variable elements are not reset to ignorable.
615
616 'Shifted' Variable elements are ignorable at levels 1 through 3
617 their level 4 weight is replaced by the old level 1 weight.
618 Level 4 weight for Non-Variable elements is 0xFFFF.
619
620 'Shift-Trimmed' Same as 'shifted', but all FFFF's at the 4th level
621 are trimmed.
622
45394607 623=item backwards
624
625-- see 3.1.2 French Accents, UTR #10.
626
627 backwards => $levelNumber or \@levelNumbers
628
629Weights in reverse order; ex. level 2 (diacritic ordering) in French.
630If omitted, forwards at all the levels.
631
632=item entry
633
634-- see 3.1 Linguistic Features; 3.2.1 File Format, UTR #10.
635
a7fbee98 636Overrides a default order or defines additional collation elements
45394607 637
638 entry => <<'ENTRIES', # use the UCA file format
a7fbee98 63900E6 ; [.0861.0020.0002.00E6] [.08B1.0020.0002.00E6] # ligature <ae> as <a><e>
45394607 6400063 0068 ; [.0893.0020.0002.0063] # "ch" in traditional Spanish
6410043 0068 ; [.0893.0020.0008.0043] # "Ch" in traditional Spanish
642ENTRIES
643
644=item ignoreName
645
646=item ignoreChar
647
648-- see Completely Ignorable, 3.2.2 Alternate Weighting, UTR #10.
649
a7fbee98 650Makes the entry in the table ignorable.
651If a collation element is ignorable,
45394607 652it is ignored as if the element had been deleted from there.
653
a7fbee98 654E.g. when 'a' and 'e' are ignorable,
45394607 655'element' is equal to 'lament' (or 'lmnt').
656
657=item level
658
659-- see 4.3 Form a sort key for each string, UTR #10.
660
661Set the maximum level.
662Any higher levels than the specified one are ignored.
663
664 Level 1: alphabetic ordering
665 Level 2: diacritic ordering
666 Level 3: case ordering
667 Level 4: tie-breaking (e.g. in the case when alternate is 'shifted')
668
669 ex.level => 2,
670
a7fbee98 671If omitted, the maximum is the 4th.
672
45394607 673=item normalization
674
675-- see 4.1 Normalize each input string, UTR #10.
676
905aa9f0 677If specified, strings are normalized before preparation of sort keys
45394607 678(the normalization is executed after preprocess).
679
680As a form name, one of the following names must be used.
681
682 'C' or 'NFC' for Normalization Form C
683 'D' or 'NFD' for Normalization Form D
684 'KC' or 'NFKC' for Normalization Form KC
685 'KD' or 'NFKD' for Normalization Form KD
686
687If omitted, the string is put into Normalization Form D.
688
a7fbee98 689If C<undef> is passed explicitly as the value for this key,
45394607 690any normalization is not carried out (this may make tailoring easier
691if any normalization is not desired).
692
693see B<CAVEAT>.
694
695=item overrideCJK
696
45394607 697-- see 7.1 Derived Collation Elements, UTR #10.
698
699By default, mapping of CJK Unified Ideographs
a7fbee98 700uses the Unicode codepoint order.
701But the mapping of CJK Unified Ideographs may be overrided.
45394607 702
a7fbee98 703ex. CJK Unified Ideographs in the JIS code point order.
45394607 704
705 overrideCJK => sub {
a7fbee98 706 my $u = shift; # get a Unicode codepoint
707 my $b = pack('n', $u); # to UTF-16BE
708 my $s = your_unicode_to_sjis_converter($b); # convert
709 my $n = unpack('n', $s); # convert sjis to short
710 [ $n, 0x20, 0x2, $u ]; # return the collation element
45394607 711 },
712
a7fbee98 713ex. ignores all CJK Unified Ideographs.
714
715 overrideCJK => sub {()}, # CODEREF returning empty list
716
717 # where ->eq("Pe\x{4E00}rl", "Perl") is true
718 # as U+4E00 is a CJK Unified Ideograph and to be ignorable.
719
720If C<undef> is passed explicitly as the value for this key,
721weights for CJK Unified Ideographs are treated as undefined.
722But assignment of weight for CJK Unified Ideographs
723in table or L<entry> is still valid.
724
725=item overrideHangul
726
727-- see 7.1 Derived Collation Elements, UTR #10.
728
729By default, Hangul Syllables are decomposed into Hangul Jamo.
730But the mapping of Hangul Syllables may be overrided.
731
732This tag works like L<overrideCJK>, so see there for examples.
733
45394607 734If you want to override the mapping of Hangul Syllables,
735the Normalization Forms D and KD are not appropriate
736(they will be decomposed before overriding).
737
a7fbee98 738If C<undef> is passed explicitly as the value for this key,
739weight for Hangul Syllables is treated as undefined
740without decomposition into Hangul Jamo.
741But definition of weight for Hangul Syllables
742in table or L<entry> is still valid.
743
45394607 744=item preprocess
745
746-- see 5.1 Preprocessing, UTR #10.
747
748If specified, the coderef is used to preprocess
749before the formation of sort keys.
750
a7fbee98 751ex. dropping English articles, such as "a" or "the".
45394607 752Then, "the pen" is before "a pencil".
753
754 preprocess => sub {
755 my $str = shift;
a7fbee98 756 $str =~ s/\b(?:an?|the)\s+//gi;
45394607 757 $str;
758 },
759
760=item rearrange
761
762-- see 3.1.3 Rearrangement, UTR #10.
763
764Characters that are not coded in logical order and to be rearranged.
a7fbee98 765By default,
45394607 766
767 rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
768
a7fbee98 769If you want to disallow any rearrangement,
770pass C<undef> or C<[]> (a reference to an empty list)
771as the value for this key.
772
45394607 773=item table
774
775-- see 3.2 Default Unicode Collation Element Table, UTR #10.
776
777You can use another element table if desired.
778The table file must be in your C<lib/Unicode/Collate> directory.
779
780By default, the file C<lib/Unicode/Collate/allkeys.txt> is used.
781
a7fbee98 782If C<undef> is passed explicitly as the value for this key,
783no file is read (but you can define collation elements via L<entry>).
784
785A typical way to define a collation element table
786without any file of table:
787
788 $onlyABC = Unicode::Collate->new(
789 table => undef,
790 entry => << 'ENTRIES',
7910061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A
7920041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A
7930062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B
7940042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B
7950063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C
7960043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C
797ENTRIES
798 );
905aa9f0 799
45394607 800=item undefName
801
802=item undefChar
803
804-- see 6.3.4 Reducing the Repertoire, UTR #10.
805
806Undefines the collation element as if it were unassigned in the table.
807This reduces the size of the table.
808If an unassigned character appears in the string to be collated,
809the sort key is made from its codepoint
810as a single-character collation element,
811as it is greater than any other assigned collation elements
812(in the codepoint order among the unassigned characters).
813But, it'd be better to ignore characters
814unfamiliar to you and maybe never used.
815
816=item katakana_before_hiragana
817
818=item upper_before_lower
819
820-- see 6.6 Case Comparisons; 7.3.1 Tertiary Weight Table, UTR #10.
821
822By default, lowercase is before uppercase
823and hiragana is before katakana.
824
a7fbee98 825If the tag is made true, this is reversed.
826
827B<NOTE>: These tags simplemindedly assume
828any lowercase/uppercase or hiragana/katakana distinctions
829should occur in level 3, and their weights at level 3
830should be same as those mentioned in 7.3.1, UTR #10.
831If you define your collation elements which violates this,
832these tags doesn't work validly.
45394607 833
834=back
835
3164dd77 836=head2 Methods for Collation
45394607 837
838=over 4
839
5398038e 840=item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
45394607 841
842Sorts a list of strings.
843
5398038e 844=item C<$result = $Collator-E<gt>cmp($a, $b)>
45394607 845
846Returns 1 (when C<$a> is greater than C<$b>)
847or 0 (when C<$a> is equal to C<$b>)
848or -1 (when C<$a> is lesser than C<$b>).
849
5398038e 850=item C<$result = $Collator-E<gt>eq($a, $b)>
851
852=item C<$result = $Collator-E<gt>ne($a, $b)>
853
854=item C<$result = $Collator-E<gt>lt($a, $b)>
855
856=item C<$result = $Collator-E<gt>le($a, $b)>
857
858=item C<$result = $Collator-E<gt>gt($a, $b)>
859
860=item C<$result = $Collator-E<gt>ge($a, $b)>
861
a7fbee98 862They works like the same name operators as theirs.
5398038e 863
864 eq : whether $a is equal to $b.
865 ne : whether $a is not equal to $b.
866 lt : whether $a is lesser than $b.
867 le : whether $a is lesser than $b or equal to $b.
868 gt : whether $a is greater than $b.
869 ge : whether $a is greater than $b or equal to $b.
870
871=item C<$sortKey = $Collator-E<gt>getSortKey($string)>
45394607 872
873-- see 4.3 Form a sort key for each string, UTR #10.
874
875Returns a sort key.
876
877You compare the sort keys using a binary comparison
878and get the result of the comparison of the strings using UCA.
879
5398038e 880 $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
45394607 881
882 is equivalent to
883
5398038e 884 $Collator->cmp($a, $b)
45394607 885
a7fbee98 886=item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)>
887
888Returns a string formalized to display a sort key.
889Weights are enclosed with C<'['> and C<']'>
890and level boundaries are denoted by C<'|'>.
891
892 use Unicode::Collate;
893 my $c = Unicode::Collate->new();
894 print $c->viewSortKey("Perl"),"\n";
895
896 # output:
897 # [09B3 08B1 09CB 094F|0020 0020 0020 0020|0008 0002 0002 0002|FFFF FFFF FFFF FFFF]
898 # Level 1 Level 2 Level 3 Level 4
899
5398038e 900=item C<$position = $Collator-E<gt>index($string, $substring)>
d16e9e3d 901
5398038e 902=item C<($position, $length) = $Collator-E<gt>index($string, $substring)>
d16e9e3d 903
904-- see 6.8 Searching, UTR #10.
905
906If C<$substring> matches a part of C<$string>, returns
907the position of the first occurrence of the matching part in scalar context;
908in list context, returns a two-element list of
909the position and the length of the matching part.
910
911B<Notice> that the length of the matching part may differ from
912the length of C<$substring>.
913
914B<Note> that the position and the length are counted on the string
915after the process of preprocess, normalization, and rearrangement.
916Therefore, in case the specified string is not binary equal to
917the preprocessed/normalized/rearranged string, the position and the length
a7fbee98 918may differ form those on the specified string. But it is guaranteed
d16e9e3d 919that, if matched, it returns a non-negative value as C<$position>.
920
921If C<$substring> does not match any part of C<$string>,
922returns C<-1> in scalar context and
923an empty list in list context.
924
925e.g. you say
926
5398038e 927 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
d16e9e3d 928 my $str = "Ich mu\x{00DF} studieren.";
929 my $sub = "m\x{00FC}ss";
930 my $match;
a7fbee98 931 if (my($pos,$len) = $Collator->index($str, $sub)) {
5398038e 932 $match = substr($str, $pos, $len);
d16e9e3d 933 }
934
935and get C<"mu\x{00DF}"> in C<$match> since C<"mu>E<223>C<">
936is primary equal to C<"m>E<252>C<ss">.
937
45394607 938=back
939
3164dd77 940=head2 Other Methods
941
942=over 4
943
944=item UCA_Version
945
946Returns the version number of Unicode Technical Standard 10
947this module consults.
948
949=item Base_Unicode_Version
950
951Returns the version number of the Unicode Standard
952this module is based on.
953
954=back
955
45394607 956=head2 EXPORT
957
958None by default.
959
4a2e806c 960=head2 TODO
961
962Unicode::Collate has not been ported to EBCDIC. The code mostly would
963work just fine but a decision needs to be made: how the module should
964work in EBCDIC? Should the low 256 characters be understood as
965Unicode or as EBCDIC code points? Should one be chosen or should
966there be a way to do either? Or should such translation be left
967outside the module for the user to do, for example by using
968Encode::from_to()?
a7fbee98 969(or utf8::unicode_to_native()/utf8::native_to_unicode()?)
4a2e806c 970
45394607 971=head2 CAVEAT
972
973Use of the C<normalization> parameter requires
974the B<Unicode::Normalize> module.
975
5398038e 976If you need not it (say, in the case when you need not
45394607 977handle any combining characters),
978assign C<normalization =E<gt> undef> explicitly.
979
5398038e 980-- see 6.5 Avoiding Normalization, UTR #10.
981
a7fbee98 982=head2 BUGS
983
984C<index()> is an experimental method and
985its return value may be unreliable.
986The correct implementation for C<index()> must be based
987on Locale-Sensitive Support: Level 3 in UTR #18,
988F<Unicode Regular Expression Guidelines>.
989
990See also 4.2 Locale-Dependent Graphemes in UTR #18.
991
45394607 992=head1 AUTHOR
993
994SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
995
996 http://homepage1.nifty.com/nomenclator/perl/
997
a7fbee98 998 Copyright(C) 2001-2002, SADAHIRO Tomoyuki. Japan. All rights reserved.
45394607 999
a7fbee98 1000 This library is free software; you can redistribute it
1001 and/or modify it under the same terms as Perl itself.
45394607 1002
1003=head1 SEE ALSO
1004
1005=over 4
1006
a7fbee98 1007=item http://www.unicode.org/unicode/reports/tr10/
45394607 1008
a7fbee98 1009Unicode Collation Algorithm - UTR #10
45394607 1010
a7fbee98 1011=item http://www.unicode.org/unicode/reports/tr10/allkeys.txt
1012
1013The Default Unicode Collation Element Table
45394607 1014
a7fbee98 1015=item http://www.unicode.org/unicode/reports/tr15/
1016
1017Unicode Normalization Forms - UAX #15
1018
1019=item http://www.unicode.org/unicode/reports/tr18
1020
1021Unicode Regular Expression Guidelines - UTR #18
1022
1023=item L<Unicode::Normalize>
45394607 1024
45394607 1025=back
1026
1027=cut