"\"\""
[p5sagit/p5-mst-13.2.git] / lib / Unicode / Collate.pm
CommitLineData
45394607 1package Unicode::Collate;
2
4a2e806c 3BEGIN {
4 if (ord("A") == 193) {
5 die "Unicode::Collate not ported to EBCDIC\n";
6 }
7}
8
45394607 9use 5.006;
10use strict;
11use warnings;
12use Carp;
5398038e 13
45394607 14require Exporter;
15
a7fbee98 16our $VERSION = '0.11';
45394607 17our $PACKAGE = __PACKAGE__;
18
19our @ISA = qw(Exporter);
20
21our %EXPORT_TAGS = ();
22our @EXPORT_OK = ();
23our @EXPORT = ();
24
25(our $Path = $INC{'Unicode/Collate.pm'}) =~ s/\.pm$//;
26our $KeyFile = "allkeys.txt";
27
5398038e 28our $getCombinClass; # coderef for combining class from Unicode::Normalize
45394607 29
30use constant Min2 => 0x20; # minimum weight at level 2
31use constant Min3 => 0x02; # minimum weight at level 3
a7fbee98 32use constant UNDEFINED => 0xFF80; # special value for undefined CE's
33
34our $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
45394607 35
36##
37## constructor
38##
39sub new
40{
a7fbee98 41 my $class = shift;
42 my $self = bless { @_ }, $class;
43
44 # alternate lowercased
45 $self->{alternate} =
46 ! exists $self->{alternate} ? 'shifted' : lc($self->{alternate});
47
48 croak "$PACKAGE unknown alternate tag name: $self->{alternate}"
49 unless $self->{alternate} eq 'blanked'
50 || $self->{alternate} eq 'non-ignorable'
51 || $self->{alternate} eq 'shifted'
52 || $self->{alternate} eq 'shift-trimmed';
53
54 # collation level
55 $self->{level} ||= 4;
56
57 croak "Illegal level lower than 1 (passed $self->{level})."
58 if $self->{level} < 1;
59 croak "A level higher than 4 (passed $self->{level}) is not supported."
60 if 4 < $self->{level};
61
62 # overrideHangul and -CJK
63 # If true: CODEREF used; '': default; undef: derived elements
64 $self->{overrideHangul} = ''
65 if ! exists $self->{overrideHangul};
66 $self->{overrideCJK} = ''
67 if ! exists $self->{overrideCJK};
68
69 # normalization form
70 $self->{normalization} = 'D'
71 if ! exists $self->{normalization};
72 $self->{UNF} = undef;
73
74 if (defined $self->{normalization}) {
75 eval { require Unicode::Normalize };
76 croak "Unicode/Normalize.pm is required to normalize strings: $@"
77 if $@;
78
79 Unicode::Normalize->import();
80 $getCombinClass = \&Unicode::Normalize::getCombinClass
81 if ! $getCombinClass;
82
83 $self->{UNF} =
84 $self->{normalization} =~ /^(?:NF)?C$/ ? \&NFC :
85 $self->{normalization} =~ /^(?:NF)?D$/ ? \&NFD :
86 $self->{normalization} =~ /^(?:NF)?KC$/ ? \&NFKC :
87 $self->{normalization} =~ /^(?:NF)?KD$/ ? \&NFKD :
88 croak "$PACKAGE unknown normalization form name: "
89 . $self->{normalization};
90 }
45394607 91
a7fbee98 92 # Open a table file.
93 # If undef is passed explicitly, no file is read.
94 $self->{table} = $KeyFile
95 if ! exists $self->{table};
96 $self->read_table
97 if defined $self->{table};
905aa9f0 98
a7fbee98 99 if ($self->{entry}) {
100 $self->parseEntry($_) foreach split /\n/, $self->{entry};
101 }
905aa9f0 102
a7fbee98 103 # backwards
104 $self->{backwards} ||= [ ];
105 $self->{backwards} = [ $self->{backwards} ]
106 if ! ref $self->{backwards};
905aa9f0 107
a7fbee98 108 # rearrange
109 $self->{rearrange} = $DefaultRearrange
110 if ! exists $self->{rearrange};
111 $self->{rearrange} = []
112 if ! defined $self->{rearrange};
113 croak "$PACKAGE: A list for rearrangement must be store in an ARRAYREF"
114 if ! ref $self->{rearrange};
115
116 # keys of $self->{rearrangeHash} are $self->{rearrange}.
117 $self->{rearrangeHash} = undef;
118
119 if (@{ $self->{rearrange} }) {
120 @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
121 }
122
123 return $self;
124}
905aa9f0 125
126sub read_table {
a7fbee98 127 my $self = shift;
128 my $file = $self->{table} ne '' ? $self->{table} : $KeyFile;
129
130 open my $fk, "<$Path/$file"
131 or croak "File does not exist at $Path/$file";
132
133 while (<$fk>) {
134 next if /^\s*#/;
135 if (/^\s*\@/) {
136 if (/^\@version\s*(\S*)/) {
137 $self->{version} ||= $1;
138 }
139 elsif (/^\@alternate\s+(.*)/) {
140 $self->{alternate} ||= $1;
141 }
142 elsif (/^\@backwards\s+(.*)/) {
143 push @{ $self->{backwards} }, $1;
144 }
145 elsif (/^\@rearrange\s+(.*)/) {
146 push @{ $self->{rearrange} }, _getHexArray($1);
147 }
148 next;
149 }
150 $self->parseEntry($_);
45394607 151 }
a7fbee98 152 close $fk;
45394607 153}
154
905aa9f0 155
45394607 156##
157## get $line, parse it, and write an entry in $self
158##
159sub parseEntry
160{
a7fbee98 161 my $self = shift;
162 my $line = shift;
163 my($name, $ele, @key);
164
165 return if $line !~ /^\s*[0-9A-Fa-f]/;
166
167 # removes comment and gets name
168 $name = $1
169 if $line =~ s/[#%]\s*(.*)//;
170 return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
171
172 # gets element
173 my($e, $k) = split /;/, $line;
174 croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
175 if ! $k;
176
177 my @e = _getHexArray($e);
178 $ele = pack('U*', @e);
179 return if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
180
181 # get sort key
182 if (defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/ ||
183 defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/)
184 {
185 $self->{entries}{$ele} = $self->{ignored}{$ele} = 1;
45394607 186 }
a7fbee98 187 else {
188 my $combining = 1; # primary = 0, secondary != 0;
189
190 foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
191 my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
192 push @key, $self->altCE($var, _getHexArray($arr));
193 $combining = 0 unless $key[-1][0] == 0 && $key[-1][1] != 0;
194 }
195 $self->{entries}{$ele} = \@key;
196 $self->{combining}{$ele} = 1 if $combining;
197 }
198 $self->{maxlength}{ord $ele} = scalar @e if @e > 1;
45394607 199}
200
201
202##
d16e9e3d 203## arrayref CE = altCE(bool variable?, list[num] weights)
45394607 204##
d16e9e3d 205sub altCE
45394607 206{
a7fbee98 207 my $self = shift;
208 my $var = shift;
209 my @c = @_;
210
211 $self->{alternate} eq 'blanked' ?
212 $var ? [0,0,0,$c[3]] : \@c :
213 $self->{alternate} eq 'non-ignorable' ?
214 \@c :
215 $self->{alternate} eq 'shifted' ?
216 $var ? [0,0,0,$c[0] ] : [ @c[0..2], $c[0]+$c[1]+$c[2] ? 0xFFFF : 0 ] :
217 $self->{alternate} eq 'shift-trimmed' ?
218 $var ? [0,0,0,$c[0] ] : [ @c[0..2], 0 ] :
219 croak "$PACKAGE unknown alternate name: $self->{alternate}";
45394607 220}
221
222##
d16e9e3d 223## string hex_sortkey = splitCE(string arg)
45394607 224##
225sub viewSortKey
226{
a7fbee98 227 my $self = shift;
228 my $key = $self->getSortKey(@_);
229 my $view = join " ", map sprintf("%04X", $_), unpack 'n*', $key;
230 $view =~ s/ ?0000 ?/|/g;
231 return "[$view]";
45394607 232}
233
d16e9e3d 234
45394607 235##
d16e9e3d 236## list[strings] elements = splitCE(string arg)
45394607 237##
d16e9e3d 238sub splitCE
45394607 239{
a7fbee98 240 my $self = shift;
241 my $code = $self->{preprocess};
242 my $norm = $self->{UNF};
243 my $ent = $self->{entries};
244 my $max = $self->{maxlength};
245 my $reH = $self->{rearrangeHash};
246
247 my $str = ref $code ? &$code(shift) : shift;
248 $str = &$norm($str) if ref $norm;
249
250 my @src = unpack('U*', $str);
251 my @buf;
252
253 # rearrangement
254 if ($reH) {
255 for (my $i = 0; $i < @src; $i++) {
256 if (exists $reH->{ $src[$i] } && $i + 1 < @src) {
257 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]);
258 $i++;
259 }
260 }
45394607 261 }
45394607 262
a7fbee98 263 for (my $i = 0; $i < @src; $i++) {
264 my $ch;
265 my $u = $src[$i];
266
267 # non-characters
268 next unless defined $u;
269 next if $u < 0 || 0x10FFFF < $u # out of range
270 || (0xD800 <= $u && $u <= 0xDFFF); # unpaired surrogates
271 my $four = $u & 0xFFFF;
272 next if $four == 0xFFFE || $four == 0xFFFF;
273
274 if ($max->{$u}) { # contract
275 for (my $j = $max->{$u}; $j >= 1; $j--) {
276 next unless $i+$j-1 < @src;
277 $ch = pack 'U*', @src[$i .. $i+$j-1];
278 $i += $j-1, last if $ent->{$ch};
279 }
280 } else {
281 $ch = pack('U', $u);
282 }
283
284 # with Combining Char (UTS#10, 4.2.1), here requires Unicode::Normalize.
285 if ($getCombinClass && defined $ch) {
286 for (my $j = $i+1; $j < @src; $j++) {
287 next unless defined $src[$j];
288 last unless $getCombinClass->( $src[$j] );
289 my $comb = pack 'U', $src[$j];
290 next if ! $ent->{ $ch.$comb };
291 $ch .= $comb;
292 $src[$j] = undef;
293 }
294 }
295 push @buf, $ch;
45394607 296 }
a7fbee98 297 wantarray ? @buf : \@buf;
d16e9e3d 298}
45394607 299
d16e9e3d 300
301##
302## list[arrayrefs] weight = getWt(string element)
303##
304sub getWt
305{
a7fbee98 306 my $self = shift;
307 my $ch = shift;
308 my $ent = $self->{entries};
309 my $ign = $self->{ignored};
310 my $cjk = $self->{overrideCJK};
311 my $hang = $self->{overrideHangul};
312
313 return if !defined $ch || $ign->{$ch}; # ignored
314 return @{ $ent->{$ch} } if $ent->{$ch};
315 my $u = unpack('U', $ch);
316
317 if (0xAC00 <= $u && $u <= 0xD7A3) { # is_Hangul
318 return $hang
319 ? &$hang($u)
320 : defined $hang
321 ? map({
322 my $v = $_;
323 my $ar = $ent->{pack('U', $v)};
324 $ar ? @$ar : map($self->altCE(0,@$_), _derivCE($v));
325 } _decompHangul($u))
326 : map($self->altCE(0,@$_), _derivCE($u));
327 }
328 elsif (0x3400 <= $u && $u <= 0x4DB5 ||
329 0x4E00 <= $u && $u <= 0x9FA5 ||
330 0x20000 <= $u && $u <= 0x2A6D6) { # is_CJK
331 return $cjk
332 ? &$cjk($u)
333 : defined $cjk && $u <= 0xFFFF
334 ? $self->altCE(0, ($u, 0x20, 0x02, $u))
335 : map($self->altCE(0,@$_), _derivCE($u));
336 }
337 else {
338 return map($self->altCE(0,@$_), _derivCE($u));
339 }
d16e9e3d 340}
341
342##
343## int = index(string, substring)
344##
345sub index
346{
a7fbee98 347 my $self = shift;
348 my $lev = $self->{level};
349 my $comb = $self->{combining};
350 my $str = $self->splitCE(shift);
351 my $sub = $self->splitCE(shift);
352
353 return wantarray ? (0,0) : 0 if ! @$sub;
354 return wantarray ? () : -1 if ! @$str;
355
356 my @subWt = grep _ignorableAtLevel($_,$lev),
357 map $self->getWt($_), @$sub;
358
359 my(@strWt,@strPt);
360 my $count = 0;
361 for (my $i = 0; $i < @$str; $i++) {
362 my $go_ahead = 0;
363
364 my @tmp = grep _ignorableAtLevel($_,$lev), $self->getWt($str->[$i]);
365 $go_ahead += length $str->[$i];
366
367 # /*XXX*/ still broken.
368 # index("e\x{300}", "e") should be 'no match' at level 2 or higher
369 # as "e\x{300}" is a *single* grapheme cluster and not equal to "e".
370
371 # go ahead as far as we find a combining character;
372 while ($i + 1 < @$str &&
373 (! defined $str->[$i+1] || $comb->{ $str->[$i+1] }) ) {
374 $i++;
375 $go_ahead += length $str->[$i];
376 next if ! defined $str->[$i];
377 push @tmp,
378 grep _ignorableAtLevel($_,$lev), $self->getWt($str->[$i]);
379 }
380
381 push @strWt, @tmp;
382 push @strPt, ($count) x @tmp;
383 $count += $go_ahead;
384
385 while (@strWt >= @subWt) {
386 if (_eqArray(\@strWt, \@subWt, $lev)) {
387 my $pos = $strPt[0];
388 return wantarray ? ($pos, $count-$pos) : $pos;
389 }
390 shift @strWt;
391 shift @strPt;
392 }
d16e9e3d 393 }
a7fbee98 394 return wantarray ? () : -1;
d16e9e3d 395}
396
397##
398## bool _eqArray(arrayref, arrayref, level)
399##
400sub _eqArray($$$)
401{
a7fbee98 402 my $a = shift; # length $a >= length $b;
403 my $b = shift;
404 my $lev = shift;
405 for my $v (0..$lev-1) {
406 for my $c (0..@$b-1){
407 return if $a->[$c][$v] != $b->[$c][$v];
408 }
d16e9e3d 409 }
a7fbee98 410 return 1;
d16e9e3d 411}
412
413
414##
415## bool _ignorableAtLevel(CE, level)
416##
417sub _ignorableAtLevel($$)
418{
a7fbee98 419 my $ce = shift;
420 return unless defined $ce;
421 my $lv = shift;
422 return ! grep { ! $ce->[$_] } 0..$lv-1;
d16e9e3d 423}
424
425
426##
427## string sortkey = getSortKey(string arg)
428##
429sub getSortKey
430{
a7fbee98 431 my $self = shift;
432 my $lev = $self->{level};
433 my $rCE = $self->splitCE(shift); # get an arrayref
434
435 # weight arrays
436 my @buf = grep defined(), map $self->getWt($_), @$rCE;
437
438 # make sort key
439 my @ret = ([],[],[],[]);
440 foreach my $v (0..$lev-1) {
441 foreach my $b (@buf) {
442 push @{ $ret[$v] }, $b->[$v] if $b->[$v];
443 }
444 }
445 foreach (@{ $self->{backwards} }) {
446 my $v = $_ - 1;
447 @{ $ret[$v] } = reverse @{ $ret[$v] };
45394607 448 }
45394607 449
a7fbee98 450 # modification of tertiary weights
451 if ($self->{upper_before_lower}) {
452 foreach (@{ $ret[2] }) {
453 if (0x8 <= $_ && $_ <= 0xC) { $_ -= 6 } # lower
454 elsif (0x2 <= $_ && $_ <= 0x6) { $_ += 6 } # upper
455 elsif ($_ == 0x1C) { $_ += 1 } # square upper
456 elsif ($_ == 0x1D) { $_ -= 1 } # square lower
457 }
45394607 458 }
a7fbee98 459 if ($self->{katakana_before_hiragana}) {
460 foreach (@{ $ret[2] }) {
461 if (0x0F <= $_ && $_ <= 0x13) { $_ -= 2 } # katakana
462 elsif (0x0D <= $_ && $_ <= 0x0E) { $_ += 5 } # hiragana
463 }
45394607 464 }
a7fbee98 465 join "\0\0", map pack('n*', @$_), @ret;
45394607 466}
467
468
469##
d16e9e3d 470## int compare = cmp(string a, string b)
45394607 471##
5398038e 472sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
473sub eq { $_[0]->getSortKey($_[1]) eq $_[0]->getSortKey($_[2]) }
474sub ne { $_[0]->getSortKey($_[1]) ne $_[0]->getSortKey($_[2]) }
475sub lt { $_[0]->getSortKey($_[1]) lt $_[0]->getSortKey($_[2]) }
476sub le { $_[0]->getSortKey($_[1]) le $_[0]->getSortKey($_[2]) }
477sub gt { $_[0]->getSortKey($_[1]) gt $_[0]->getSortKey($_[2]) }
478sub ge { $_[0]->getSortKey($_[1]) ge $_[0]->getSortKey($_[2]) }
45394607 479
480##
d16e9e3d 481## list[strings] sorted = sort(list[strings] arg)
45394607 482##
a7fbee98 483sub sort {
484 my $obj = shift;
485 return
486 map { $_->[1] }
487 sort{ $a->[0] cmp $b->[0] }
488 map [ $obj->getSortKey($_), $_ ], @_;
45394607 489}
490
491##
d16e9e3d 492## list[arrayrefs] CE = _derivCE(int codepoint)
45394607 493##
a7fbee98 494sub _derivCE {
495 my $code = shift;
496 my $a = UNDEFINED + ($code >> 15); # ok
497 my $b = ($code & 0x7FFF) | 0x8000; # ok
498# my $a = 0xFFC2 + ($code >> 15); # ng
499# my $b = $code & 0x7FFF | 0x1000; # ng
500 $b ? ([$a,2,1,$code],[$b,0,0,$code]) : [$a,2,1,$code];
45394607 501}
502
503##
504## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
505##
a7fbee98 506sub _getHexArray { map hex, $_[0] =~ /([0-9a-fA-F]+)/g }
5398038e 507
a7fbee98 508#
509# $code must be in Hangul syllable.
510# Check it before you enter here.
511#
5398038e 512sub _decompHangul {
513 my $code = shift;
5398038e 514 my $SIndex = $code - 0xAC00;
515 my $LIndex = int( $SIndex / 588);
516 my $VIndex = int(($SIndex % 588) / 28);
517 my $TIndex = $SIndex % 28;
518 return (
a7fbee98 519 0x1100 + $LIndex,
520 0x1161 + $VIndex,
521 $TIndex ? (0x11A7 + $TIndex) : (),
5398038e 522 );
45394607 523}
524
5251;
526__END__
527
528=head1 NAME
529
a7fbee98 530Unicode::Collate - Unicode Collation Algorithm
45394607 531
532=head1 SYNOPSIS
533
534 use Unicode::Collate;
535
536 #construct
5398038e 537 $Collator = Unicode::Collate->new(%tailoring);
45394607 538
539 #sort
5398038e 540 @sorted = $Collator->sort(@not_sorted);
45394607 541
542 #compare
a7fbee98 543 $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
45394607 544
545=head1 DESCRIPTION
546
547=head2 Constructor and Tailoring
548
d16e9e3d 549The C<new> method returns a collator object.
550
5398038e 551 $Collator = Unicode::Collate->new(
45394607 552 alternate => $alternate,
553 backwards => $levelNumber, # or \@levelNumbers
554 entry => $element,
555 normalization => $normalization_form,
556 ignoreName => qr/$ignoreName/,
557 ignoreChar => qr/$ignoreChar/,
558 katakana_before_hiragana => $bool,
559 level => $collationLevel,
560 overrideCJK => \&overrideCJK,
561 overrideHangul => \&overrideHangul,
562 preprocess => \&preprocess,
563 rearrange => \@charList,
564 table => $filename,
565 undefName => qr/$undefName/,
566 undefChar => qr/$undefChar/,
567 upper_before_lower => $bool,
568 );
a7fbee98 569 # if %tailoring is false (i.e. empty),
5398038e 570 # $Collator should do the default collation.
45394607 571
572=over 4
573
574=item alternate
575
576-- see 3.2.2 Alternate Weighting, UTR #10.
577
a7fbee98 578This key allows to alternate weighting for variable collation elements,
579which are marked with an ASTERISK in the table
580(NOTE: Many punction marks and symbols are variable in F<allkeys.txt>).
581
582 alternate => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
45394607 583
a7fbee98 584These names are case-insensitive.
45394607 585By default (if specification is omitted), 'shifted' is adopted.
586
a7fbee98 587 'Blanked' Variable elements are ignorable at levels 1 through 3;
588 considered at the 4th level.
589
590 'Non-ignorable' Variable elements are not reset to ignorable.
591
592 'Shifted' Variable elements are ignorable at levels 1 through 3
593 their level 4 weight is replaced by the old level 1 weight.
594 Level 4 weight for Non-Variable elements is 0xFFFF.
595
596 'Shift-Trimmed' Same as 'shifted', but all FFFF's at the 4th level
597 are trimmed.
598
45394607 599=item backwards
600
601-- see 3.1.2 French Accents, UTR #10.
602
603 backwards => $levelNumber or \@levelNumbers
604
605Weights in reverse order; ex. level 2 (diacritic ordering) in French.
606If omitted, forwards at all the levels.
607
608=item entry
609
610-- see 3.1 Linguistic Features; 3.2.1 File Format, UTR #10.
611
a7fbee98 612Overrides a default order or defines additional collation elements
45394607 613
614 entry => <<'ENTRIES', # use the UCA file format
a7fbee98 61500E6 ; [.0861.0020.0002.00E6] [.08B1.0020.0002.00E6] # ligature <ae> as <a><e>
45394607 6160063 0068 ; [.0893.0020.0002.0063] # "ch" in traditional Spanish
6170043 0068 ; [.0893.0020.0008.0043] # "Ch" in traditional Spanish
618ENTRIES
619
620=item ignoreName
621
622=item ignoreChar
623
624-- see Completely Ignorable, 3.2.2 Alternate Weighting, UTR #10.
625
a7fbee98 626Makes the entry in the table ignorable.
627If a collation element is ignorable,
45394607 628it is ignored as if the element had been deleted from there.
629
a7fbee98 630E.g. when 'a' and 'e' are ignorable,
45394607 631'element' is equal to 'lament' (or 'lmnt').
632
633=item level
634
635-- see 4.3 Form a sort key for each string, UTR #10.
636
637Set the maximum level.
638Any higher levels than the specified one are ignored.
639
640 Level 1: alphabetic ordering
641 Level 2: diacritic ordering
642 Level 3: case ordering
643 Level 4: tie-breaking (e.g. in the case when alternate is 'shifted')
644
645 ex.level => 2,
646
a7fbee98 647If omitted, the maximum is the 4th.
648
45394607 649=item normalization
650
651-- see 4.1 Normalize each input string, UTR #10.
652
905aa9f0 653If specified, strings are normalized before preparation of sort keys
45394607 654(the normalization is executed after preprocess).
655
656As a form name, one of the following names must be used.
657
658 'C' or 'NFC' for Normalization Form C
659 'D' or 'NFD' for Normalization Form D
660 'KC' or 'NFKC' for Normalization Form KC
661 'KD' or 'NFKD' for Normalization Form KD
662
663If omitted, the string is put into Normalization Form D.
664
a7fbee98 665If C<undef> is passed explicitly as the value for this key,
45394607 666any normalization is not carried out (this may make tailoring easier
667if any normalization is not desired).
668
669see B<CAVEAT>.
670
671=item overrideCJK
672
45394607 673-- see 7.1 Derived Collation Elements, UTR #10.
674
675By default, mapping of CJK Unified Ideographs
a7fbee98 676uses the Unicode codepoint order.
677But the mapping of CJK Unified Ideographs may be overrided.
45394607 678
a7fbee98 679ex. CJK Unified Ideographs in the JIS code point order.
45394607 680
681 overrideCJK => sub {
a7fbee98 682 my $u = shift; # get a Unicode codepoint
683 my $b = pack('n', $u); # to UTF-16BE
684 my $s = your_unicode_to_sjis_converter($b); # convert
685 my $n = unpack('n', $s); # convert sjis to short
686 [ $n, 0x20, 0x2, $u ]; # return the collation element
45394607 687 },
688
a7fbee98 689ex. ignores all CJK Unified Ideographs.
690
691 overrideCJK => sub {()}, # CODEREF returning empty list
692
693 # where ->eq("Pe\x{4E00}rl", "Perl") is true
694 # as U+4E00 is a CJK Unified Ideograph and to be ignorable.
695
696If C<undef> is passed explicitly as the value for this key,
697weights for CJK Unified Ideographs are treated as undefined.
698But assignment of weight for CJK Unified Ideographs
699in table or L<entry> is still valid.
700
701=item overrideHangul
702
703-- see 7.1 Derived Collation Elements, UTR #10.
704
705By default, Hangul Syllables are decomposed into Hangul Jamo.
706But the mapping of Hangul Syllables may be overrided.
707
708This tag works like L<overrideCJK>, so see there for examples.
709
45394607 710If you want to override the mapping of Hangul Syllables,
711the Normalization Forms D and KD are not appropriate
712(they will be decomposed before overriding).
713
a7fbee98 714If C<undef> is passed explicitly as the value for this key,
715weight for Hangul Syllables is treated as undefined
716without decomposition into Hangul Jamo.
717But definition of weight for Hangul Syllables
718in table or L<entry> is still valid.
719
45394607 720=item preprocess
721
722-- see 5.1 Preprocessing, UTR #10.
723
724If specified, the coderef is used to preprocess
725before the formation of sort keys.
726
a7fbee98 727ex. dropping English articles, such as "a" or "the".
45394607 728Then, "the pen" is before "a pencil".
729
730 preprocess => sub {
731 my $str = shift;
a7fbee98 732 $str =~ s/\b(?:an?|the)\s+//gi;
45394607 733 $str;
734 },
735
736=item rearrange
737
738-- see 3.1.3 Rearrangement, UTR #10.
739
740Characters that are not coded in logical order and to be rearranged.
a7fbee98 741By default,
45394607 742
743 rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
744
a7fbee98 745If you want to disallow any rearrangement,
746pass C<undef> or C<[]> (a reference to an empty list)
747as the value for this key.
748
45394607 749=item table
750
751-- see 3.2 Default Unicode Collation Element Table, UTR #10.
752
753You can use another element table if desired.
754The table file must be in your C<lib/Unicode/Collate> directory.
755
756By default, the file C<lib/Unicode/Collate/allkeys.txt> is used.
757
a7fbee98 758If C<undef> is passed explicitly as the value for this key,
759no file is read (but you can define collation elements via L<entry>).
760
761A typical way to define a collation element table
762without any file of table:
763
764 $onlyABC = Unicode::Collate->new(
765 table => undef,
766 entry => << 'ENTRIES',
7670061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A
7680041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A
7690062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B
7700042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B
7710063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C
7720043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C
773ENTRIES
774 );
905aa9f0 775
45394607 776=item undefName
777
778=item undefChar
779
780-- see 6.3.4 Reducing the Repertoire, UTR #10.
781
782Undefines the collation element as if it were unassigned in the table.
783This reduces the size of the table.
784If an unassigned character appears in the string to be collated,
785the sort key is made from its codepoint
786as a single-character collation element,
787as it is greater than any other assigned collation elements
788(in the codepoint order among the unassigned characters).
789But, it'd be better to ignore characters
790unfamiliar to you and maybe never used.
791
792=item katakana_before_hiragana
793
794=item upper_before_lower
795
796-- see 6.6 Case Comparisons; 7.3.1 Tertiary Weight Table, UTR #10.
797
798By default, lowercase is before uppercase
799and hiragana is before katakana.
800
a7fbee98 801If the tag is made true, this is reversed.
802
803B<NOTE>: These tags simplemindedly assume
804any lowercase/uppercase or hiragana/katakana distinctions
805should occur in level 3, and their weights at level 3
806should be same as those mentioned in 7.3.1, UTR #10.
807If you define your collation elements which violates this,
808these tags doesn't work validly.
45394607 809
810=back
811
812=head2 Other methods
813
814=over 4
815
5398038e 816=item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
45394607 817
818Sorts a list of strings.
819
5398038e 820=item C<$result = $Collator-E<gt>cmp($a, $b)>
45394607 821
822Returns 1 (when C<$a> is greater than C<$b>)
823or 0 (when C<$a> is equal to C<$b>)
824or -1 (when C<$a> is lesser than C<$b>).
825
5398038e 826=item C<$result = $Collator-E<gt>eq($a, $b)>
827
828=item C<$result = $Collator-E<gt>ne($a, $b)>
829
830=item C<$result = $Collator-E<gt>lt($a, $b)>
831
832=item C<$result = $Collator-E<gt>le($a, $b)>
833
834=item C<$result = $Collator-E<gt>gt($a, $b)>
835
836=item C<$result = $Collator-E<gt>ge($a, $b)>
837
a7fbee98 838They works like the same name operators as theirs.
5398038e 839
840 eq : whether $a is equal to $b.
841 ne : whether $a is not equal to $b.
842 lt : whether $a is lesser than $b.
843 le : whether $a is lesser than $b or equal to $b.
844 gt : whether $a is greater than $b.
845 ge : whether $a is greater than $b or equal to $b.
846
847=item C<$sortKey = $Collator-E<gt>getSortKey($string)>
45394607 848
849-- see 4.3 Form a sort key for each string, UTR #10.
850
851Returns a sort key.
852
853You compare the sort keys using a binary comparison
854and get the result of the comparison of the strings using UCA.
855
5398038e 856 $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
45394607 857
858 is equivalent to
859
5398038e 860 $Collator->cmp($a, $b)
45394607 861
a7fbee98 862=item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)>
863
864Returns a string formalized to display a sort key.
865Weights are enclosed with C<'['> and C<']'>
866and level boundaries are denoted by C<'|'>.
867
868 use Unicode::Collate;
869 my $c = Unicode::Collate->new();
870 print $c->viewSortKey("Perl"),"\n";
871
872 # output:
873 # [09B3 08B1 09CB 094F|0020 0020 0020 0020|0008 0002 0002 0002|FFFF FFFF FFFF FFFF]
874 # Level 1 Level 2 Level 3 Level 4
875
5398038e 876=item C<$position = $Collator-E<gt>index($string, $substring)>
d16e9e3d 877
5398038e 878=item C<($position, $length) = $Collator-E<gt>index($string, $substring)>
d16e9e3d 879
880-- see 6.8 Searching, UTR #10.
881
882If C<$substring> matches a part of C<$string>, returns
883the position of the first occurrence of the matching part in scalar context;
884in list context, returns a two-element list of
885the position and the length of the matching part.
886
887B<Notice> that the length of the matching part may differ from
888the length of C<$substring>.
889
890B<Note> that the position and the length are counted on the string
891after the process of preprocess, normalization, and rearrangement.
892Therefore, in case the specified string is not binary equal to
893the preprocessed/normalized/rearranged string, the position and the length
a7fbee98 894may differ form those on the specified string. But it is guaranteed
d16e9e3d 895that, if matched, it returns a non-negative value as C<$position>.
896
897If C<$substring> does not match any part of C<$string>,
898returns C<-1> in scalar context and
899an empty list in list context.
900
901e.g. you say
902
5398038e 903 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
d16e9e3d 904 my $str = "Ich mu\x{00DF} studieren.";
905 my $sub = "m\x{00FC}ss";
906 my $match;
a7fbee98 907 if (my($pos,$len) = $Collator->index($str, $sub)) {
5398038e 908 $match = substr($str, $pos, $len);
d16e9e3d 909 }
910
911and get C<"mu\x{00DF}"> in C<$match> since C<"mu>E<223>C<">
912is primary equal to C<"m>E<252>C<ss">.
913
45394607 914=back
915
916=head2 EXPORT
917
918None by default.
919
4a2e806c 920=head2 TODO
921
922Unicode::Collate has not been ported to EBCDIC. The code mostly would
923work just fine but a decision needs to be made: how the module should
924work in EBCDIC? Should the low 256 characters be understood as
925Unicode or as EBCDIC code points? Should one be chosen or should
926there be a way to do either? Or should such translation be left
927outside the module for the user to do, for example by using
928Encode::from_to()?
a7fbee98 929(or utf8::unicode_to_native()/utf8::native_to_unicode()?)
4a2e806c 930
45394607 931=head2 CAVEAT
932
933Use of the C<normalization> parameter requires
934the B<Unicode::Normalize> module.
935
5398038e 936If you need not it (say, in the case when you need not
45394607 937handle any combining characters),
938assign C<normalization =E<gt> undef> explicitly.
939
5398038e 940-- see 6.5 Avoiding Normalization, UTR #10.
941
a7fbee98 942=head2 BUGS
943
944C<index()> is an experimental method and
945its return value may be unreliable.
946The correct implementation for C<index()> must be based
947on Locale-Sensitive Support: Level 3 in UTR #18,
948F<Unicode Regular Expression Guidelines>.
949
950See also 4.2 Locale-Dependent Graphemes in UTR #18.
951
45394607 952=head1 AUTHOR
953
954SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
955
956 http://homepage1.nifty.com/nomenclator/perl/
957
a7fbee98 958 Copyright(C) 2001-2002, SADAHIRO Tomoyuki. Japan. All rights reserved.
45394607 959
a7fbee98 960 This library is free software; you can redistribute it
961 and/or modify it under the same terms as Perl itself.
45394607 962
963=head1 SEE ALSO
964
965=over 4
966
a7fbee98 967=item http://www.unicode.org/unicode/reports/tr10/
45394607 968
a7fbee98 969Unicode Collation Algorithm - UTR #10
45394607 970
a7fbee98 971=item http://www.unicode.org/unicode/reports/tr10/allkeys.txt
972
973The Default Unicode Collation Element Table
45394607 974
a7fbee98 975=item http://www.unicode.org/unicode/reports/tr15/
976
977Unicode Normalization Forms - UAX #15
978
979=item http://www.unicode.org/unicode/reports/tr18
980
981Unicode Regular Expression Guidelines - UTR #18
982
983=item L<Unicode::Normalize>
45394607 984
45394607 985=back
986
987=cut