Commas, schcommas.
[p5sagit/p5-mst-13.2.git] / lib / Unicode / Collate.pm
CommitLineData
45394607 1package Unicode::Collate;
2
4a2e806c 3BEGIN {
4 if (ord("A") == 193) {
5 die "Unicode::Collate not ported to EBCDIC\n";
6 }
7}
8
45394607 9use 5.006;
10use strict;
11use warnings;
12use Carp;
e69a2255 13use File::Spec;
5398038e 14
45394607 15require Exporter;
16
327745dc 17our $VERSION = '0.12';
45394607 18our $PACKAGE = __PACKAGE__;
19
20our @ISA = qw(Exporter);
21
22our %EXPORT_TAGS = ();
23our @EXPORT_OK = ();
24our @EXPORT = ();
25
26(our $Path = $INC{'Unicode/Collate.pm'}) =~ s/\.pm$//;
27our $KeyFile = "allkeys.txt";
28
3164dd77 29our $UNICODE_VERSION;
30
327745dc 31eval { require Unicode::UCD };
32
33unless ($@) {
34 $UNICODE_VERSION = Unicode::UCD::UnicodeVersion();
35}
36else { # XXX, Perl 5.6.1
3164dd77 37 my($f, $fh);
38 foreach my $d (@INC) {
39 use File::Spec;
3164dd77 40 $f = File::Spec->catfile($d, "unicode", "Unicode.301");
41 if (open($fh, $f)) {
327745dc 42 $UNICODE_VERSION = '3.0.1';
3164dd77 43 close $fh;
44 last;
45 }
46 }
47}
48
5398038e 49our $getCombinClass; # coderef for combining class from Unicode::Normalize
45394607 50
51use constant Min2 => 0x20; # minimum weight at level 2
52use constant Min3 => 0x02; # minimum weight at level 3
a7fbee98 53use constant UNDEFINED => 0xFF80; # special value for undefined CE's
54
55our $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
45394607 56
3164dd77 57sub UCA_Version { "8.0" }
58
327745dc 59sub Base_Unicode_Version { $UNICODE_VERSION || 'unknown' }
3164dd77 60
45394607 61##
62## constructor
63##
64sub new
65{
a7fbee98 66 my $class = shift;
67 my $self = bless { @_ }, $class;
68
69 # alternate lowercased
70 $self->{alternate} =
71 ! exists $self->{alternate} ? 'shifted' : lc($self->{alternate});
72
73 croak "$PACKAGE unknown alternate tag name: $self->{alternate}"
74 unless $self->{alternate} eq 'blanked'
75 || $self->{alternate} eq 'non-ignorable'
76 || $self->{alternate} eq 'shifted'
77 || $self->{alternate} eq 'shift-trimmed';
78
79 # collation level
80 $self->{level} ||= 4;
81
82 croak "Illegal level lower than 1 (passed $self->{level})."
83 if $self->{level} < 1;
84 croak "A level higher than 4 (passed $self->{level}) is not supported."
85 if 4 < $self->{level};
86
87 # overrideHangul and -CJK
88 # If true: CODEREF used; '': default; undef: derived elements
89 $self->{overrideHangul} = ''
90 if ! exists $self->{overrideHangul};
91 $self->{overrideCJK} = ''
92 if ! exists $self->{overrideCJK};
93
94 # normalization form
95 $self->{normalization} = 'D'
96 if ! exists $self->{normalization};
97 $self->{UNF} = undef;
98
99 if (defined $self->{normalization}) {
100 eval { require Unicode::Normalize };
101 croak "Unicode/Normalize.pm is required to normalize strings: $@"
102 if $@;
103
104 Unicode::Normalize->import();
105 $getCombinClass = \&Unicode::Normalize::getCombinClass
106 if ! $getCombinClass;
107
108 $self->{UNF} =
109 $self->{normalization} =~ /^(?:NF)?C$/ ? \&NFC :
110 $self->{normalization} =~ /^(?:NF)?D$/ ? \&NFD :
111 $self->{normalization} =~ /^(?:NF)?KC$/ ? \&NFKC :
112 $self->{normalization} =~ /^(?:NF)?KD$/ ? \&NFKD :
113 croak "$PACKAGE unknown normalization form name: "
114 . $self->{normalization};
115 }
45394607 116
a7fbee98 117 # Open a table file.
118 # If undef is passed explicitly, no file is read.
119 $self->{table} = $KeyFile
120 if ! exists $self->{table};
121 $self->read_table
122 if defined $self->{table};
905aa9f0 123
a7fbee98 124 if ($self->{entry}) {
125 $self->parseEntry($_) foreach split /\n/, $self->{entry};
126 }
905aa9f0 127
a7fbee98 128 # backwards
129 $self->{backwards} ||= [ ];
130 $self->{backwards} = [ $self->{backwards} ]
131 if ! ref $self->{backwards};
905aa9f0 132
a7fbee98 133 # rearrange
134 $self->{rearrange} = $DefaultRearrange
135 if ! exists $self->{rearrange};
136 $self->{rearrange} = []
137 if ! defined $self->{rearrange};
138 croak "$PACKAGE: A list for rearrangement must be store in an ARRAYREF"
139 if ! ref $self->{rearrange};
140
141 # keys of $self->{rearrangeHash} are $self->{rearrange}.
142 $self->{rearrangeHash} = undef;
143
144 if (@{ $self->{rearrange} }) {
145 @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
146 }
147
148 return $self;
149}
905aa9f0 150
151sub read_table {
a7fbee98 152 my $self = shift;
153 my $file = $self->{table} ne '' ? $self->{table} : $KeyFile;
154
e69a2255 155 my $filepath = File::Spec->catfile($Path, $file);
156 open my $fk, "<$filepath"
157 or croak "File does not exist at $filepath";
a7fbee98 158
159 while (<$fk>) {
160 next if /^\s*#/;
161 if (/^\s*\@/) {
162 if (/^\@version\s*(\S*)/) {
163 $self->{version} ||= $1;
164 }
165 elsif (/^\@alternate\s+(.*)/) {
166 $self->{alternate} ||= $1;
167 }
168 elsif (/^\@backwards\s+(.*)/) {
169 push @{ $self->{backwards} }, $1;
170 }
171 elsif (/^\@rearrange\s+(.*)/) {
172 push @{ $self->{rearrange} }, _getHexArray($1);
173 }
174 next;
175 }
176 $self->parseEntry($_);
45394607 177 }
a7fbee98 178 close $fk;
45394607 179}
180
905aa9f0 181
45394607 182##
183## get $line, parse it, and write an entry in $self
184##
185sub parseEntry
186{
a7fbee98 187 my $self = shift;
188 my $line = shift;
189 my($name, $ele, @key);
190
191 return if $line !~ /^\s*[0-9A-Fa-f]/;
192
193 # removes comment and gets name
194 $name = $1
195 if $line =~ s/[#%]\s*(.*)//;
196 return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
197
198 # gets element
199 my($e, $k) = split /;/, $line;
200 croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
201 if ! $k;
202
203 my @e = _getHexArray($e);
204 $ele = pack('U*', @e);
205 return if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
206
207 # get sort key
208 if (defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/ ||
209 defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/)
210 {
211 $self->{entries}{$ele} = $self->{ignored}{$ele} = 1;
45394607 212 }
a7fbee98 213 else {
214 my $combining = 1; # primary = 0, secondary != 0;
215
216 foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
217 my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
218 push @key, $self->altCE($var, _getHexArray($arr));
219 $combining = 0 unless $key[-1][0] == 0 && $key[-1][1] != 0;
220 }
221 $self->{entries}{$ele} = \@key;
222 $self->{combining}{$ele} = 1 if $combining;
223 }
224 $self->{maxlength}{ord $ele} = scalar @e if @e > 1;
45394607 225}
226
227
228##
d16e9e3d 229## arrayref CE = altCE(bool variable?, list[num] weights)
45394607 230##
d16e9e3d 231sub altCE
45394607 232{
a7fbee98 233 my $self = shift;
234 my $var = shift;
235 my @c = @_;
236
237 $self->{alternate} eq 'blanked' ?
238 $var ? [0,0,0,$c[3]] : \@c :
239 $self->{alternate} eq 'non-ignorable' ?
240 \@c :
241 $self->{alternate} eq 'shifted' ?
242 $var ? [0,0,0,$c[0] ] : [ @c[0..2], $c[0]+$c[1]+$c[2] ? 0xFFFF : 0 ] :
243 $self->{alternate} eq 'shift-trimmed' ?
244 $var ? [0,0,0,$c[0] ] : [ @c[0..2], 0 ] :
245 croak "$PACKAGE unknown alternate name: $self->{alternate}";
45394607 246}
247
248##
d16e9e3d 249## string hex_sortkey = splitCE(string arg)
45394607 250##
251sub viewSortKey
252{
a7fbee98 253 my $self = shift;
254 my $key = $self->getSortKey(@_);
255 my $view = join " ", map sprintf("%04X", $_), unpack 'n*', $key;
256 $view =~ s/ ?0000 ?/|/g;
257 return "[$view]";
45394607 258}
259
d16e9e3d 260
45394607 261##
d16e9e3d 262## list[strings] elements = splitCE(string arg)
45394607 263##
d16e9e3d 264sub splitCE
45394607 265{
a7fbee98 266 my $self = shift;
267 my $code = $self->{preprocess};
268 my $norm = $self->{UNF};
269 my $ent = $self->{entries};
270 my $max = $self->{maxlength};
271 my $reH = $self->{rearrangeHash};
272
273 my $str = ref $code ? &$code(shift) : shift;
274 $str = &$norm($str) if ref $norm;
275
276 my @src = unpack('U*', $str);
277 my @buf;
278
279 # rearrangement
280 if ($reH) {
281 for (my $i = 0; $i < @src; $i++) {
282 if (exists $reH->{ $src[$i] } && $i + 1 < @src) {
283 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]);
284 $i++;
285 }
286 }
45394607 287 }
45394607 288
a7fbee98 289 for (my $i = 0; $i < @src; $i++) {
290 my $ch;
291 my $u = $src[$i];
292
293 # non-characters
294 next unless defined $u;
295 next if $u < 0 || 0x10FFFF < $u # out of range
296 || (0xD800 <= $u && $u <= 0xDFFF); # unpaired surrogates
297 my $four = $u & 0xFFFF;
298 next if $four == 0xFFFE || $four == 0xFFFF;
299
300 if ($max->{$u}) { # contract
301 for (my $j = $max->{$u}; $j >= 1; $j--) {
302 next unless $i+$j-1 < @src;
303 $ch = pack 'U*', @src[$i .. $i+$j-1];
304 $i += $j-1, last if $ent->{$ch};
305 }
306 } else {
307 $ch = pack('U', $u);
308 }
309
310 # with Combining Char (UTS#10, 4.2.1), here requires Unicode::Normalize.
311 if ($getCombinClass && defined $ch) {
312 for (my $j = $i+1; $j < @src; $j++) {
313 next unless defined $src[$j];
314 last unless $getCombinClass->( $src[$j] );
315 my $comb = pack 'U', $src[$j];
316 next if ! $ent->{ $ch.$comb };
317 $ch .= $comb;
318 $src[$j] = undef;
319 }
320 }
321 push @buf, $ch;
45394607 322 }
a7fbee98 323 wantarray ? @buf : \@buf;
d16e9e3d 324}
45394607 325
d16e9e3d 326
327##
328## list[arrayrefs] weight = getWt(string element)
329##
330sub getWt
331{
a7fbee98 332 my $self = shift;
333 my $ch = shift;
334 my $ent = $self->{entries};
335 my $ign = $self->{ignored};
336 my $cjk = $self->{overrideCJK};
337 my $hang = $self->{overrideHangul};
338
339 return if !defined $ch || $ign->{$ch}; # ignored
340 return @{ $ent->{$ch} } if $ent->{$ch};
341 my $u = unpack('U', $ch);
342
343 if (0xAC00 <= $u && $u <= 0xD7A3) { # is_Hangul
344 return $hang
345 ? &$hang($u)
346 : defined $hang
347 ? map({
348 my $v = $_;
349 my $ar = $ent->{pack('U', $v)};
350 $ar ? @$ar : map($self->altCE(0,@$_), _derivCE($v));
351 } _decompHangul($u))
352 : map($self->altCE(0,@$_), _derivCE($u));
353 }
354 elsif (0x3400 <= $u && $u <= 0x4DB5 ||
355 0x4E00 <= $u && $u <= 0x9FA5 ||
356 0x20000 <= $u && $u <= 0x2A6D6) { # is_CJK
357 return $cjk
358 ? &$cjk($u)
359 : defined $cjk && $u <= 0xFFFF
360 ? $self->altCE(0, ($u, 0x20, 0x02, $u))
361 : map($self->altCE(0,@$_), _derivCE($u));
362 }
363 else {
364 return map($self->altCE(0,@$_), _derivCE($u));
365 }
d16e9e3d 366}
367
368##
369## int = index(string, substring)
370##
371sub index
372{
a7fbee98 373 my $self = shift;
374 my $lev = $self->{level};
375 my $comb = $self->{combining};
376 my $str = $self->splitCE(shift);
377 my $sub = $self->splitCE(shift);
378
379 return wantarray ? (0,0) : 0 if ! @$sub;
380 return wantarray ? () : -1 if ! @$str;
381
382 my @subWt = grep _ignorableAtLevel($_,$lev),
383 map $self->getWt($_), @$sub;
384
385 my(@strWt,@strPt);
386 my $count = 0;
387 for (my $i = 0; $i < @$str; $i++) {
388 my $go_ahead = 0;
389
390 my @tmp = grep _ignorableAtLevel($_,$lev), $self->getWt($str->[$i]);
391 $go_ahead += length $str->[$i];
392
393 # /*XXX*/ still broken.
394 # index("e\x{300}", "e") should be 'no match' at level 2 or higher
395 # as "e\x{300}" is a *single* grapheme cluster and not equal to "e".
396
397 # go ahead as far as we find a combining character;
398 while ($i + 1 < @$str &&
399 (! defined $str->[$i+1] || $comb->{ $str->[$i+1] }) ) {
400 $i++;
401 $go_ahead += length $str->[$i];
402 next if ! defined $str->[$i];
403 push @tmp,
404 grep _ignorableAtLevel($_,$lev), $self->getWt($str->[$i]);
405 }
406
407 push @strWt, @tmp;
408 push @strPt, ($count) x @tmp;
409 $count += $go_ahead;
410
411 while (@strWt >= @subWt) {
412 if (_eqArray(\@strWt, \@subWt, $lev)) {
413 my $pos = $strPt[0];
414 return wantarray ? ($pos, $count-$pos) : $pos;
415 }
416 shift @strWt;
417 shift @strPt;
418 }
d16e9e3d 419 }
a7fbee98 420 return wantarray ? () : -1;
d16e9e3d 421}
422
423##
424## bool _eqArray(arrayref, arrayref, level)
425##
426sub _eqArray($$$)
427{
a7fbee98 428 my $a = shift; # length $a >= length $b;
429 my $b = shift;
430 my $lev = shift;
431 for my $v (0..$lev-1) {
432 for my $c (0..@$b-1){
433 return if $a->[$c][$v] != $b->[$c][$v];
434 }
d16e9e3d 435 }
a7fbee98 436 return 1;
d16e9e3d 437}
438
439
440##
441## bool _ignorableAtLevel(CE, level)
442##
443sub _ignorableAtLevel($$)
444{
a7fbee98 445 my $ce = shift;
446 return unless defined $ce;
447 my $lv = shift;
448 return ! grep { ! $ce->[$_] } 0..$lv-1;
d16e9e3d 449}
450
451
452##
453## string sortkey = getSortKey(string arg)
454##
455sub getSortKey
456{
a7fbee98 457 my $self = shift;
458 my $lev = $self->{level};
459 my $rCE = $self->splitCE(shift); # get an arrayref
460
461 # weight arrays
462 my @buf = grep defined(), map $self->getWt($_), @$rCE;
463
464 # make sort key
465 my @ret = ([],[],[],[]);
466 foreach my $v (0..$lev-1) {
467 foreach my $b (@buf) {
468 push @{ $ret[$v] }, $b->[$v] if $b->[$v];
469 }
470 }
471 foreach (@{ $self->{backwards} }) {
472 my $v = $_ - 1;
473 @{ $ret[$v] } = reverse @{ $ret[$v] };
45394607 474 }
45394607 475
a7fbee98 476 # modification of tertiary weights
477 if ($self->{upper_before_lower}) {
478 foreach (@{ $ret[2] }) {
479 if (0x8 <= $_ && $_ <= 0xC) { $_ -= 6 } # lower
480 elsif (0x2 <= $_ && $_ <= 0x6) { $_ += 6 } # upper
481 elsif ($_ == 0x1C) { $_ += 1 } # square upper
482 elsif ($_ == 0x1D) { $_ -= 1 } # square lower
483 }
45394607 484 }
a7fbee98 485 if ($self->{katakana_before_hiragana}) {
486 foreach (@{ $ret[2] }) {
487 if (0x0F <= $_ && $_ <= 0x13) { $_ -= 2 } # katakana
488 elsif (0x0D <= $_ && $_ <= 0x0E) { $_ += 5 } # hiragana
489 }
45394607 490 }
a7fbee98 491 join "\0\0", map pack('n*', @$_), @ret;
45394607 492}
493
494
495##
d16e9e3d 496## int compare = cmp(string a, string b)
45394607 497##
5398038e 498sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
499sub eq { $_[0]->getSortKey($_[1]) eq $_[0]->getSortKey($_[2]) }
500sub ne { $_[0]->getSortKey($_[1]) ne $_[0]->getSortKey($_[2]) }
501sub lt { $_[0]->getSortKey($_[1]) lt $_[0]->getSortKey($_[2]) }
502sub le { $_[0]->getSortKey($_[1]) le $_[0]->getSortKey($_[2]) }
503sub gt { $_[0]->getSortKey($_[1]) gt $_[0]->getSortKey($_[2]) }
504sub ge { $_[0]->getSortKey($_[1]) ge $_[0]->getSortKey($_[2]) }
45394607 505
506##
d16e9e3d 507## list[strings] sorted = sort(list[strings] arg)
45394607 508##
a7fbee98 509sub sort {
510 my $obj = shift;
511 return
512 map { $_->[1] }
513 sort{ $a->[0] cmp $b->[0] }
514 map [ $obj->getSortKey($_), $_ ], @_;
45394607 515}
516
517##
d16e9e3d 518## list[arrayrefs] CE = _derivCE(int codepoint)
45394607 519##
a7fbee98 520sub _derivCE {
521 my $code = shift;
522 my $a = UNDEFINED + ($code >> 15); # ok
523 my $b = ($code & 0x7FFF) | 0x8000; # ok
524# my $a = 0xFFC2 + ($code >> 15); # ng
525# my $b = $code & 0x7FFF | 0x1000; # ng
526 $b ? ([$a,2,1,$code],[$b,0,0,$code]) : [$a,2,1,$code];
45394607 527}
528
529##
530## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
531##
a7fbee98 532sub _getHexArray { map hex, $_[0] =~ /([0-9a-fA-F]+)/g }
5398038e 533
a7fbee98 534#
535# $code must be in Hangul syllable.
536# Check it before you enter here.
537#
5398038e 538sub _decompHangul {
539 my $code = shift;
5398038e 540 my $SIndex = $code - 0xAC00;
541 my $LIndex = int( $SIndex / 588);
542 my $VIndex = int(($SIndex % 588) / 28);
543 my $TIndex = $SIndex % 28;
544 return (
a7fbee98 545 0x1100 + $LIndex,
546 0x1161 + $VIndex,
547 $TIndex ? (0x11A7 + $TIndex) : (),
5398038e 548 );
45394607 549}
550
5511;
552__END__
553
554=head1 NAME
555
a7fbee98 556Unicode::Collate - Unicode Collation Algorithm
45394607 557
558=head1 SYNOPSIS
559
560 use Unicode::Collate;
561
562 #construct
5398038e 563 $Collator = Unicode::Collate->new(%tailoring);
45394607 564
565 #sort
5398038e 566 @sorted = $Collator->sort(@not_sorted);
45394607 567
568 #compare
a7fbee98 569 $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
45394607 570
571=head1 DESCRIPTION
572
573=head2 Constructor and Tailoring
574
d16e9e3d 575The C<new> method returns a collator object.
576
5398038e 577 $Collator = Unicode::Collate->new(
45394607 578 alternate => $alternate,
579 backwards => $levelNumber, # or \@levelNumbers
580 entry => $element,
581 normalization => $normalization_form,
582 ignoreName => qr/$ignoreName/,
583 ignoreChar => qr/$ignoreChar/,
584 katakana_before_hiragana => $bool,
585 level => $collationLevel,
586 overrideCJK => \&overrideCJK,
587 overrideHangul => \&overrideHangul,
588 preprocess => \&preprocess,
589 rearrange => \@charList,
590 table => $filename,
591 undefName => qr/$undefName/,
592 undefChar => qr/$undefChar/,
593 upper_before_lower => $bool,
594 );
a7fbee98 595 # if %tailoring is false (i.e. empty),
5398038e 596 # $Collator should do the default collation.
45394607 597
598=over 4
599
600=item alternate
601
602-- see 3.2.2 Alternate Weighting, UTR #10.
603
a7fbee98 604This key allows to alternate weighting for variable collation elements,
605which are marked with an ASTERISK in the table
606(NOTE: Many punction marks and symbols are variable in F<allkeys.txt>).
607
608 alternate => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
45394607 609
a7fbee98 610These names are case-insensitive.
45394607 611By default (if specification is omitted), 'shifted' is adopted.
612
a7fbee98 613 'Blanked' Variable elements are ignorable at levels 1 through 3;
614 considered at the 4th level.
615
616 'Non-ignorable' Variable elements are not reset to ignorable.
617
618 'Shifted' Variable elements are ignorable at levels 1 through 3
619 their level 4 weight is replaced by the old level 1 weight.
620 Level 4 weight for Non-Variable elements is 0xFFFF.
621
622 'Shift-Trimmed' Same as 'shifted', but all FFFF's at the 4th level
623 are trimmed.
624
45394607 625=item backwards
626
627-- see 3.1.2 French Accents, UTR #10.
628
629 backwards => $levelNumber or \@levelNumbers
630
631Weights in reverse order; ex. level 2 (diacritic ordering) in French.
632If omitted, forwards at all the levels.
633
634=item entry
635
636-- see 3.1 Linguistic Features; 3.2.1 File Format, UTR #10.
637
a7fbee98 638Overrides a default order or defines additional collation elements
45394607 639
640 entry => <<'ENTRIES', # use the UCA file format
a7fbee98 64100E6 ; [.0861.0020.0002.00E6] [.08B1.0020.0002.00E6] # ligature <ae> as <a><e>
45394607 6420063 0068 ; [.0893.0020.0002.0063] # "ch" in traditional Spanish
6430043 0068 ; [.0893.0020.0008.0043] # "Ch" in traditional Spanish
644ENTRIES
645
646=item ignoreName
647
648=item ignoreChar
649
650-- see Completely Ignorable, 3.2.2 Alternate Weighting, UTR #10.
651
a7fbee98 652Makes the entry in the table ignorable.
653If a collation element is ignorable,
45394607 654it is ignored as if the element had been deleted from there.
655
a7fbee98 656E.g. when 'a' and 'e' are ignorable,
45394607 657'element' is equal to 'lament' (or 'lmnt').
658
659=item level
660
661-- see 4.3 Form a sort key for each string, UTR #10.
662
663Set the maximum level.
664Any higher levels than the specified one are ignored.
665
666 Level 1: alphabetic ordering
667 Level 2: diacritic ordering
668 Level 3: case ordering
669 Level 4: tie-breaking (e.g. in the case when alternate is 'shifted')
670
671 ex.level => 2,
672
a7fbee98 673If omitted, the maximum is the 4th.
674
45394607 675=item normalization
676
677-- see 4.1 Normalize each input string, UTR #10.
678
905aa9f0 679If specified, strings are normalized before preparation of sort keys
45394607 680(the normalization is executed after preprocess).
681
682As a form name, one of the following names must be used.
683
684 'C' or 'NFC' for Normalization Form C
685 'D' or 'NFD' for Normalization Form D
686 'KC' or 'NFKC' for Normalization Form KC
687 'KD' or 'NFKD' for Normalization Form KD
688
689If omitted, the string is put into Normalization Form D.
690
a7fbee98 691If C<undef> is passed explicitly as the value for this key,
45394607 692any normalization is not carried out (this may make tailoring easier
693if any normalization is not desired).
694
695see B<CAVEAT>.
696
697=item overrideCJK
698
45394607 699-- see 7.1 Derived Collation Elements, UTR #10.
700
701By default, mapping of CJK Unified Ideographs
a7fbee98 702uses the Unicode codepoint order.
703But the mapping of CJK Unified Ideographs may be overrided.
45394607 704
a7fbee98 705ex. CJK Unified Ideographs in the JIS code point order.
45394607 706
707 overrideCJK => sub {
a7fbee98 708 my $u = shift; # get a Unicode codepoint
709 my $b = pack('n', $u); # to UTF-16BE
710 my $s = your_unicode_to_sjis_converter($b); # convert
711 my $n = unpack('n', $s); # convert sjis to short
712 [ $n, 0x20, 0x2, $u ]; # return the collation element
45394607 713 },
714
a7fbee98 715ex. ignores all CJK Unified Ideographs.
716
717 overrideCJK => sub {()}, # CODEREF returning empty list
718
719 # where ->eq("Pe\x{4E00}rl", "Perl") is true
720 # as U+4E00 is a CJK Unified Ideograph and to be ignorable.
721
722If C<undef> is passed explicitly as the value for this key,
723weights for CJK Unified Ideographs are treated as undefined.
724But assignment of weight for CJK Unified Ideographs
725in table or L<entry> is still valid.
726
727=item overrideHangul
728
729-- see 7.1 Derived Collation Elements, UTR #10.
730
731By default, Hangul Syllables are decomposed into Hangul Jamo.
732But the mapping of Hangul Syllables may be overrided.
733
734This tag works like L<overrideCJK>, so see there for examples.
735
45394607 736If you want to override the mapping of Hangul Syllables,
737the Normalization Forms D and KD are not appropriate
738(they will be decomposed before overriding).
739
a7fbee98 740If C<undef> is passed explicitly as the value for this key,
741weight for Hangul Syllables is treated as undefined
742without decomposition into Hangul Jamo.
743But definition of weight for Hangul Syllables
744in table or L<entry> is still valid.
745
45394607 746=item preprocess
747
748-- see 5.1 Preprocessing, UTR #10.
749
750If specified, the coderef is used to preprocess
751before the formation of sort keys.
752
a7fbee98 753ex. dropping English articles, such as "a" or "the".
45394607 754Then, "the pen" is before "a pencil".
755
756 preprocess => sub {
757 my $str = shift;
a7fbee98 758 $str =~ s/\b(?:an?|the)\s+//gi;
45394607 759 $str;
760 },
761
762=item rearrange
763
764-- see 3.1.3 Rearrangement, UTR #10.
765
766Characters that are not coded in logical order and to be rearranged.
a7fbee98 767By default,
45394607 768
769 rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
770
a7fbee98 771If you want to disallow any rearrangement,
772pass C<undef> or C<[]> (a reference to an empty list)
773as the value for this key.
774
45394607 775=item table
776
777-- see 3.2 Default Unicode Collation Element Table, UTR #10.
778
779You can use another element table if desired.
780The table file must be in your C<lib/Unicode/Collate> directory.
781
782By default, the file C<lib/Unicode/Collate/allkeys.txt> is used.
783
a7fbee98 784If C<undef> is passed explicitly as the value for this key,
785no file is read (but you can define collation elements via L<entry>).
786
787A typical way to define a collation element table
788without any file of table:
789
790 $onlyABC = Unicode::Collate->new(
791 table => undef,
792 entry => << 'ENTRIES',
7930061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A
7940041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A
7950062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B
7960042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B
7970063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C
7980043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C
799ENTRIES
800 );
905aa9f0 801
45394607 802=item undefName
803
804=item undefChar
805
806-- see 6.3.4 Reducing the Repertoire, UTR #10.
807
808Undefines the collation element as if it were unassigned in the table.
809This reduces the size of the table.
810If an unassigned character appears in the string to be collated,
811the sort key is made from its codepoint
812as a single-character collation element,
813as it is greater than any other assigned collation elements
814(in the codepoint order among the unassigned characters).
815But, it'd be better to ignore characters
816unfamiliar to you and maybe never used.
817
818=item katakana_before_hiragana
819
820=item upper_before_lower
821
822-- see 6.6 Case Comparisons; 7.3.1 Tertiary Weight Table, UTR #10.
823
824By default, lowercase is before uppercase
825and hiragana is before katakana.
826
a7fbee98 827If the tag is made true, this is reversed.
828
829B<NOTE>: These tags simplemindedly assume
830any lowercase/uppercase or hiragana/katakana distinctions
831should occur in level 3, and their weights at level 3
832should be same as those mentioned in 7.3.1, UTR #10.
833If you define your collation elements which violates this,
834these tags doesn't work validly.
45394607 835
836=back
837
3164dd77 838=head2 Methods for Collation
45394607 839
840=over 4
841
5398038e 842=item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
45394607 843
844Sorts a list of strings.
845
5398038e 846=item C<$result = $Collator-E<gt>cmp($a, $b)>
45394607 847
848Returns 1 (when C<$a> is greater than C<$b>)
849or 0 (when C<$a> is equal to C<$b>)
850or -1 (when C<$a> is lesser than C<$b>).
851
5398038e 852=item C<$result = $Collator-E<gt>eq($a, $b)>
853
854=item C<$result = $Collator-E<gt>ne($a, $b)>
855
856=item C<$result = $Collator-E<gt>lt($a, $b)>
857
858=item C<$result = $Collator-E<gt>le($a, $b)>
859
860=item C<$result = $Collator-E<gt>gt($a, $b)>
861
862=item C<$result = $Collator-E<gt>ge($a, $b)>
863
a7fbee98 864They works like the same name operators as theirs.
5398038e 865
866 eq : whether $a is equal to $b.
867 ne : whether $a is not equal to $b.
868 lt : whether $a is lesser than $b.
869 le : whether $a is lesser than $b or equal to $b.
870 gt : whether $a is greater than $b.
871 ge : whether $a is greater than $b or equal to $b.
872
873=item C<$sortKey = $Collator-E<gt>getSortKey($string)>
45394607 874
875-- see 4.3 Form a sort key for each string, UTR #10.
876
877Returns a sort key.
878
879You compare the sort keys using a binary comparison
880and get the result of the comparison of the strings using UCA.
881
5398038e 882 $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
45394607 883
884 is equivalent to
885
5398038e 886 $Collator->cmp($a, $b)
45394607 887
a7fbee98 888=item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)>
889
890Returns a string formalized to display a sort key.
891Weights are enclosed with C<'['> and C<']'>
892and level boundaries are denoted by C<'|'>.
893
894 use Unicode::Collate;
895 my $c = Unicode::Collate->new();
896 print $c->viewSortKey("Perl"),"\n";
897
898 # output:
899 # [09B3 08B1 09CB 094F|0020 0020 0020 0020|0008 0002 0002 0002|FFFF FFFF FFFF FFFF]
900 # Level 1 Level 2 Level 3 Level 4
901
5398038e 902=item C<$position = $Collator-E<gt>index($string, $substring)>
d16e9e3d 903
5398038e 904=item C<($position, $length) = $Collator-E<gt>index($string, $substring)>
d16e9e3d 905
906-- see 6.8 Searching, UTR #10.
907
908If C<$substring> matches a part of C<$string>, returns
909the position of the first occurrence of the matching part in scalar context;
910in list context, returns a two-element list of
911the position and the length of the matching part.
912
913B<Notice> that the length of the matching part may differ from
914the length of C<$substring>.
915
916B<Note> that the position and the length are counted on the string
917after the process of preprocess, normalization, and rearrangement.
918Therefore, in case the specified string is not binary equal to
919the preprocessed/normalized/rearranged string, the position and the length
a7fbee98 920may differ form those on the specified string. But it is guaranteed
d16e9e3d 921that, if matched, it returns a non-negative value as C<$position>.
922
923If C<$substring> does not match any part of C<$string>,
924returns C<-1> in scalar context and
925an empty list in list context.
926
927e.g. you say
928
5398038e 929 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
d16e9e3d 930 my $str = "Ich mu\x{00DF} studieren.";
931 my $sub = "m\x{00FC}ss";
932 my $match;
a7fbee98 933 if (my($pos,$len) = $Collator->index($str, $sub)) {
5398038e 934 $match = substr($str, $pos, $len);
d16e9e3d 935 }
936
937and get C<"mu\x{00DF}"> in C<$match> since C<"mu>E<223>C<">
938is primary equal to C<"m>E<252>C<ss">.
939
45394607 940=back
941
3164dd77 942=head2 Other Methods
943
944=over 4
945
946=item UCA_Version
947
948Returns the version number of Unicode Technical Standard 10
949this module consults.
950
951=item Base_Unicode_Version
952
953Returns the version number of the Unicode Standard
954this module is based on.
955
956=back
957
45394607 958=head2 EXPORT
959
960None by default.
961
4a2e806c 962=head2 TODO
963
964Unicode::Collate has not been ported to EBCDIC. The code mostly would
965work just fine but a decision needs to be made: how the module should
966work in EBCDIC? Should the low 256 characters be understood as
967Unicode or as EBCDIC code points? Should one be chosen or should
968there be a way to do either? Or should such translation be left
969outside the module for the user to do, for example by using
970Encode::from_to()?
a7fbee98 971(or utf8::unicode_to_native()/utf8::native_to_unicode()?)
4a2e806c 972
45394607 973=head2 CAVEAT
974
975Use of the C<normalization> parameter requires
976the B<Unicode::Normalize> module.
977
5398038e 978If you need not it (say, in the case when you need not
45394607 979handle any combining characters),
980assign C<normalization =E<gt> undef> explicitly.
981
5398038e 982-- see 6.5 Avoiding Normalization, UTR #10.
983
a7fbee98 984=head2 BUGS
985
986C<index()> is an experimental method and
987its return value may be unreliable.
988The correct implementation for C<index()> must be based
989on Locale-Sensitive Support: Level 3 in UTR #18,
990F<Unicode Regular Expression Guidelines>.
991
992See also 4.2 Locale-Dependent Graphemes in UTR #18.
993
45394607 994=head1 AUTHOR
995
996SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
997
998 http://homepage1.nifty.com/nomenclator/perl/
999
a7fbee98 1000 Copyright(C) 2001-2002, SADAHIRO Tomoyuki. Japan. All rights reserved.
45394607 1001
a7fbee98 1002 This library is free software; you can redistribute it
1003 and/or modify it under the same terms as Perl itself.
45394607 1004
1005=head1 SEE ALSO
1006
1007=over 4
1008
a7fbee98 1009=item http://www.unicode.org/unicode/reports/tr10/
45394607 1010
a7fbee98 1011Unicode Collation Algorithm - UTR #10
45394607 1012
a7fbee98 1013=item http://www.unicode.org/unicode/reports/tr10/allkeys.txt
1014
1015The Default Unicode Collation Element Table
45394607 1016
a7fbee98 1017=item http://www.unicode.org/unicode/reports/tr15/
1018
1019Unicode Normalization Forms - UAX #15
1020
1021=item http://www.unicode.org/unicode/reports/tr18
1022
1023Unicode Regular Expression Guidelines - UTR #18
1024
1025=item L<Unicode::Normalize>
45394607 1026
45394607 1027=back
1028
1029=cut