fix test failure from #17747
[p5sagit/p5-mst-13.2.git] / lib / Unicode / Collate.pm
CommitLineData
45394607 1package Unicode::Collate;
2
4a2e806c 3BEGIN {
4 if (ord("A") == 193) {
5 die "Unicode::Collate not ported to EBCDIC\n";
6 }
7}
8
45394607 9use 5.006;
10use strict;
11use warnings;
12use Carp;
e69a2255 13use File::Spec;
5398038e 14
45394607 15require Exporter;
16
0116f5dc 17our $VERSION = '0.20';
45394607 18our $PACKAGE = __PACKAGE__;
19
20our @ISA = qw(Exporter);
21
22our %EXPORT_TAGS = ();
23our @EXPORT_OK = ();
24our @EXPORT = ();
25
26(our $Path = $INC{'Unicode/Collate.pm'}) =~ s/\.pm$//;
27our $KeyFile = "allkeys.txt";
28
3164dd77 29our $UNICODE_VERSION;
30
327745dc 31eval { require Unicode::UCD };
32
33unless ($@) {
34 $UNICODE_VERSION = Unicode::UCD::UnicodeVersion();
35}
36else { # XXX, Perl 5.6.1
3164dd77 37 my($f, $fh);
38 foreach my $d (@INC) {
3164dd77 39 $f = File::Spec->catfile($d, "unicode", "Unicode.301");
40 if (open($fh, $f)) {
327745dc 41 $UNICODE_VERSION = '3.0.1';
3164dd77 42 close $fh;
43 last;
44 }
45 }
46}
47
5398038e 48our $getCombinClass; # coderef for combining class from Unicode::Normalize
45394607 49
0116f5dc 50use constant Min2 => 0x20; # minimum weight at level 2
51use constant Min3 => 0x02; # minimum weight at level 3
a7fbee98 52
0116f5dc 53# format for pack
54use constant VCE_FORMAT => 'Cn4'; # for variable + CE with 4 levels
3164dd77 55
0116f5dc 56# values of variable
57use constant NON_VAR => 0; # Non-Variable character
58use constant VAR => 1; # Variable character
3164dd77 59
0116f5dc 60our $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
a7fbee98 61
0116f5dc 62sub UCA_Version { "9" }
a7fbee98 63
0116f5dc 64sub Base_Unicode_Version { $UNICODE_VERSION || 'unknown' }
a7fbee98 65
0116f5dc 66my (%AlternateOK);
67@AlternateOK{ qw/
68 blanked non-ignorable shifted shift-trimmed
69 / } = ();
70
71our @ChangeOK = qw/
72 alternate backwards level normalization rearrange
73 katakana_before_hiragana upper_before_lower
74 overrideHangul overrideCJK preprocess UCA_Version
75 /;
76
77our @ChangeNG = qw/
78 entry entries table ignored combining maxlength
79 ignoreChar ignoreName undefChar undefName
80 versionTable alternateTable backwardsTable forwardsTable rearrangeTable
81 derivCode normCode rearrangeHash isShift L3ignorable
82 /;
83
84my (%ChangeOK, %ChangeNG);
85@ChangeOK{ @ChangeOK } = ();
86@ChangeNG{ @ChangeNG } = ();
87
88sub change {
89 my $self = shift;
90 my %hash = @_;
91 my %old;
92 foreach my $k (keys %hash) {
93 if (exists $ChangeOK{$k}) {
94 $old{$k} = $self->{$k};
95 $self->{$k} = $hash{$k};
96 }
97 elsif (exists $ChangeNG{$k}) {
98 croak "change of $k via change() is not allowed!";
99 }
100 # else => ignored
101 }
102 $self->checkCollator;
103 return wantarray ? %old : $self;
104}
a7fbee98 105
0116f5dc 106sub checkCollator {
107 my $self = shift;
a7fbee98 108 croak "Illegal level lower than 1 (passed $self->{level})."
109 if $self->{level} < 1;
110 croak "A level higher than 4 (passed $self->{level}) is not supported."
111 if 4 < $self->{level};
112
0116f5dc 113 $self->{derivCode} =
114 $self->{UCA_Version} == -1 ? \&broken_derivCE :
115 $self->{UCA_Version} == 8 ? \&derivCE_8 :
116 $self->{UCA_Version} == 9 ? \&derivCE_9 :
117 croak "Illegal UCA version (passed $self->{UCA_Version}).";
a7fbee98 118
0116f5dc 119 $self->{alternate} = lc($self->{alternate});
120 croak "$PACKAGE unknown alternate tag name: $self->{alternate}"
121 unless exists $AlternateOK{ $self->{alternate} };
122
123 $self->{isShift} = $self->{alternate} eq 'shifted' ||
124 $self->{alternate} eq 'shift-trimmed';
125
126 $self->{backwards} = []
127 if ! defined $self->{backwards};
128 $self->{backwards} = [ $self->{backwards} ]
129 if ! ref $self->{backwards};
130
131 $self->{rearrange} = []
132 if ! defined $self->{rearrange};
133 croak "$PACKAGE: A list for rearrangement must be store in an ARRAYREF"
134 if ! ref $self->{rearrange};
135
136 # keys of $self->{rearrangeHash} are $self->{rearrange}.
137 $self->{rearrangeHash} = undef;
138
139 if (@{ $self->{rearrange} }) {
140 @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
141 }
142
143 $self->{normCode} = undef;
a7fbee98 144
145 if (defined $self->{normalization}) {
146 eval { require Unicode::Normalize };
147 croak "Unicode/Normalize.pm is required to normalize strings: $@"
148 if $@;
149
150 Unicode::Normalize->import();
151 $getCombinClass = \&Unicode::Normalize::getCombinClass
152 if ! $getCombinClass;
153
0116f5dc 154 $self->{normCode} =
a7fbee98 155 $self->{normalization} =~ /^(?:NF)?C$/ ? \&NFC :
156 $self->{normalization} =~ /^(?:NF)?D$/ ? \&NFD :
157 $self->{normalization} =~ /^(?:NF)?KC$/ ? \&NFKC :
158 $self->{normalization} =~ /^(?:NF)?KD$/ ? \&NFKD :
159 croak "$PACKAGE unknown normalization form name: "
160 . $self->{normalization};
161 }
0116f5dc 162 return;
163}
164
165sub new
166{
167 my $class = shift;
168 my $self = bless { @_ }, $class;
45394607 169
a7fbee98 170 # If undef is passed explicitly, no file is read.
0116f5dc 171 $self->{table} = $KeyFile if ! exists $self->{table};
172 $self->read_table if defined $self->{table};
905aa9f0 173
a7fbee98 174 if ($self->{entry}) {
175 $self->parseEntry($_) foreach split /\n/, $self->{entry};
176 }
905aa9f0 177
0116f5dc 178 $self->{level} ||= 4;
179 $self->{UCA_Version} ||= UCA_Version();
905aa9f0 180
0116f5dc 181 $self->{overrideHangul} = ''
182 if ! exists $self->{overrideHangul};
183 $self->{overrideCJK} = ''
184 if ! exists $self->{overrideCJK};
185 $self->{normalization} = 'D'
186 if ! exists $self->{normalization};
187 $self->{alternate} = $self->{alternateTable} || 'shifted'
188 if ! exists $self->{alternate};
189 $self->{rearrange} = $self->{rearrangeTable} || $DefaultRearrange
a7fbee98 190 if ! exists $self->{rearrange};
0116f5dc 191 $self->{backwards} = $self->{backwardsTable}
192 if ! exists $self->{backwards};
a7fbee98 193
0116f5dc 194 $self->checkCollator;
a7fbee98 195
196 return $self;
197}
905aa9f0 198
199sub read_table {
a7fbee98 200 my $self = shift;
201 my $file = $self->{table} ne '' ? $self->{table} : $KeyFile;
202
e69a2255 203 my $filepath = File::Spec->catfile($Path, $file);
204 open my $fk, "<$filepath"
205 or croak "File does not exist at $filepath";
a7fbee98 206
207 while (<$fk>) {
208 next if /^\s*#/;
209 if (/^\s*\@/) {
0116f5dc 210 if (/^\s*\@version\s*(\S*)/) {
211 $self->{versionTable} ||= $1;
212 }
213 elsif (/^\s*\@alternate\s+(\S*)/) {
214 $self->{alternateTable} ||= $1;
a7fbee98 215 }
0116f5dc 216 elsif (/^\s*\@backwards\s+(\S*)/) {
217 push @{ $self->{backwardsTable} }, $1;
a7fbee98 218 }
0116f5dc 219 elsif (/^\s*\@forwards\s+(\S*)/) { # parhaps no use
220 push @{ $self->{forwardsTable} }, $1;
a7fbee98 221 }
0116f5dc 222 elsif (/^\s*\@rearrange\s+(.*)/) { # (\S*) is NG
223 push @{ $self->{rearrangeTable} }, _getHexArray($1);
a7fbee98 224 }
225 next;
226 }
227 $self->parseEntry($_);
45394607 228 }
a7fbee98 229 close $fk;
45394607 230}
231
905aa9f0 232
45394607 233##
234## get $line, parse it, and write an entry in $self
235##
236sub parseEntry
237{
a7fbee98 238 my $self = shift;
239 my $line = shift;
240 my($name, $ele, @key);
241
242 return if $line !~ /^\s*[0-9A-Fa-f]/;
243
244 # removes comment and gets name
245 $name = $1
246 if $line =~ s/[#%]\s*(.*)//;
247 return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
248
249 # gets element
250 my($e, $k) = split /;/, $line;
251 croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
252 if ! $k;
253
254 my @e = _getHexArray($e);
0116f5dc 255 return if !@e;
256
a7fbee98 257 $ele = pack('U*', @e);
258 return if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
259
260 # get sort key
261 if (defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/ ||
262 defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/)
263 {
264 $self->{entries}{$ele} = $self->{ignored}{$ele} = 1;
45394607 265 }
a7fbee98 266 else {
267 my $combining = 1; # primary = 0, secondary != 0;
0116f5dc 268 my $level3ingore;
a7fbee98 269
270 foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
271 my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
0116f5dc 272 my @arr = _getHexArray($arr);
273 push @key, pack(VCE_FORMAT, $var, @arr);
274 $combining = 0 unless $arr[0] == 0 && $arr[1] != 0;
275 $level3ingore = 1 if $arr[0] == 0 && $arr[1] == 0 && $arr[2] == 0;
a7fbee98 276 }
277 $self->{entries}{$ele} = \@key;
0116f5dc 278
279 $self->{combining}{$ele} = 1
280 if $combining;
281
282 $self->{L3ignorable}{$e[0]} = 1
283 if @e == 1 && $level3ingore;
a7fbee98 284 }
285 $self->{maxlength}{ord $ele} = scalar @e if @e > 1;
45394607 286}
287
45394607 288##
d16e9e3d 289## arrayref CE = altCE(bool variable?, list[num] weights)
45394607 290##
d16e9e3d 291sub altCE
45394607 292{
a7fbee98 293 my $self = shift;
0116f5dc 294 my($var, @c) = unpack(VCE_FORMAT, shift);
a7fbee98 295
296 $self->{alternate} eq 'blanked' ?
297 $var ? [0,0,0,$c[3]] : \@c :
298 $self->{alternate} eq 'non-ignorable' ?
299 \@c :
300 $self->{alternate} eq 'shifted' ?
301 $var ? [0,0,0,$c[0] ] : [ @c[0..2], $c[0]+$c[1]+$c[2] ? 0xFFFF : 0 ] :
302 $self->{alternate} eq 'shift-trimmed' ?
303 $var ? [0,0,0,$c[0] ] : [ @c[0..2], 0 ] :
304 croak "$PACKAGE unknown alternate name: $self->{alternate}";
45394607 305}
306
45394607 307sub viewSortKey
308{
a7fbee98 309 my $self = shift;
0116f5dc 310 my $ver = $self->{UCA_Version};
311
a7fbee98 312 my $key = $self->getSortKey(@_);
313 my $view = join " ", map sprintf("%04X", $_), unpack 'n*', $key;
0116f5dc 314 if ($ver <= 8) {
315 $view =~ s/ ?0000 ?/|/g;
316 } else {
317 $view =~ s/\b0000\b/|/g;
318 }
a7fbee98 319 return "[$view]";
45394607 320}
321
d16e9e3d 322
45394607 323##
d16e9e3d 324## list[strings] elements = splitCE(string arg)
45394607 325##
d16e9e3d 326sub splitCE
45394607 327{
a7fbee98 328 my $self = shift;
329 my $code = $self->{preprocess};
0116f5dc 330 my $norm = $self->{normCode};
a7fbee98 331 my $ent = $self->{entries};
332 my $max = $self->{maxlength};
333 my $reH = $self->{rearrangeHash};
0116f5dc 334 my $L3i = $self->{L3ignorable};
335 my $ver9 = $self->{UCA_Version} > 8;
a7fbee98 336
337 my $str = ref $code ? &$code(shift) : shift;
338 $str = &$norm($str) if ref $norm;
339
340 my @src = unpack('U*', $str);
341 my @buf;
342
343 # rearrangement
344 if ($reH) {
345 for (my $i = 0; $i < @src; $i++) {
346 if (exists $reH->{ $src[$i] } && $i + 1 < @src) {
347 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]);
348 $i++;
349 }
350 }
45394607 351 }
45394607 352
0116f5dc 353 if ($ver9) {
354 @src = grep ! $L3i->{$_}, @src;
355 }
356
a7fbee98 357 for (my $i = 0; $i < @src; $i++) {
358 my $ch;
359 my $u = $src[$i];
360
361 # non-characters
362 next unless defined $u;
363 next if $u < 0 || 0x10FFFF < $u # out of range
0116f5dc 364 || (0xD800 <= $u && $u <= 0xDFFF) # unpaired surrogates
365 || (0xFDD0 <= $u && $u <= 0xFDEF) # non-character
366 ;
367
a7fbee98 368 my $four = $u & 0xFFFF;
369 next if $four == 0xFFFE || $four == 0xFFFF;
370
371 if ($max->{$u}) { # contract
372 for (my $j = $max->{$u}; $j >= 1; $j--) {
373 next unless $i+$j-1 < @src;
374 $ch = pack 'U*', @src[$i .. $i+$j-1];
375 $i += $j-1, last if $ent->{$ch};
376 }
377 } else {
378 $ch = pack('U', $u);
379 }
380
381 # with Combining Char (UTS#10, 4.2.1), here requires Unicode::Normalize.
382 if ($getCombinClass && defined $ch) {
383 for (my $j = $i+1; $j < @src; $j++) {
384 next unless defined $src[$j];
385 last unless $getCombinClass->( $src[$j] );
386 my $comb = pack 'U', $src[$j];
387 next if ! $ent->{ $ch.$comb };
388 $ch .= $comb;
389 $src[$j] = undef;
390 }
391 }
392 push @buf, $ch;
45394607 393 }
a7fbee98 394 wantarray ? @buf : \@buf;
d16e9e3d 395}
45394607 396
d16e9e3d 397
398##
399## list[arrayrefs] weight = getWt(string element)
400##
401sub getWt
402{
a7fbee98 403 my $self = shift;
404 my $ch = shift;
405 my $ent = $self->{entries};
406 my $ign = $self->{ignored};
407 my $cjk = $self->{overrideCJK};
408 my $hang = $self->{overrideHangul};
0116f5dc 409 my $der = $self->{derivCode};
a7fbee98 410
411 return if !defined $ch || $ign->{$ch}; # ignored
0116f5dc 412 return map($self->altCE($_), @{ $ent->{$ch} })
413 if $ent->{$ch};
414
a7fbee98 415 my $u = unpack('U', $ch);
416
417 if (0xAC00 <= $u && $u <= 0xD7A3) { # is_Hangul
0116f5dc 418 return map $self->altCE($_),
419 $hang
420 ? map(pack(VCE_FORMAT, NON_VAR, @$_), &$hang($u))
421 : defined $hang
422 ? map({
423 my $v = $_;
424 my $vCE = $ent->{pack('U', $v)};
425 $vCE ? @$vCE : $der->($v);
426 } _decompHangul($u))
427 : $der->($u);
a7fbee98 428 }
429 elsif (0x3400 <= $u && $u <= 0x4DB5 ||
430 0x4E00 <= $u && $u <= 0x9FA5 ||
0116f5dc 431 0x20000 <= $u && $u <= 0x2A6D6) { # CJK Ideograph
432 return map $self->altCE($_),
433 $cjk
434 ? map(pack(VCE_FORMAT, NON_VAR, @$_), &$cjk($u))
435 : defined $cjk && $self->{UCA_Version} <= 8 && $u < 0x10000
436 ? pack(VCE_FORMAT, NON_VAR, $u, 0x20, 0x02, $u)
437 : $der->($u);
a7fbee98 438 }
439 else {
0116f5dc 440 return map $self->altCE($_), $der->($u);
a7fbee98 441 }
d16e9e3d 442}
443
444##
445## int = index(string, substring)
446##
447sub index
448{
a7fbee98 449 my $self = shift;
450 my $lev = $self->{level};
451 my $comb = $self->{combining};
452 my $str = $self->splitCE(shift);
453 my $sub = $self->splitCE(shift);
454
455 return wantarray ? (0,0) : 0 if ! @$sub;
456 return wantarray ? () : -1 if ! @$str;
457
458 my @subWt = grep _ignorableAtLevel($_,$lev),
459 map $self->getWt($_), @$sub;
460
461 my(@strWt,@strPt);
462 my $count = 0;
463 for (my $i = 0; $i < @$str; $i++) {
464 my $go_ahead = 0;
465
466 my @tmp = grep _ignorableAtLevel($_,$lev), $self->getWt($str->[$i]);
467 $go_ahead += length $str->[$i];
468
469 # /*XXX*/ still broken.
470 # index("e\x{300}", "e") should be 'no match' at level 2 or higher
471 # as "e\x{300}" is a *single* grapheme cluster and not equal to "e".
472
473 # go ahead as far as we find a combining character;
474 while ($i + 1 < @$str &&
475 (! defined $str->[$i+1] || $comb->{ $str->[$i+1] }) ) {
476 $i++;
a7fbee98 477 next if ! defined $str->[$i];
0116f5dc 478 $go_ahead += length $str->[$i];
a7fbee98 479 push @tmp,
480 grep _ignorableAtLevel($_,$lev), $self->getWt($str->[$i]);
481 }
482
483 push @strWt, @tmp;
484 push @strPt, ($count) x @tmp;
485 $count += $go_ahead;
486
487 while (@strWt >= @subWt) {
488 if (_eqArray(\@strWt, \@subWt, $lev)) {
489 my $pos = $strPt[0];
490 return wantarray ? ($pos, $count-$pos) : $pos;
491 }
492 shift @strWt;
493 shift @strPt;
494 }
d16e9e3d 495 }
a7fbee98 496 return wantarray ? () : -1;
d16e9e3d 497}
498
499##
500## bool _eqArray(arrayref, arrayref, level)
501##
502sub _eqArray($$$)
503{
a7fbee98 504 my $a = shift; # length $a >= length $b;
505 my $b = shift;
506 my $lev = shift;
507 for my $v (0..$lev-1) {
508 for my $c (0..@$b-1){
509 return if $a->[$c][$v] != $b->[$c][$v];
510 }
d16e9e3d 511 }
a7fbee98 512 return 1;
d16e9e3d 513}
514
515
516##
517## bool _ignorableAtLevel(CE, level)
518##
519sub _ignorableAtLevel($$)
520{
a7fbee98 521 my $ce = shift;
522 return unless defined $ce;
523 my $lv = shift;
524 return ! grep { ! $ce->[$_] } 0..$lv-1;
d16e9e3d 525}
526
527
528##
529## string sortkey = getSortKey(string arg)
530##
531sub getSortKey
532{
a7fbee98 533 my $self = shift;
534 my $lev = $self->{level};
535 my $rCE = $self->splitCE(shift); # get an arrayref
0116f5dc 536 my $ver9 = $self->{UCA_Version} > 8;
537 my $sht = $self->{isShift};
a7fbee98 538
539 # weight arrays
0116f5dc 540 my (@buf, $last_is_variable);
541
542 foreach my $ce (@$rCE) {
543 my @t = $self->getWt($ce);
544 if ($sht && $ver9) {
545 if (@t == 1 && $t[0][0] == 0) {
546 if ($t[0][1] == 0 && $t[0][2] == 0) {
547 $last_is_variable = 1;
548 } else {
549 next if $last_is_variable;
550 }
551 } else {
552 $last_is_variable = 0;
553 }
554 }
555 push @buf, @t;
556 }
a7fbee98 557
558 # make sort key
559 my @ret = ([],[],[],[]);
560 foreach my $v (0..$lev-1) {
561 foreach my $b (@buf) {
562 push @{ $ret[$v] }, $b->[$v] if $b->[$v];
563 }
564 }
565 foreach (@{ $self->{backwards} }) {
566 my $v = $_ - 1;
567 @{ $ret[$v] } = reverse @{ $ret[$v] };
45394607 568 }
45394607 569
a7fbee98 570 # modification of tertiary weights
571 if ($self->{upper_before_lower}) {
572 foreach (@{ $ret[2] }) {
573 if (0x8 <= $_ && $_ <= 0xC) { $_ -= 6 } # lower
574 elsif (0x2 <= $_ && $_ <= 0x6) { $_ += 6 } # upper
575 elsif ($_ == 0x1C) { $_ += 1 } # square upper
576 elsif ($_ == 0x1D) { $_ -= 1 } # square lower
577 }
45394607 578 }
a7fbee98 579 if ($self->{katakana_before_hiragana}) {
580 foreach (@{ $ret[2] }) {
581 if (0x0F <= $_ && $_ <= 0x13) { $_ -= 2 } # katakana
582 elsif (0x0D <= $_ && $_ <= 0x0E) { $_ += 5 } # hiragana
583 }
45394607 584 }
a7fbee98 585 join "\0\0", map pack('n*', @$_), @ret;
45394607 586}
587
588
589##
d16e9e3d 590## int compare = cmp(string a, string b)
45394607 591##
5398038e 592sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
593sub eq { $_[0]->getSortKey($_[1]) eq $_[0]->getSortKey($_[2]) }
594sub ne { $_[0]->getSortKey($_[1]) ne $_[0]->getSortKey($_[2]) }
595sub lt { $_[0]->getSortKey($_[1]) lt $_[0]->getSortKey($_[2]) }
596sub le { $_[0]->getSortKey($_[1]) le $_[0]->getSortKey($_[2]) }
597sub gt { $_[0]->getSortKey($_[1]) gt $_[0]->getSortKey($_[2]) }
598sub ge { $_[0]->getSortKey($_[1]) ge $_[0]->getSortKey($_[2]) }
45394607 599
600##
d16e9e3d 601## list[strings] sorted = sort(list[strings] arg)
45394607 602##
a7fbee98 603sub sort {
604 my $obj = shift;
605 return
606 map { $_->[1] }
607 sort{ $a->[0] cmp $b->[0] }
608 map [ $obj->getSortKey($_), $_ ], @_;
45394607 609}
610
0116f5dc 611
612sub derivCE_9 {
613 my $u = shift;
614 my $base =
615 (0x4E00 <= $u && $u <= 0x9FA5) # CJK
616 ? 0xFB40 :
617 (0x3400 <= $u && $u <= 0x4DB5 || 0x20000 <= $u && $u <= 0x2A6D6)
618 ? 0xFB80 : 0xFBC0;
619
620 my $aaaa = $base + ($u >> 15);
621 my $bbbb = ($u & 0x7FFF) | 0x8000;
622 return
623 pack(VCE_FORMAT, NON_VAR, $aaaa, Min2, Min3, $u),
624 pack(VCE_FORMAT, NON_VAR, $bbbb, 0, 0, $u);
625}
626
627sub derivCE_8 {
628 my $code = shift;
629 my $aaaa = 0xFF80 + ($code >> 15);
630 my $bbbb = ($code & 0x7FFF) | 0x8000;
631 return
632 pack(VCE_FORMAT, NON_VAR, $aaaa, 2, 1, $code),
633 pack(VCE_FORMAT, NON_VAR, $bbbb, 0, 0, $code);
634}
635
636sub broken_derivCE { # NG
a7fbee98 637 my $code = shift;
0116f5dc 638 my $aaaa = 0xFFC2 + ($code >> 15);
639 my $bbbb = $code & 0x7FFF | 0x1000;
640 return
641 pack(VCE_FORMAT, NON_VAR, $aaaa, 2, 1, $code),
642 pack(VCE_FORMAT, NON_VAR, $bbbb, 0, 0, $code);
45394607 643}
644
645##
646## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
647##
a7fbee98 648sub _getHexArray { map hex, $_[0] =~ /([0-9a-fA-F]+)/g }
5398038e 649
a7fbee98 650#
651# $code must be in Hangul syllable.
652# Check it before you enter here.
653#
5398038e 654sub _decompHangul {
655 my $code = shift;
5398038e 656 my $SIndex = $code - 0xAC00;
657 my $LIndex = int( $SIndex / 588);
658 my $VIndex = int(($SIndex % 588) / 28);
659 my $TIndex = $SIndex % 28;
660 return (
a7fbee98 661 0x1100 + $LIndex,
662 0x1161 + $VIndex,
663 $TIndex ? (0x11A7 + $TIndex) : (),
5398038e 664 );
45394607 665}
666
6671;
668__END__
669
670=head1 NAME
671
a7fbee98 672Unicode::Collate - Unicode Collation Algorithm
45394607 673
674=head1 SYNOPSIS
675
676 use Unicode::Collate;
677
678 #construct
5398038e 679 $Collator = Unicode::Collate->new(%tailoring);
45394607 680
681 #sort
5398038e 682 @sorted = $Collator->sort(@not_sorted);
45394607 683
684 #compare
a7fbee98 685 $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
45394607 686
687=head1 DESCRIPTION
688
689=head2 Constructor and Tailoring
690
d16e9e3d 691The C<new> method returns a collator object.
692
5398038e 693 $Collator = Unicode::Collate->new(
0116f5dc 694 UCA_Version => $UCA_Version,
45394607 695 alternate => $alternate,
696 backwards => $levelNumber, # or \@levelNumbers
697 entry => $element,
698 normalization => $normalization_form,
699 ignoreName => qr/$ignoreName/,
700 ignoreChar => qr/$ignoreChar/,
701 katakana_before_hiragana => $bool,
702 level => $collationLevel,
703 overrideCJK => \&overrideCJK,
704 overrideHangul => \&overrideHangul,
705 preprocess => \&preprocess,
706 rearrange => \@charList,
707 table => $filename,
708 undefName => qr/$undefName/,
709 undefChar => qr/$undefChar/,
710 upper_before_lower => $bool,
711 );
a7fbee98 712 # if %tailoring is false (i.e. empty),
5398038e 713 # $Collator should do the default collation.
45394607 714
715=over 4
716
0116f5dc 717=item UCA_Version
718
719If the version number of the older UCA is given,
720the older behavior of that version is emulated on collating.
721If omitted, the return value of C<UCA_Version()> is used.
722
723The supported version: 8 or 9.
724
725B<This parameter may be removed in the future version,
726as switching the algorithm would affect the performance.>
727
45394607 728=item alternate
729
730-- see 3.2.2 Alternate Weighting, UTR #10.
731
a7fbee98 732This key allows to alternate weighting for variable collation elements,
733which are marked with an ASTERISK in the table
734(NOTE: Many punction marks and symbols are variable in F<allkeys.txt>).
735
736 alternate => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
45394607 737
a7fbee98 738These names are case-insensitive.
45394607 739By default (if specification is omitted), 'shifted' is adopted.
740
a7fbee98 741 'Blanked' Variable elements are ignorable at levels 1 through 3;
742 considered at the 4th level.
743
744 'Non-ignorable' Variable elements are not reset to ignorable.
745
746 'Shifted' Variable elements are ignorable at levels 1 through 3
747 their level 4 weight is replaced by the old level 1 weight.
748 Level 4 weight for Non-Variable elements is 0xFFFF.
749
750 'Shift-Trimmed' Same as 'shifted', but all FFFF's at the 4th level
751 are trimmed.
752
45394607 753=item backwards
754
755-- see 3.1.2 French Accents, UTR #10.
756
757 backwards => $levelNumber or \@levelNumbers
758
759Weights in reverse order; ex. level 2 (diacritic ordering) in French.
760If omitted, forwards at all the levels.
761
762=item entry
763
764-- see 3.1 Linguistic Features; 3.2.1 File Format, UTR #10.
765
a7fbee98 766Overrides a default order or defines additional collation elements
45394607 767
768 entry => <<'ENTRIES', # use the UCA file format
a7fbee98 76900E6 ; [.0861.0020.0002.00E6] [.08B1.0020.0002.00E6] # ligature <ae> as <a><e>
45394607 7700063 0068 ; [.0893.0020.0002.0063] # "ch" in traditional Spanish
7710043 0068 ; [.0893.0020.0008.0043] # "Ch" in traditional Spanish
772ENTRIES
773
774=item ignoreName
775
776=item ignoreChar
777
778-- see Completely Ignorable, 3.2.2 Alternate Weighting, UTR #10.
779
a7fbee98 780Makes the entry in the table ignorable.
781If a collation element is ignorable,
45394607 782it is ignored as if the element had been deleted from there.
783
a7fbee98 784E.g. when 'a' and 'e' are ignorable,
45394607 785'element' is equal to 'lament' (or 'lmnt').
786
787=item level
788
789-- see 4.3 Form a sort key for each string, UTR #10.
790
791Set the maximum level.
792Any higher levels than the specified one are ignored.
793
794 Level 1: alphabetic ordering
795 Level 2: diacritic ordering
796 Level 3: case ordering
797 Level 4: tie-breaking (e.g. in the case when alternate is 'shifted')
798
799 ex.level => 2,
800
a7fbee98 801If omitted, the maximum is the 4th.
802
45394607 803=item normalization
804
805-- see 4.1 Normalize each input string, UTR #10.
806
905aa9f0 807If specified, strings are normalized before preparation of sort keys
45394607 808(the normalization is executed after preprocess).
809
810As a form name, one of the following names must be used.
811
812 'C' or 'NFC' for Normalization Form C
813 'D' or 'NFD' for Normalization Form D
814 'KC' or 'NFKC' for Normalization Form KC
815 'KD' or 'NFKD' for Normalization Form KD
816
817If omitted, the string is put into Normalization Form D.
818
a7fbee98 819If C<undef> is passed explicitly as the value for this key,
45394607 820any normalization is not carried out (this may make tailoring easier
821if any normalization is not desired).
822
823see B<CAVEAT>.
824
825=item overrideCJK
826
45394607 827-- see 7.1 Derived Collation Elements, UTR #10.
828
829By default, mapping of CJK Unified Ideographs
a7fbee98 830uses the Unicode codepoint order.
831But the mapping of CJK Unified Ideographs may be overrided.
45394607 832
a7fbee98 833ex. CJK Unified Ideographs in the JIS code point order.
45394607 834
835 overrideCJK => sub {
a7fbee98 836 my $u = shift; # get a Unicode codepoint
837 my $b = pack('n', $u); # to UTF-16BE
838 my $s = your_unicode_to_sjis_converter($b); # convert
839 my $n = unpack('n', $s); # convert sjis to short
840 [ $n, 0x20, 0x2, $u ]; # return the collation element
45394607 841 },
842
a7fbee98 843ex. ignores all CJK Unified Ideographs.
844
845 overrideCJK => sub {()}, # CODEREF returning empty list
846
847 # where ->eq("Pe\x{4E00}rl", "Perl") is true
848 # as U+4E00 is a CJK Unified Ideograph and to be ignorable.
849
850If C<undef> is passed explicitly as the value for this key,
851weights for CJK Unified Ideographs are treated as undefined.
852But assignment of weight for CJK Unified Ideographs
853in table or L<entry> is still valid.
854
855=item overrideHangul
856
857-- see 7.1 Derived Collation Elements, UTR #10.
858
859By default, Hangul Syllables are decomposed into Hangul Jamo.
860But the mapping of Hangul Syllables may be overrided.
861
862This tag works like L<overrideCJK>, so see there for examples.
863
45394607 864If you want to override the mapping of Hangul Syllables,
865the Normalization Forms D and KD are not appropriate
866(they will be decomposed before overriding).
867
a7fbee98 868If C<undef> is passed explicitly as the value for this key,
869weight for Hangul Syllables is treated as undefined
870without decomposition into Hangul Jamo.
871But definition of weight for Hangul Syllables
872in table or L<entry> is still valid.
873
45394607 874=item preprocess
875
876-- see 5.1 Preprocessing, UTR #10.
877
878If specified, the coderef is used to preprocess
879before the formation of sort keys.
880
a7fbee98 881ex. dropping English articles, such as "a" or "the".
45394607 882Then, "the pen" is before "a pencil".
883
884 preprocess => sub {
885 my $str = shift;
a7fbee98 886 $str =~ s/\b(?:an?|the)\s+//gi;
45394607 887 $str;
888 },
889
890=item rearrange
891
892-- see 3.1.3 Rearrangement, UTR #10.
893
894Characters that are not coded in logical order and to be rearranged.
a7fbee98 895By default,
45394607 896
897 rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
898
a7fbee98 899If you want to disallow any rearrangement,
900pass C<undef> or C<[]> (a reference to an empty list)
901as the value for this key.
902
0116f5dc 903B<According to the version 9 of UCA, this parameter shall not be used;
904but it is not warned at present.>
905
45394607 906=item table
907
908-- see 3.2 Default Unicode Collation Element Table, UTR #10.
909
910You can use another element table if desired.
911The table file must be in your C<lib/Unicode/Collate> directory.
912
913By default, the file C<lib/Unicode/Collate/allkeys.txt> is used.
914
a7fbee98 915If C<undef> is passed explicitly as the value for this key,
916no file is read (but you can define collation elements via L<entry>).
917
918A typical way to define a collation element table
919without any file of table:
920
921 $onlyABC = Unicode::Collate->new(
922 table => undef,
923 entry => << 'ENTRIES',
9240061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A
9250041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A
9260062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B
9270042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B
9280063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C
9290043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C
930ENTRIES
931 );
905aa9f0 932
45394607 933=item undefName
934
935=item undefChar
936
937-- see 6.3.4 Reducing the Repertoire, UTR #10.
938
939Undefines the collation element as if it were unassigned in the table.
940This reduces the size of the table.
941If an unassigned character appears in the string to be collated,
942the sort key is made from its codepoint
943as a single-character collation element,
944as it is greater than any other assigned collation elements
945(in the codepoint order among the unassigned characters).
946But, it'd be better to ignore characters
947unfamiliar to you and maybe never used.
948
949=item katakana_before_hiragana
950
951=item upper_before_lower
952
953-- see 6.6 Case Comparisons; 7.3.1 Tertiary Weight Table, UTR #10.
954
955By default, lowercase is before uppercase
956and hiragana is before katakana.
957
a7fbee98 958If the tag is made true, this is reversed.
959
960B<NOTE>: These tags simplemindedly assume
961any lowercase/uppercase or hiragana/katakana distinctions
962should occur in level 3, and their weights at level 3
963should be same as those mentioned in 7.3.1, UTR #10.
964If you define your collation elements which violates this,
965these tags doesn't work validly.
45394607 966
967=back
968
3164dd77 969=head2 Methods for Collation
45394607 970
971=over 4
972
5398038e 973=item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
45394607 974
975Sorts a list of strings.
976
5398038e 977=item C<$result = $Collator-E<gt>cmp($a, $b)>
45394607 978
979Returns 1 (when C<$a> is greater than C<$b>)
980or 0 (when C<$a> is equal to C<$b>)
981or -1 (when C<$a> is lesser than C<$b>).
982
5398038e 983=item C<$result = $Collator-E<gt>eq($a, $b)>
984
985=item C<$result = $Collator-E<gt>ne($a, $b)>
986
987=item C<$result = $Collator-E<gt>lt($a, $b)>
988
989=item C<$result = $Collator-E<gt>le($a, $b)>
990
991=item C<$result = $Collator-E<gt>gt($a, $b)>
992
993=item C<$result = $Collator-E<gt>ge($a, $b)>
994
a7fbee98 995They works like the same name operators as theirs.
5398038e 996
997 eq : whether $a is equal to $b.
998 ne : whether $a is not equal to $b.
999 lt : whether $a is lesser than $b.
1000 le : whether $a is lesser than $b or equal to $b.
1001 gt : whether $a is greater than $b.
1002 ge : whether $a is greater than $b or equal to $b.
1003
1004=item C<$sortKey = $Collator-E<gt>getSortKey($string)>
45394607 1005
1006-- see 4.3 Form a sort key for each string, UTR #10.
1007
1008Returns a sort key.
1009
1010You compare the sort keys using a binary comparison
1011and get the result of the comparison of the strings using UCA.
1012
5398038e 1013 $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
45394607 1014
1015 is equivalent to
1016
5398038e 1017 $Collator->cmp($a, $b)
45394607 1018
a7fbee98 1019=item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)>
1020
a7fbee98 1021 use Unicode::Collate;
1022 my $c = Unicode::Collate->new();
1023 print $c->viewSortKey("Perl"),"\n";
1024
0116f5dc 1025 # output:
1026 # [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF]
1027 # Level 1 Level 2 Level 3 Level 4
1028
1029 (If C<UCA_Version> is 8, the output is slightly different.)
a7fbee98 1030
5398038e 1031=item C<$position = $Collator-E<gt>index($string, $substring)>
d16e9e3d 1032
5398038e 1033=item C<($position, $length) = $Collator-E<gt>index($string, $substring)>
d16e9e3d 1034
1035-- see 6.8 Searching, UTR #10.
1036
1037If C<$substring> matches a part of C<$string>, returns
1038the position of the first occurrence of the matching part in scalar context;
1039in list context, returns a two-element list of
1040the position and the length of the matching part.
1041
1042B<Notice> that the length of the matching part may differ from
1043the length of C<$substring>.
1044
1045B<Note> that the position and the length are counted on the string
1046after the process of preprocess, normalization, and rearrangement.
1047Therefore, in case the specified string is not binary equal to
1048the preprocessed/normalized/rearranged string, the position and the length
a7fbee98 1049may differ form those on the specified string. But it is guaranteed
d16e9e3d 1050that, if matched, it returns a non-negative value as C<$position>.
1051
1052If C<$substring> does not match any part of C<$string>,
1053returns C<-1> in scalar context and
1054an empty list in list context.
1055
1056e.g. you say
1057
5398038e 1058 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
d16e9e3d 1059 my $str = "Ich mu\x{00DF} studieren.";
1060 my $sub = "m\x{00FC}ss";
1061 my $match;
a7fbee98 1062 if (my($pos,$len) = $Collator->index($str, $sub)) {
5398038e 1063 $match = substr($str, $pos, $len);
d16e9e3d 1064 }
1065
1066and get C<"mu\x{00DF}"> in C<$match> since C<"mu>E<223>C<">
1067is primary equal to C<"m>E<252>C<ss">.
1068
45394607 1069=back
1070
3164dd77 1071=head2 Other Methods
1072
1073=over 4
1074
0116f5dc 1075=item C<%old_tailoring = $Collator-E<gt>change(%new_tailoring)>
1076
1077Change the value of specified keys and returns the changed part.
1078
1079 $Collator = Unicode::Collate->new(level => 4);
1080
1081 $Collator->eq("perl", "PERL"); # false
1082
1083 %old = $Collator->change(level => 2); # returns (level => 4).
1084
1085 $Collator->eq("perl", "PERL"); # true
1086
1087 $Collator->change(%old); # returns (level => 2).
1088
1089 $Collator->eq("perl", "PERL"); # false
1090
1091Not all C<(key,value)>s are allowed to be changed.
1092See also C<@Unicode::Collate::ChangeOK> and C<@Unicode::Collate::ChangeNG>.
1093
1094In the scalar context, returns the modified collator
1095(but it is B<not> a clone from the original).
1096
1097 $Collator->change(level => 2)->eq("perl", "PERL"); # true
1098
1099 $Collator->eq("perl", "PERL"); # true; now max level is 2nd.
1100
1101 $Collator->change(level => 4)->eq("perl", "PERL"); # false
1102
3164dd77 1103=item UCA_Version
1104
1105Returns the version number of Unicode Technical Standard 10
1106this module consults.
1107
1108=item Base_Unicode_Version
1109
1110Returns the version number of the Unicode Standard
1111this module is based on.
1112
1113=back
1114
45394607 1115=head2 EXPORT
1116
1117None by default.
1118
4a2e806c 1119=head2 TODO
1120
1121Unicode::Collate has not been ported to EBCDIC. The code mostly would
1122work just fine but a decision needs to be made: how the module should
1123work in EBCDIC? Should the low 256 characters be understood as
1124Unicode or as EBCDIC code points? Should one be chosen or should
1125there be a way to do either? Or should such translation be left
1126outside the module for the user to do, for example by using
1127Encode::from_to()?
a7fbee98 1128(or utf8::unicode_to_native()/utf8::native_to_unicode()?)
4a2e806c 1129
45394607 1130=head2 CAVEAT
1131
1132Use of the C<normalization> parameter requires
1133the B<Unicode::Normalize> module.
1134
5398038e 1135If you need not it (say, in the case when you need not
45394607 1136handle any combining characters),
1137assign C<normalization =E<gt> undef> explicitly.
1138
5398038e 1139-- see 6.5 Avoiding Normalization, UTR #10.
1140
0116f5dc 1141=head2 Conformance Test
1142
1143The Conformance Test for the UCA is provided
1144in L<http://www.unicode.org/reports/tr10/CollationTest.html>
1145and L<http://www.unicode.org/reports/tr10/CollationTest.zip>
1146
1147For F<CollationTest_SHIFTED.txt>,
1148a collator via C<Unicode::Collate-E<gt>new( )> should be used;
1149for F<CollationTest_NON_IGNORABLE.txt>, a collator via
1150C<Unicode::Collate-E<gt>new(alternate =E<gt> "non-ignorable", level =E<gt> 3)>.
1151
1152B<Unicode::Normalize is required to try this test.>
1153
a7fbee98 1154=head2 BUGS
1155
1156C<index()> is an experimental method and
1157its return value may be unreliable.
1158The correct implementation for C<index()> must be based
1159on Locale-Sensitive Support: Level 3 in UTR #18,
1160F<Unicode Regular Expression Guidelines>.
1161
1162See also 4.2 Locale-Dependent Graphemes in UTR #18.
1163
45394607 1164=head1 AUTHOR
1165
1166SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
1167
1168 http://homepage1.nifty.com/nomenclator/perl/
1169
a7fbee98 1170 Copyright(C) 2001-2002, SADAHIRO Tomoyuki. Japan. All rights reserved.
45394607 1171
a7fbee98 1172 This library is free software; you can redistribute it
1173 and/or modify it under the same terms as Perl itself.
45394607 1174
1175=head1 SEE ALSO
1176
1177=over 4
1178
0116f5dc 1179=item http://www.unicode.org/reports/tr10/
45394607 1180
a7fbee98 1181Unicode Collation Algorithm - UTR #10
45394607 1182
0116f5dc 1183=item http://www.unicode.org/reports/tr10/allkeys.txt
a7fbee98 1184
1185The Default Unicode Collation Element Table
45394607 1186
0116f5dc 1187=item http://www.unicode.org/reports/tr10/CollationTest.html
1188http://www.unicode.org/reports/tr10/CollationTest.zip
1189
1190The latest versions of the conformance test for the UCA
1191
1192=item http://www.unicode.org/reports/tr15/
a7fbee98 1193
1194Unicode Normalization Forms - UAX #15
1195
0116f5dc 1196=item http://www.unicode.org/reports/tr18
a7fbee98 1197
1198Unicode Regular Expression Guidelines - UTR #18
1199
1200=item L<Unicode::Normalize>
45394607 1201
45394607 1202=back
1203
1204=cut