sv_2pv_flags and ROK and UTF8 flags
[p5sagit/p5-mst-13.2.git] / lib / Unicode / Collate.pm
CommitLineData
45394607 1package Unicode::Collate;
2
4a2e806c 3BEGIN {
4 if (ord("A") == 193) {
5 die "Unicode::Collate not ported to EBCDIC\n";
6 }
7}
8
45394607 9use 5.006;
10use strict;
11use warnings;
12use Carp;
e69a2255 13use File::Spec;
5398038e 14
45394607 15require Exporter;
16
caffd4cf 17our $VERSION = '0.21';
45394607 18our $PACKAGE = __PACKAGE__;
19
20our @ISA = qw(Exporter);
21
22our %EXPORT_TAGS = ();
23our @EXPORT_OK = ();
24our @EXPORT = ();
25
26(our $Path = $INC{'Unicode/Collate.pm'}) =~ s/\.pm$//;
27our $KeyFile = "allkeys.txt";
28
3164dd77 29our $UNICODE_VERSION;
30
327745dc 31eval { require Unicode::UCD };
32
33unless ($@) {
34 $UNICODE_VERSION = Unicode::UCD::UnicodeVersion();
35}
36else { # XXX, Perl 5.6.1
3164dd77 37 my($f, $fh);
38 foreach my $d (@INC) {
3164dd77 39 $f = File::Spec->catfile($d, "unicode", "Unicode.301");
40 if (open($fh, $f)) {
327745dc 41 $UNICODE_VERSION = '3.0.1';
3164dd77 42 close $fh;
43 last;
44 }
45 }
46}
47
5398038e 48our $getCombinClass; # coderef for combining class from Unicode::Normalize
45394607 49
0116f5dc 50use constant Min2 => 0x20; # minimum weight at level 2
51use constant Min3 => 0x02; # minimum weight at level 3
a7fbee98 52
0116f5dc 53# format for pack
54use constant VCE_FORMAT => 'Cn4'; # for variable + CE with 4 levels
3164dd77 55
0116f5dc 56# values of variable
57use constant NON_VAR => 0; # Non-Variable character
58use constant VAR => 1; # Variable character
3164dd77 59
0116f5dc 60our $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
a7fbee98 61
0116f5dc 62sub UCA_Version { "9" }
a7fbee98 63
0116f5dc 64sub Base_Unicode_Version { $UNICODE_VERSION || 'unknown' }
a7fbee98 65
0116f5dc 66my (%AlternateOK);
67@AlternateOK{ qw/
68 blanked non-ignorable shifted shift-trimmed
69 / } = ();
70
71our @ChangeOK = qw/
72 alternate backwards level normalization rearrange
73 katakana_before_hiragana upper_before_lower
74 overrideHangul overrideCJK preprocess UCA_Version
75 /;
76
77our @ChangeNG = qw/
caffd4cf 78 entry entries table combining maxlength
0116f5dc 79 ignoreChar ignoreName undefChar undefName
80 versionTable alternateTable backwardsTable forwardsTable rearrangeTable
81 derivCode normCode rearrangeHash isShift L3ignorable
82 /;
caffd4cf 83# The hash key 'ignored' is deleted at VERSION 0.21.
0116f5dc 84
85my (%ChangeOK, %ChangeNG);
86@ChangeOK{ @ChangeOK } = ();
87@ChangeNG{ @ChangeNG } = ();
88
89sub change {
90 my $self = shift;
91 my %hash = @_;
92 my %old;
93 foreach my $k (keys %hash) {
94 if (exists $ChangeOK{$k}) {
95 $old{$k} = $self->{$k};
96 $self->{$k} = $hash{$k};
97 }
98 elsif (exists $ChangeNG{$k}) {
99 croak "change of $k via change() is not allowed!";
100 }
101 # else => ignored
102 }
103 $self->checkCollator;
104 return wantarray ? %old : $self;
105}
a7fbee98 106
0116f5dc 107sub checkCollator {
108 my $self = shift;
a7fbee98 109 croak "Illegal level lower than 1 (passed $self->{level})."
110 if $self->{level} < 1;
111 croak "A level higher than 4 (passed $self->{level}) is not supported."
112 if 4 < $self->{level};
113
0116f5dc 114 $self->{derivCode} =
115 $self->{UCA_Version} == -1 ? \&broken_derivCE :
116 $self->{UCA_Version} == 8 ? \&derivCE_8 :
117 $self->{UCA_Version} == 9 ? \&derivCE_9 :
118 croak "Illegal UCA version (passed $self->{UCA_Version}).";
a7fbee98 119
0116f5dc 120 $self->{alternate} = lc($self->{alternate});
121 croak "$PACKAGE unknown alternate tag name: $self->{alternate}"
122 unless exists $AlternateOK{ $self->{alternate} };
123
124 $self->{isShift} = $self->{alternate} eq 'shifted' ||
125 $self->{alternate} eq 'shift-trimmed';
126
127 $self->{backwards} = []
128 if ! defined $self->{backwards};
129 $self->{backwards} = [ $self->{backwards} ]
130 if ! ref $self->{backwards};
131
132 $self->{rearrange} = []
133 if ! defined $self->{rearrange};
134 croak "$PACKAGE: A list for rearrangement must be store in an ARRAYREF"
135 if ! ref $self->{rearrange};
136
137 # keys of $self->{rearrangeHash} are $self->{rearrange}.
138 $self->{rearrangeHash} = undef;
139
140 if (@{ $self->{rearrange} }) {
141 @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
142 }
143
144 $self->{normCode} = undef;
a7fbee98 145
146 if (defined $self->{normalization}) {
147 eval { require Unicode::Normalize };
148 croak "Unicode/Normalize.pm is required to normalize strings: $@"
149 if $@;
150
151 Unicode::Normalize->import();
152 $getCombinClass = \&Unicode::Normalize::getCombinClass
153 if ! $getCombinClass;
154
0116f5dc 155 $self->{normCode} =
a7fbee98 156 $self->{normalization} =~ /^(?:NF)?C$/ ? \&NFC :
157 $self->{normalization} =~ /^(?:NF)?D$/ ? \&NFD :
158 $self->{normalization} =~ /^(?:NF)?KC$/ ? \&NFKC :
159 $self->{normalization} =~ /^(?:NF)?KD$/ ? \&NFKD :
160 croak "$PACKAGE unknown normalization form name: "
161 . $self->{normalization};
162 }
0116f5dc 163 return;
164}
165
166sub new
167{
168 my $class = shift;
169 my $self = bless { @_ }, $class;
45394607 170
a7fbee98 171 # If undef is passed explicitly, no file is read.
0116f5dc 172 $self->{table} = $KeyFile if ! exists $self->{table};
173 $self->read_table if defined $self->{table};
905aa9f0 174
a7fbee98 175 if ($self->{entry}) {
176 $self->parseEntry($_) foreach split /\n/, $self->{entry};
177 }
905aa9f0 178
0116f5dc 179 $self->{level} ||= 4;
180 $self->{UCA_Version} ||= UCA_Version();
905aa9f0 181
0116f5dc 182 $self->{overrideHangul} = ''
183 if ! exists $self->{overrideHangul};
184 $self->{overrideCJK} = ''
185 if ! exists $self->{overrideCJK};
186 $self->{normalization} = 'D'
187 if ! exists $self->{normalization};
188 $self->{alternate} = $self->{alternateTable} || 'shifted'
189 if ! exists $self->{alternate};
190 $self->{rearrange} = $self->{rearrangeTable} || $DefaultRearrange
a7fbee98 191 if ! exists $self->{rearrange};
0116f5dc 192 $self->{backwards} = $self->{backwardsTable}
193 if ! exists $self->{backwards};
a7fbee98 194
0116f5dc 195 $self->checkCollator;
a7fbee98 196
197 return $self;
198}
905aa9f0 199
200sub read_table {
a7fbee98 201 my $self = shift;
202 my $file = $self->{table} ne '' ? $self->{table} : $KeyFile;
203
e69a2255 204 my $filepath = File::Spec->catfile($Path, $file);
205 open my $fk, "<$filepath"
206 or croak "File does not exist at $filepath";
a7fbee98 207
208 while (<$fk>) {
209 next if /^\s*#/;
210 if (/^\s*\@/) {
0116f5dc 211 if (/^\s*\@version\s*(\S*)/) {
212 $self->{versionTable} ||= $1;
213 }
214 elsif (/^\s*\@alternate\s+(\S*)/) {
215 $self->{alternateTable} ||= $1;
a7fbee98 216 }
0116f5dc 217 elsif (/^\s*\@backwards\s+(\S*)/) {
218 push @{ $self->{backwardsTable} }, $1;
a7fbee98 219 }
0116f5dc 220 elsif (/^\s*\@forwards\s+(\S*)/) { # parhaps no use
221 push @{ $self->{forwardsTable} }, $1;
a7fbee98 222 }
0116f5dc 223 elsif (/^\s*\@rearrange\s+(.*)/) { # (\S*) is NG
224 push @{ $self->{rearrangeTable} }, _getHexArray($1);
a7fbee98 225 }
226 next;
227 }
228 $self->parseEntry($_);
45394607 229 }
a7fbee98 230 close $fk;
45394607 231}
232
905aa9f0 233
45394607 234##
235## get $line, parse it, and write an entry in $self
236##
237sub parseEntry
238{
a7fbee98 239 my $self = shift;
240 my $line = shift;
241 my($name, $ele, @key);
242
243 return if $line !~ /^\s*[0-9A-Fa-f]/;
244
245 # removes comment and gets name
246 $name = $1
247 if $line =~ s/[#%]\s*(.*)//;
248 return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
249
250 # gets element
251 my($e, $k) = split /;/, $line;
252 croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
253 if ! $k;
254
255 my @e = _getHexArray($e);
0116f5dc 256 return if !@e;
257
a7fbee98 258 $ele = pack('U*', @e);
259 return if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
260
caffd4cf 261 my $combining = 1; # primary = 0, secondary != 0;
262 my $level3ignore;
263
264 # replace with completely ignorable
a7fbee98 265 if (defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/ ||
266 defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/)
267 {
caffd4cf 268 $k = '[.0000.0000.0000.0000]';
45394607 269 }
0116f5dc 270
caffd4cf 271 foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
272 my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
273 my @arr = _getHexArray($arr);
274 push @key, pack(VCE_FORMAT, $var, @arr);
275 $combining = 0 unless $arr[0] == 0 && $arr[1] != 0;
276 $level3ignore = 1 if $arr[0] == 0 && $arr[1] == 0 && $arr[2] == 0;
a7fbee98 277 }
caffd4cf 278
279 $self->{entries}{$ele} = \@key;
280
281 $self->{combining}{$ele} = 1
282 if $combining;
283
284 $self->{L3ignorable}{$e[0]} = 1
285 if @e == 1 && $level3ignore;
286
a7fbee98 287 $self->{maxlength}{ord $ele} = scalar @e if @e > 1;
45394607 288}
289
45394607 290##
d16e9e3d 291## arrayref CE = altCE(bool variable?, list[num] weights)
45394607 292##
d16e9e3d 293sub altCE
45394607 294{
a7fbee98 295 my $self = shift;
0116f5dc 296 my($var, @c) = unpack(VCE_FORMAT, shift);
a7fbee98 297
298 $self->{alternate} eq 'blanked' ?
299 $var ? [0,0,0,$c[3]] : \@c :
300 $self->{alternate} eq 'non-ignorable' ?
301 \@c :
302 $self->{alternate} eq 'shifted' ?
303 $var ? [0,0,0,$c[0] ] : [ @c[0..2], $c[0]+$c[1]+$c[2] ? 0xFFFF : 0 ] :
304 $self->{alternate} eq 'shift-trimmed' ?
305 $var ? [0,0,0,$c[0] ] : [ @c[0..2], 0 ] :
306 croak "$PACKAGE unknown alternate name: $self->{alternate}";
45394607 307}
308
45394607 309sub viewSortKey
310{
a7fbee98 311 my $self = shift;
0116f5dc 312 my $ver = $self->{UCA_Version};
313
a7fbee98 314 my $key = $self->getSortKey(@_);
315 my $view = join " ", map sprintf("%04X", $_), unpack 'n*', $key;
0116f5dc 316 if ($ver <= 8) {
317 $view =~ s/ ?0000 ?/|/g;
318 } else {
319 $view =~ s/\b0000\b/|/g;
320 }
a7fbee98 321 return "[$view]";
45394607 322}
323
d16e9e3d 324
45394607 325##
d16e9e3d 326## list[strings] elements = splitCE(string arg)
45394607 327##
d16e9e3d 328sub splitCE
45394607 329{
a7fbee98 330 my $self = shift;
331 my $code = $self->{preprocess};
0116f5dc 332 my $norm = $self->{normCode};
a7fbee98 333 my $ent = $self->{entries};
334 my $max = $self->{maxlength};
335 my $reH = $self->{rearrangeHash};
0116f5dc 336 my $L3i = $self->{L3ignorable};
337 my $ver9 = $self->{UCA_Version} > 8;
a7fbee98 338
339 my $str = ref $code ? &$code(shift) : shift;
340 $str = &$norm($str) if ref $norm;
341
342 my @src = unpack('U*', $str);
343 my @buf;
344
345 # rearrangement
346 if ($reH) {
347 for (my $i = 0; $i < @src; $i++) {
348 if (exists $reH->{ $src[$i] } && $i + 1 < @src) {
349 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]);
350 $i++;
351 }
352 }
45394607 353 }
45394607 354
0116f5dc 355 if ($ver9) {
356 @src = grep ! $L3i->{$_}, @src;
357 }
358
a7fbee98 359 for (my $i = 0; $i < @src; $i++) {
360 my $ch;
361 my $u = $src[$i];
362
363 # non-characters
caffd4cf 364 next if ! defined $u
365 || ($u < 0 || 0x10FFFF < $u) # out of range
366 || (($u & 0xFFFE) == 0xFFFE) # ??FFFE or ??FFFF (cf. utf8.c)
0116f5dc 367 || (0xD800 <= $u && $u <= 0xDFFF) # unpaired surrogates
368 || (0xFDD0 <= $u && $u <= 0xFDEF) # non-character
369 ;
370
a7fbee98 371 if ($max->{$u}) { # contract
372 for (my $j = $max->{$u}; $j >= 1; $j--) {
373 next unless $i+$j-1 < @src;
374 $ch = pack 'U*', @src[$i .. $i+$j-1];
375 $i += $j-1, last if $ent->{$ch};
376 }
377 } else {
378 $ch = pack('U', $u);
379 }
380
381 # with Combining Char (UTS#10, 4.2.1), here requires Unicode::Normalize.
382 if ($getCombinClass && defined $ch) {
383 for (my $j = $i+1; $j < @src; $j++) {
384 next unless defined $src[$j];
385 last unless $getCombinClass->( $src[$j] );
386 my $comb = pack 'U', $src[$j];
387 next if ! $ent->{ $ch.$comb };
388 $ch .= $comb;
389 $src[$j] = undef;
390 }
391 }
392 push @buf, $ch;
45394607 393 }
a7fbee98 394 wantarray ? @buf : \@buf;
d16e9e3d 395}
45394607 396
d16e9e3d 397
398##
399## list[arrayrefs] weight = getWt(string element)
400##
401sub getWt
402{
a7fbee98 403 my $self = shift;
404 my $ch = shift;
405 my $ent = $self->{entries};
a7fbee98 406 my $cjk = $self->{overrideCJK};
407 my $hang = $self->{overrideHangul};
0116f5dc 408 my $der = $self->{derivCode};
a7fbee98 409
caffd4cf 410 return if !defined $ch;
0116f5dc 411 return map($self->altCE($_), @{ $ent->{$ch} })
412 if $ent->{$ch};
413
a7fbee98 414 my $u = unpack('U', $ch);
415
416 if (0xAC00 <= $u && $u <= 0xD7A3) { # is_Hangul
0116f5dc 417 return map $self->altCE($_),
418 $hang
419 ? map(pack(VCE_FORMAT, NON_VAR, @$_), &$hang($u))
420 : defined $hang
421 ? map({
422 my $v = $_;
423 my $vCE = $ent->{pack('U', $v)};
424 $vCE ? @$vCE : $der->($v);
425 } _decompHangul($u))
426 : $der->($u);
a7fbee98 427 }
428 elsif (0x3400 <= $u && $u <= 0x4DB5 ||
429 0x4E00 <= $u && $u <= 0x9FA5 ||
0116f5dc 430 0x20000 <= $u && $u <= 0x2A6D6) { # CJK Ideograph
431 return map $self->altCE($_),
432 $cjk
433 ? map(pack(VCE_FORMAT, NON_VAR, @$_), &$cjk($u))
434 : defined $cjk && $self->{UCA_Version} <= 8 && $u < 0x10000
435 ? pack(VCE_FORMAT, NON_VAR, $u, 0x20, 0x02, $u)
436 : $der->($u);
a7fbee98 437 }
438 else {
0116f5dc 439 return map $self->altCE($_), $der->($u);
a7fbee98 440 }
d16e9e3d 441}
442
443##
444## int = index(string, substring)
445##
446sub index
447{
a7fbee98 448 my $self = shift;
449 my $lev = $self->{level};
450 my $comb = $self->{combining};
451 my $str = $self->splitCE(shift);
452 my $sub = $self->splitCE(shift);
453
454 return wantarray ? (0,0) : 0 if ! @$sub;
455 return wantarray ? () : -1 if ! @$str;
456
457 my @subWt = grep _ignorableAtLevel($_,$lev),
458 map $self->getWt($_), @$sub;
459
460 my(@strWt,@strPt);
461 my $count = 0;
462 for (my $i = 0; $i < @$str; $i++) {
463 my $go_ahead = 0;
464
465 my @tmp = grep _ignorableAtLevel($_,$lev), $self->getWt($str->[$i]);
466 $go_ahead += length $str->[$i];
467
468 # /*XXX*/ still broken.
469 # index("e\x{300}", "e") should be 'no match' at level 2 or higher
470 # as "e\x{300}" is a *single* grapheme cluster and not equal to "e".
471
472 # go ahead as far as we find a combining character;
473 while ($i + 1 < @$str &&
474 (! defined $str->[$i+1] || $comb->{ $str->[$i+1] }) ) {
475 $i++;
a7fbee98 476 next if ! defined $str->[$i];
0116f5dc 477 $go_ahead += length $str->[$i];
a7fbee98 478 push @tmp,
479 grep _ignorableAtLevel($_,$lev), $self->getWt($str->[$i]);
480 }
481
482 push @strWt, @tmp;
483 push @strPt, ($count) x @tmp;
484 $count += $go_ahead;
485
486 while (@strWt >= @subWt) {
487 if (_eqArray(\@strWt, \@subWt, $lev)) {
488 my $pos = $strPt[0];
489 return wantarray ? ($pos, $count-$pos) : $pos;
490 }
491 shift @strWt;
492 shift @strPt;
493 }
d16e9e3d 494 }
a7fbee98 495 return wantarray ? () : -1;
d16e9e3d 496}
497
498##
499## bool _eqArray(arrayref, arrayref, level)
500##
501sub _eqArray($$$)
502{
a7fbee98 503 my $a = shift; # length $a >= length $b;
504 my $b = shift;
505 my $lev = shift;
506 for my $v (0..$lev-1) {
507 for my $c (0..@$b-1){
508 return if $a->[$c][$v] != $b->[$c][$v];
509 }
d16e9e3d 510 }
a7fbee98 511 return 1;
d16e9e3d 512}
513
514
515##
516## bool _ignorableAtLevel(CE, level)
517##
518sub _ignorableAtLevel($$)
519{
a7fbee98 520 my $ce = shift;
521 return unless defined $ce;
522 my $lv = shift;
523 return ! grep { ! $ce->[$_] } 0..$lv-1;
d16e9e3d 524}
525
526
527##
528## string sortkey = getSortKey(string arg)
529##
530sub getSortKey
531{
a7fbee98 532 my $self = shift;
533 my $lev = $self->{level};
534 my $rCE = $self->splitCE(shift); # get an arrayref
0116f5dc 535 my $ver9 = $self->{UCA_Version} > 8;
536 my $sht = $self->{isShift};
a7fbee98 537
538 # weight arrays
0116f5dc 539 my (@buf, $last_is_variable);
540
541 foreach my $ce (@$rCE) {
542 my @t = $self->getWt($ce);
543 if ($sht && $ver9) {
544 if (@t == 1 && $t[0][0] == 0) {
545 if ($t[0][1] == 0 && $t[0][2] == 0) {
546 $last_is_variable = 1;
547 } else {
548 next if $last_is_variable;
549 }
550 } else {
551 $last_is_variable = 0;
552 }
553 }
554 push @buf, @t;
555 }
a7fbee98 556
557 # make sort key
558 my @ret = ([],[],[],[]);
559 foreach my $v (0..$lev-1) {
560 foreach my $b (@buf) {
561 push @{ $ret[$v] }, $b->[$v] if $b->[$v];
562 }
563 }
564 foreach (@{ $self->{backwards} }) {
565 my $v = $_ - 1;
566 @{ $ret[$v] } = reverse @{ $ret[$v] };
45394607 567 }
45394607 568
a7fbee98 569 # modification of tertiary weights
570 if ($self->{upper_before_lower}) {
571 foreach (@{ $ret[2] }) {
572 if (0x8 <= $_ && $_ <= 0xC) { $_ -= 6 } # lower
573 elsif (0x2 <= $_ && $_ <= 0x6) { $_ += 6 } # upper
574 elsif ($_ == 0x1C) { $_ += 1 } # square upper
575 elsif ($_ == 0x1D) { $_ -= 1 } # square lower
576 }
45394607 577 }
a7fbee98 578 if ($self->{katakana_before_hiragana}) {
579 foreach (@{ $ret[2] }) {
580 if (0x0F <= $_ && $_ <= 0x13) { $_ -= 2 } # katakana
581 elsif (0x0D <= $_ && $_ <= 0x0E) { $_ += 5 } # hiragana
582 }
45394607 583 }
a7fbee98 584 join "\0\0", map pack('n*', @$_), @ret;
45394607 585}
586
587
588##
d16e9e3d 589## int compare = cmp(string a, string b)
45394607 590##
5398038e 591sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
592sub eq { $_[0]->getSortKey($_[1]) eq $_[0]->getSortKey($_[2]) }
593sub ne { $_[0]->getSortKey($_[1]) ne $_[0]->getSortKey($_[2]) }
594sub lt { $_[0]->getSortKey($_[1]) lt $_[0]->getSortKey($_[2]) }
595sub le { $_[0]->getSortKey($_[1]) le $_[0]->getSortKey($_[2]) }
596sub gt { $_[0]->getSortKey($_[1]) gt $_[0]->getSortKey($_[2]) }
597sub ge { $_[0]->getSortKey($_[1]) ge $_[0]->getSortKey($_[2]) }
45394607 598
599##
d16e9e3d 600## list[strings] sorted = sort(list[strings] arg)
45394607 601##
a7fbee98 602sub sort {
603 my $obj = shift;
604 return
605 map { $_->[1] }
606 sort{ $a->[0] cmp $b->[0] }
607 map [ $obj->getSortKey($_), $_ ], @_;
45394607 608}
609
0116f5dc 610
611sub derivCE_9 {
612 my $u = shift;
613 my $base =
614 (0x4E00 <= $u && $u <= 0x9FA5) # CJK
615 ? 0xFB40 :
616 (0x3400 <= $u && $u <= 0x4DB5 || 0x20000 <= $u && $u <= 0x2A6D6)
617 ? 0xFB80 : 0xFBC0;
618
619 my $aaaa = $base + ($u >> 15);
620 my $bbbb = ($u & 0x7FFF) | 0x8000;
621 return
622 pack(VCE_FORMAT, NON_VAR, $aaaa, Min2, Min3, $u),
623 pack(VCE_FORMAT, NON_VAR, $bbbb, 0, 0, $u);
624}
625
626sub derivCE_8 {
627 my $code = shift;
628 my $aaaa = 0xFF80 + ($code >> 15);
629 my $bbbb = ($code & 0x7FFF) | 0x8000;
630 return
631 pack(VCE_FORMAT, NON_VAR, $aaaa, 2, 1, $code),
632 pack(VCE_FORMAT, NON_VAR, $bbbb, 0, 0, $code);
633}
634
635sub broken_derivCE { # NG
a7fbee98 636 my $code = shift;
0116f5dc 637 my $aaaa = 0xFFC2 + ($code >> 15);
638 my $bbbb = $code & 0x7FFF | 0x1000;
639 return
640 pack(VCE_FORMAT, NON_VAR, $aaaa, 2, 1, $code),
641 pack(VCE_FORMAT, NON_VAR, $bbbb, 0, 0, $code);
45394607 642}
643
644##
645## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
646##
a7fbee98 647sub _getHexArray { map hex, $_[0] =~ /([0-9a-fA-F]+)/g }
5398038e 648
a7fbee98 649#
650# $code must be in Hangul syllable.
651# Check it before you enter here.
652#
5398038e 653sub _decompHangul {
654 my $code = shift;
5398038e 655 my $SIndex = $code - 0xAC00;
656 my $LIndex = int( $SIndex / 588);
657 my $VIndex = int(($SIndex % 588) / 28);
658 my $TIndex = $SIndex % 28;
659 return (
a7fbee98 660 0x1100 + $LIndex,
661 0x1161 + $VIndex,
662 $TIndex ? (0x11A7 + $TIndex) : (),
5398038e 663 );
45394607 664}
665
6661;
667__END__
668
669=head1 NAME
670
a7fbee98 671Unicode::Collate - Unicode Collation Algorithm
45394607 672
673=head1 SYNOPSIS
674
675 use Unicode::Collate;
676
677 #construct
5398038e 678 $Collator = Unicode::Collate->new(%tailoring);
45394607 679
680 #sort
5398038e 681 @sorted = $Collator->sort(@not_sorted);
45394607 682
683 #compare
a7fbee98 684 $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
45394607 685
686=head1 DESCRIPTION
687
688=head2 Constructor and Tailoring
689
d16e9e3d 690The C<new> method returns a collator object.
691
5398038e 692 $Collator = Unicode::Collate->new(
0116f5dc 693 UCA_Version => $UCA_Version,
45394607 694 alternate => $alternate,
695 backwards => $levelNumber, # or \@levelNumbers
696 entry => $element,
697 normalization => $normalization_form,
698 ignoreName => qr/$ignoreName/,
699 ignoreChar => qr/$ignoreChar/,
700 katakana_before_hiragana => $bool,
701 level => $collationLevel,
702 overrideCJK => \&overrideCJK,
703 overrideHangul => \&overrideHangul,
704 preprocess => \&preprocess,
705 rearrange => \@charList,
706 table => $filename,
707 undefName => qr/$undefName/,
708 undefChar => qr/$undefChar/,
709 upper_before_lower => $bool,
710 );
a7fbee98 711 # if %tailoring is false (i.e. empty),
5398038e 712 # $Collator should do the default collation.
45394607 713
714=over 4
715
0116f5dc 716=item UCA_Version
717
718If the version number of the older UCA is given,
719the older behavior of that version is emulated on collating.
720If omitted, the return value of C<UCA_Version()> is used.
721
722The supported version: 8 or 9.
723
724B<This parameter may be removed in the future version,
725as switching the algorithm would affect the performance.>
726
45394607 727=item alternate
728
caffd4cf 729-- see 3.2.2 Variable Weighting, UTR #10.
730
731(the title in UCA version 8: Alternate Weighting)
45394607 732
a7fbee98 733This key allows to alternate weighting for variable collation elements,
734which are marked with an ASTERISK in the table
735(NOTE: Many punction marks and symbols are variable in F<allkeys.txt>).
736
737 alternate => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
45394607 738
a7fbee98 739These names are case-insensitive.
45394607 740By default (if specification is omitted), 'shifted' is adopted.
741
a7fbee98 742 'Blanked' Variable elements are ignorable at levels 1 through 3;
743 considered at the 4th level.
744
745 'Non-ignorable' Variable elements are not reset to ignorable.
746
747 'Shifted' Variable elements are ignorable at levels 1 through 3
748 their level 4 weight is replaced by the old level 1 weight.
749 Level 4 weight for Non-Variable elements is 0xFFFF.
750
751 'Shift-Trimmed' Same as 'shifted', but all FFFF's at the 4th level
752 are trimmed.
753
45394607 754=item backwards
755
756-- see 3.1.2 French Accents, UTR #10.
757
758 backwards => $levelNumber or \@levelNumbers
759
760Weights in reverse order; ex. level 2 (diacritic ordering) in French.
761If omitted, forwards at all the levels.
762
763=item entry
764
765-- see 3.1 Linguistic Features; 3.2.1 File Format, UTR #10.
766
a7fbee98 767Overrides a default order or defines additional collation elements
45394607 768
769 entry => <<'ENTRIES', # use the UCA file format
a7fbee98 77000E6 ; [.0861.0020.0002.00E6] [.08B1.0020.0002.00E6] # ligature <ae> as <a><e>
45394607 7710063 0068 ; [.0893.0020.0002.0063] # "ch" in traditional Spanish
7720043 0068 ; [.0893.0020.0008.0043] # "Ch" in traditional Spanish
773ENTRIES
774
775=item ignoreName
776
777=item ignoreChar
778
caffd4cf 779-- see Completely Ignorable, 3.2.2 Variable Weighting, UTR #10.
45394607 780
caffd4cf 781Makes the entry in the table completely ignorable;
782i.e. as if the weights were zero at all level.
45394607 783
a7fbee98 784E.g. when 'a' and 'e' are ignorable,
45394607 785'element' is equal to 'lament' (or 'lmnt').
786
787=item level
788
789-- see 4.3 Form a sort key for each string, UTR #10.
790
791Set the maximum level.
792Any higher levels than the specified one are ignored.
793
794 Level 1: alphabetic ordering
795 Level 2: diacritic ordering
796 Level 3: case ordering
797 Level 4: tie-breaking (e.g. in the case when alternate is 'shifted')
798
799 ex.level => 2,
800
a7fbee98 801If omitted, the maximum is the 4th.
802
45394607 803=item normalization
804
805-- see 4.1 Normalize each input string, UTR #10.
806
905aa9f0 807If specified, strings are normalized before preparation of sort keys
45394607 808(the normalization is executed after preprocess).
809
810As a form name, one of the following names must be used.
811
812 'C' or 'NFC' for Normalization Form C
813 'D' or 'NFD' for Normalization Form D
814 'KC' or 'NFKC' for Normalization Form KC
815 'KD' or 'NFKD' for Normalization Form KD
816
817If omitted, the string is put into Normalization Form D.
818
a7fbee98 819If C<undef> is passed explicitly as the value for this key,
45394607 820any normalization is not carried out (this may make tailoring easier
821if any normalization is not desired).
822
823see B<CAVEAT>.
824
825=item overrideCJK
826
45394607 827-- see 7.1 Derived Collation Elements, UTR #10.
828
829By default, mapping of CJK Unified Ideographs
a7fbee98 830uses the Unicode codepoint order.
831But the mapping of CJK Unified Ideographs may be overrided.
45394607 832
a7fbee98 833ex. CJK Unified Ideographs in the JIS code point order.
45394607 834
835 overrideCJK => sub {
a7fbee98 836 my $u = shift; # get a Unicode codepoint
837 my $b = pack('n', $u); # to UTF-16BE
838 my $s = your_unicode_to_sjis_converter($b); # convert
839 my $n = unpack('n', $s); # convert sjis to short
840 [ $n, 0x20, 0x2, $u ]; # return the collation element
45394607 841 },
842
a7fbee98 843ex. ignores all CJK Unified Ideographs.
844
845 overrideCJK => sub {()}, # CODEREF returning empty list
846
847 # where ->eq("Pe\x{4E00}rl", "Perl") is true
848 # as U+4E00 is a CJK Unified Ideograph and to be ignorable.
849
850If C<undef> is passed explicitly as the value for this key,
851weights for CJK Unified Ideographs are treated as undefined.
852But assignment of weight for CJK Unified Ideographs
853in table or L<entry> is still valid.
854
855=item overrideHangul
856
857-- see 7.1 Derived Collation Elements, UTR #10.
858
859By default, Hangul Syllables are decomposed into Hangul Jamo.
860But the mapping of Hangul Syllables may be overrided.
861
862This tag works like L<overrideCJK>, so see there for examples.
863
45394607 864If you want to override the mapping of Hangul Syllables,
865the Normalization Forms D and KD are not appropriate
866(they will be decomposed before overriding).
867
a7fbee98 868If C<undef> is passed explicitly as the value for this key,
869weight for Hangul Syllables is treated as undefined
870without decomposition into Hangul Jamo.
871But definition of weight for Hangul Syllables
872in table or L<entry> is still valid.
873
45394607 874=item preprocess
875
876-- see 5.1 Preprocessing, UTR #10.
877
878If specified, the coderef is used to preprocess
879before the formation of sort keys.
880
a7fbee98 881ex. dropping English articles, such as "a" or "the".
45394607 882Then, "the pen" is before "a pencil".
883
884 preprocess => sub {
885 my $str = shift;
a7fbee98 886 $str =~ s/\b(?:an?|the)\s+//gi;
45394607 887 $str;
888 },
889
890=item rearrange
891
892-- see 3.1.3 Rearrangement, UTR #10.
893
894Characters that are not coded in logical order and to be rearranged.
a7fbee98 895By default,
45394607 896
897 rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
898
a7fbee98 899If you want to disallow any rearrangement,
900pass C<undef> or C<[]> (a reference to an empty list)
901as the value for this key.
902
0116f5dc 903B<According to the version 9 of UCA, this parameter shall not be used;
904but it is not warned at present.>
905
45394607 906=item table
907
908-- see 3.2 Default Unicode Collation Element Table, UTR #10.
909
910You can use another element table if desired.
911The table file must be in your C<lib/Unicode/Collate> directory.
912
913By default, the file C<lib/Unicode/Collate/allkeys.txt> is used.
914
a7fbee98 915If C<undef> is passed explicitly as the value for this key,
916no file is read (but you can define collation elements via L<entry>).
917
918A typical way to define a collation element table
919without any file of table:
920
921 $onlyABC = Unicode::Collate->new(
922 table => undef,
923 entry => << 'ENTRIES',
9240061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A
9250041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A
9260062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B
9270042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B
9280063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C
9290043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C
930ENTRIES
931 );
905aa9f0 932
45394607 933=item undefName
934
935=item undefChar
936
937-- see 6.3.4 Reducing the Repertoire, UTR #10.
938
939Undefines the collation element as if it were unassigned in the table.
940This reduces the size of the table.
941If an unassigned character appears in the string to be collated,
942the sort key is made from its codepoint
943as a single-character collation element,
944as it is greater than any other assigned collation elements
945(in the codepoint order among the unassigned characters).
946But, it'd be better to ignore characters
947unfamiliar to you and maybe never used.
948
949=item katakana_before_hiragana
950
951=item upper_before_lower
952
953-- see 6.6 Case Comparisons; 7.3.1 Tertiary Weight Table, UTR #10.
954
955By default, lowercase is before uppercase
956and hiragana is before katakana.
957
a7fbee98 958If the tag is made true, this is reversed.
959
960B<NOTE>: These tags simplemindedly assume
961any lowercase/uppercase or hiragana/katakana distinctions
962should occur in level 3, and their weights at level 3
963should be same as those mentioned in 7.3.1, UTR #10.
964If you define your collation elements which violates this,
965these tags doesn't work validly.
45394607 966
967=back
968
3164dd77 969=head2 Methods for Collation
45394607 970
971=over 4
972
5398038e 973=item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
45394607 974
975Sorts a list of strings.
976
5398038e 977=item C<$result = $Collator-E<gt>cmp($a, $b)>
45394607 978
979Returns 1 (when C<$a> is greater than C<$b>)
980or 0 (when C<$a> is equal to C<$b>)
981or -1 (when C<$a> is lesser than C<$b>).
982
5398038e 983=item C<$result = $Collator-E<gt>eq($a, $b)>
984
985=item C<$result = $Collator-E<gt>ne($a, $b)>
986
987=item C<$result = $Collator-E<gt>lt($a, $b)>
988
989=item C<$result = $Collator-E<gt>le($a, $b)>
990
991=item C<$result = $Collator-E<gt>gt($a, $b)>
992
993=item C<$result = $Collator-E<gt>ge($a, $b)>
994
a7fbee98 995They works like the same name operators as theirs.
5398038e 996
997 eq : whether $a is equal to $b.
998 ne : whether $a is not equal to $b.
999 lt : whether $a is lesser than $b.
1000 le : whether $a is lesser than $b or equal to $b.
1001 gt : whether $a is greater than $b.
1002 ge : whether $a is greater than $b or equal to $b.
1003
1004=item C<$sortKey = $Collator-E<gt>getSortKey($string)>
45394607 1005
1006-- see 4.3 Form a sort key for each string, UTR #10.
1007
1008Returns a sort key.
1009
1010You compare the sort keys using a binary comparison
1011and get the result of the comparison of the strings using UCA.
1012
5398038e 1013 $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
45394607 1014
1015 is equivalent to
1016
5398038e 1017 $Collator->cmp($a, $b)
45394607 1018
a7fbee98 1019=item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)>
1020
a7fbee98 1021 use Unicode::Collate;
1022 my $c = Unicode::Collate->new();
1023 print $c->viewSortKey("Perl"),"\n";
1024
0116f5dc 1025 # output:
1026 # [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF]
1027 # Level 1 Level 2 Level 3 Level 4
1028
1029 (If C<UCA_Version> is 8, the output is slightly different.)
a7fbee98 1030
5398038e 1031=item C<$position = $Collator-E<gt>index($string, $substring)>
d16e9e3d 1032
5398038e 1033=item C<($position, $length) = $Collator-E<gt>index($string, $substring)>
d16e9e3d 1034
1035-- see 6.8 Searching, UTR #10.
1036
1037If C<$substring> matches a part of C<$string>, returns
1038the position of the first occurrence of the matching part in scalar context;
1039in list context, returns a two-element list of
1040the position and the length of the matching part.
1041
1042B<Notice> that the length of the matching part may differ from
1043the length of C<$substring>.
1044
1045B<Note> that the position and the length are counted on the string
1046after the process of preprocess, normalization, and rearrangement.
1047Therefore, in case the specified string is not binary equal to
1048the preprocessed/normalized/rearranged string, the position and the length
a7fbee98 1049may differ form those on the specified string. But it is guaranteed
d16e9e3d 1050that, if matched, it returns a non-negative value as C<$position>.
1051
1052If C<$substring> does not match any part of C<$string>,
1053returns C<-1> in scalar context and
1054an empty list in list context.
1055
1056e.g. you say
1057
5398038e 1058 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
d16e9e3d 1059 my $str = "Ich mu\x{00DF} studieren.";
1060 my $sub = "m\x{00FC}ss";
1061 my $match;
a7fbee98 1062 if (my($pos,$len) = $Collator->index($str, $sub)) {
5398038e 1063 $match = substr($str, $pos, $len);
d16e9e3d 1064 }
1065
1066and get C<"mu\x{00DF}"> in C<$match> since C<"mu>E<223>C<">
1067is primary equal to C<"m>E<252>C<ss">.
1068
45394607 1069=back
1070
3164dd77 1071=head2 Other Methods
1072
1073=over 4
1074
0116f5dc 1075=item C<%old_tailoring = $Collator-E<gt>change(%new_tailoring)>
1076
1077Change the value of specified keys and returns the changed part.
1078
1079 $Collator = Unicode::Collate->new(level => 4);
1080
1081 $Collator->eq("perl", "PERL"); # false
1082
1083 %old = $Collator->change(level => 2); # returns (level => 4).
1084
1085 $Collator->eq("perl", "PERL"); # true
1086
1087 $Collator->change(%old); # returns (level => 2).
1088
1089 $Collator->eq("perl", "PERL"); # false
1090
1091Not all C<(key,value)>s are allowed to be changed.
1092See also C<@Unicode::Collate::ChangeOK> and C<@Unicode::Collate::ChangeNG>.
1093
1094In the scalar context, returns the modified collator
1095(but it is B<not> a clone from the original).
1096
1097 $Collator->change(level => 2)->eq("perl", "PERL"); # true
1098
1099 $Collator->eq("perl", "PERL"); # true; now max level is 2nd.
1100
1101 $Collator->change(level => 4)->eq("perl", "PERL"); # false
1102
3164dd77 1103=item UCA_Version
1104
1105Returns the version number of Unicode Technical Standard 10
1106this module consults.
1107
1108=item Base_Unicode_Version
1109
1110Returns the version number of the Unicode Standard
1111this module is based on.
1112
1113=back
1114
45394607 1115=head2 EXPORT
1116
1117None by default.
1118
4a2e806c 1119=head2 TODO
1120
1121Unicode::Collate has not been ported to EBCDIC. The code mostly would
1122work just fine but a decision needs to be made: how the module should
1123work in EBCDIC? Should the low 256 characters be understood as
1124Unicode or as EBCDIC code points? Should one be chosen or should
1125there be a way to do either? Or should such translation be left
1126outside the module for the user to do, for example by using
1127Encode::from_to()?
a7fbee98 1128(or utf8::unicode_to_native()/utf8::native_to_unicode()?)
4a2e806c 1129
45394607 1130=head2 CAVEAT
1131
1132Use of the C<normalization> parameter requires
1133the B<Unicode::Normalize> module.
1134
5398038e 1135If you need not it (say, in the case when you need not
45394607 1136handle any combining characters),
1137assign C<normalization =E<gt> undef> explicitly.
1138
5398038e 1139-- see 6.5 Avoiding Normalization, UTR #10.
1140
0116f5dc 1141=head2 Conformance Test
1142
1143The Conformance Test for the UCA is provided
1144in L<http://www.unicode.org/reports/tr10/CollationTest.html>
1145and L<http://www.unicode.org/reports/tr10/CollationTest.zip>
1146
1147For F<CollationTest_SHIFTED.txt>,
1148a collator via C<Unicode::Collate-E<gt>new( )> should be used;
1149for F<CollationTest_NON_IGNORABLE.txt>, a collator via
1150C<Unicode::Collate-E<gt>new(alternate =E<gt> "non-ignorable", level =E<gt> 3)>.
1151
1152B<Unicode::Normalize is required to try this test.>
1153
a7fbee98 1154=head2 BUGS
1155
1156C<index()> is an experimental method and
1157its return value may be unreliable.
1158The correct implementation for C<index()> must be based
1159on Locale-Sensitive Support: Level 3 in UTR #18,
1160F<Unicode Regular Expression Guidelines>.
1161
1162See also 4.2 Locale-Dependent Graphemes in UTR #18.
1163
45394607 1164=head1 AUTHOR
1165
1166SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
1167
1168 http://homepage1.nifty.com/nomenclator/perl/
1169
a7fbee98 1170 Copyright(C) 2001-2002, SADAHIRO Tomoyuki. Japan. All rights reserved.
45394607 1171
a7fbee98 1172 This library is free software; you can redistribute it
1173 and/or modify it under the same terms as Perl itself.
45394607 1174
1175=head1 SEE ALSO
1176
1177=over 4
1178
0116f5dc 1179=item http://www.unicode.org/reports/tr10/
45394607 1180
a7fbee98 1181Unicode Collation Algorithm - UTR #10
45394607 1182
0116f5dc 1183=item http://www.unicode.org/reports/tr10/allkeys.txt
a7fbee98 1184
1185The Default Unicode Collation Element Table
45394607 1186
0116f5dc 1187=item http://www.unicode.org/reports/tr10/CollationTest.html
1188http://www.unicode.org/reports/tr10/CollationTest.zip
1189
1190The latest versions of the conformance test for the UCA
1191
1192=item http://www.unicode.org/reports/tr15/
a7fbee98 1193
1194Unicode Normalization Forms - UAX #15
1195
0116f5dc 1196=item http://www.unicode.org/reports/tr18
a7fbee98 1197
1198Unicode Regular Expression Guidelines - UTR #18
1199
1200=item L<Unicode::Normalize>
45394607 1201
45394607 1202=back
1203
1204=cut