On a real multicpu box the child thread could still be running
[p5sagit/p5-mst-13.2.git] / lib / Unicode / Collate.pm
CommitLineData
45394607 1package Unicode::Collate;
2
4a2e806c 3BEGIN {
9f1f04a1 4 unless ("A" eq pack('U', 0x41) || "A" eq pack('U', ord("A"))) {
5 die "Unicode::Collate cannot stringify a Unicode code point\n";
4a2e806c 6 }
7}
8
45394607 9use 5.006;
10use strict;
11use warnings;
12use Carp;
e69a2255 13use File::Spec;
5398038e 14
45394607 15require Exporter;
16
9f1f04a1 17our $VERSION = '0.24';
45394607 18our $PACKAGE = __PACKAGE__;
19
20our @ISA = qw(Exporter);
21
22our %EXPORT_TAGS = ();
23our @EXPORT_OK = ();
24our @EXPORT = ();
25
26(our $Path = $INC{'Unicode/Collate.pm'}) =~ s/\.pm$//;
27our $KeyFile = "allkeys.txt";
28
3164dd77 29our $UNICODE_VERSION;
30
327745dc 31eval { require Unicode::UCD };
32
33unless ($@) {
34 $UNICODE_VERSION = Unicode::UCD::UnicodeVersion();
35}
9f1f04a1 36else { # Perl 5.6.1
3164dd77 37 my($f, $fh);
38 foreach my $d (@INC) {
3164dd77 39 $f = File::Spec->catfile($d, "unicode", "Unicode.301");
40 if (open($fh, $f)) {
327745dc 41 $UNICODE_VERSION = '3.0.1';
3164dd77 42 close $fh;
43 last;
44 }
45 }
46}
47
4d36a948 48# Perl's boolean
49use constant TRUE => 1;
50use constant FALSE => "";
51use constant NOMATCHPOS => -1;
52
53# A coderef to get combining class imported from Unicode::Normalize
54# (i.e. \&Unicode::Normalize::getCombinClass).
55# This is also used as a HAS_UNICODE_NORMALIZE flag.
56our $getCombinClass;
57
9f1f04a1 58# Supported Levels
59use constant MinLevel => 1;
60use constant MaxLevel => 4;
61
4d36a948 62# Minimum weights at level 2 and 3, respectively
9f1f04a1 63use constant Min2Wt => 0x20;
64use constant Min3Wt => 0x02;
4d36a948 65
66# Shifted weight at 4th level
9f1f04a1 67use constant Shift4Wt => 0xFFFF;
4d36a948 68
69# Variable weight at 1st level.
70# This is a negative value but should be regarded as zero on collation.
71# This is for distinction of variable chars from level 3 ignorable chars.
9f1f04a1 72use constant Var1Wt => -1;
4d36a948 73
74
75# A boolean for Variable and 16-bit weights at 4 levels of Collation Element
76# PROBLEM: The Default Unicode Collation Element Table
77# has weights over 0xFFFF at the 4th level.
78# The tie-breaking in the variable weights
79# other than "shift" (as well as "shift-trimmed") is unreliable.
80use constant VCE_TEMPLATE => 'Cn4';
81
4d36a948 82# A sort key: 16-bit weights
83# See also the PROBLEM on VCE_TEMPLATE above.
84use constant KEY_TEMPLATE => 'n*';
85
86# Level separator in a sort key:
87# i.e. pack(KEY_TEMPLATE, 0)
88use constant LEVEL_SEP => "\0\0";
89
90# As Unicode code point separator for hash keys.
91# A joined code point string (denoted by JCPS below)
92# like "65;768" is used for internal processing
93# instead of Perl's Unicode string like "\x41\x{300}",
94# as the native code point is different from the Unicode code point
95# on EBCDIC platform.
96# This character must not be included in any stringified
97# representation of an integer.
98use constant CODE_SEP => ';';
99
100# boolean values of variable weights
0116f5dc 101use constant NON_VAR => 0; # Non-Variable character
102use constant VAR => 1; # Variable character
3164dd77 103
4d36a948 104# Logical_Order_Exception in PropList.txt
105# TODO: synchronization with change of PropList.txt.
0116f5dc 106our $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
a7fbee98 107
0116f5dc 108sub UCA_Version { "9" }
a7fbee98 109
0116f5dc 110sub Base_Unicode_Version { $UNICODE_VERSION || 'unknown' }
a7fbee98 111
9f1f04a1 112######
113
114use constant UNICODE_FOR_PACK => ("A" eq pack('U', 0x41));
115use constant NATIVE_FOR_PACK => ("A" eq pack('U', ord("A")));
116
117use constant UNICODE_FOR_UNPACK => (0x41 == unpack('U', "A"));
118use constant NATIVE_FOR_UNPACK => (ord("A") == unpack('U', "A"));
119
120sub pack_U {
121 return UNICODE_FOR_PACK
122 ? pack('U*', @_)
123 : NATIVE_FOR_PACK
124 ? pack('U*', map utf8::unicode_to_native($_), @_)
125 : die "$PACKAGE, a Unicode code point cannot be stringified.\n";
126}
127
128sub unpack_U {
129 return UNICODE_FOR_UNPACK
130 ? unpack('U*', shift)
131 : NATIVE_FOR_UNPACK
132 ? map(utf8::native_to_unicode($_), unpack 'U*', shift)
133 : die "$PACKAGE, a code point returned from unpack U " .
134 "cannot be converted into Unicode.\n";
135}
136
137######
138
0116f5dc 139my (%AlternateOK);
140@AlternateOK{ qw/
141 blanked non-ignorable shifted shift-trimmed
142 / } = ();
143
144our @ChangeOK = qw/
145 alternate backwards level normalization rearrange
146 katakana_before_hiragana upper_before_lower
147 overrideHangul overrideCJK preprocess UCA_Version
148 /;
149
150our @ChangeNG = qw/
9f1f04a1 151 entry entries table maxlength
0116f5dc 152 ignoreChar ignoreName undefChar undefName
153 versionTable alternateTable backwardsTable forwardsTable rearrangeTable
4d36a948 154 derivCode normCode rearrangeHash L3_ignorable
9f1f04a1 155 backwardsFlag
0116f5dc 156 /;
9f1f04a1 157# The hash key 'ignored' is deleted at v 0.21.
158# The hash key 'isShift' is deleted at v 0.23.
159# The hash key 'combining' is deleted at v 0.24.
0116f5dc 160
161my (%ChangeOK, %ChangeNG);
162@ChangeOK{ @ChangeOK } = ();
163@ChangeNG{ @ChangeNG } = ();
164
165sub change {
166 my $self = shift;
167 my %hash = @_;
168 my %old;
169 foreach my $k (keys %hash) {
170 if (exists $ChangeOK{$k}) {
171 $old{$k} = $self->{$k};
172 $self->{$k} = $hash{$k};
173 }
174 elsif (exists $ChangeNG{$k}) {
175 croak "change of $k via change() is not allowed!";
176 }
177 # else => ignored
178 }
179 $self->checkCollator;
180 return wantarray ? %old : $self;
181}
a7fbee98 182
9f1f04a1 183sub _checkLevel {
184 my $level = shift;
185 my $key = shift;
186 croak sprintf "Illegal level %d (in \$self->{%s}) lower than %d.",
187 $level, $key, MinLevel if MinLevel > $level;
188 croak sprintf "Unsupported level %d (in \$self->{%s}) higher than %d ",
189 $level, $key, MaxLevel if MaxLevel < $level;
190}
191
0116f5dc 192sub checkCollator {
193 my $self = shift;
9f1f04a1 194 _checkLevel($self->{level}, "level");
a7fbee98 195
0116f5dc 196 $self->{derivCode} =
4d36a948 197 $self->{UCA_Version} == 8 ? \&_derivCE_8 :
198 $self->{UCA_Version} == 9 ? \&_derivCE_9 :
0116f5dc 199 croak "Illegal UCA version (passed $self->{UCA_Version}).";
a7fbee98 200
0116f5dc 201 $self->{alternate} = lc($self->{alternate});
202 croak "$PACKAGE unknown alternate tag name: $self->{alternate}"
203 unless exists $AlternateOK{ $self->{alternate} };
204
9f1f04a1 205 if (! defined $self->{backwards}) {
206 $self->{backwardsFlag} = 0;
207 }
208 elsif (! ref $self->{backwards}) {
209 _checkLevel($self->{backwards}, "backwards");
210 $self->{backwardsFlag} = 1 << $self->{backwards};
211 }
212 else {
213 my %level;
214 $self->{backwardsFlag} = 0;
215 for my $b (@{ $self->{backwards} }) {
216 _checkLevel($b, "backwards");
217 $level{$b} = 1;
218 }
219 for my $v (sort keys %level) {
220 $self->{backwardsFlag} += 1 << $v;
221 }
222 }
0116f5dc 223
224 $self->{rearrange} = []
225 if ! defined $self->{rearrange};
226 croak "$PACKAGE: A list for rearrangement must be store in an ARRAYREF"
227 if ! ref $self->{rearrange};
228
229 # keys of $self->{rearrangeHash} are $self->{rearrange}.
230 $self->{rearrangeHash} = undef;
231
232 if (@{ $self->{rearrange} }) {
233 @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
234 }
235
236 $self->{normCode} = undef;
a7fbee98 237
238 if (defined $self->{normalization}) {
239 eval { require Unicode::Normalize };
240 croak "Unicode/Normalize.pm is required to normalize strings: $@"
241 if $@;
242
243 Unicode::Normalize->import();
244 $getCombinClass = \&Unicode::Normalize::getCombinClass
245 if ! $getCombinClass;
246
0116f5dc 247 $self->{normCode} =
a7fbee98 248 $self->{normalization} =~ /^(?:NF)?C$/ ? \&NFC :
249 $self->{normalization} =~ /^(?:NF)?D$/ ? \&NFD :
250 $self->{normalization} =~ /^(?:NF)?KC$/ ? \&NFKC :
251 $self->{normalization} =~ /^(?:NF)?KD$/ ? \&NFKD :
252 croak "$PACKAGE unknown normalization form name: "
253 . $self->{normalization};
254 }
0116f5dc 255 return;
256}
257
258sub new
259{
260 my $class = shift;
261 my $self = bless { @_ }, $class;
45394607 262
a7fbee98 263 # If undef is passed explicitly, no file is read.
0116f5dc 264 $self->{table} = $KeyFile if ! exists $self->{table};
265 $self->read_table if defined $self->{table};
905aa9f0 266
a7fbee98 267 if ($self->{entry}) {
268 $self->parseEntry($_) foreach split /\n/, $self->{entry};
269 }
905aa9f0 270
9f1f04a1 271 $self->{level} ||= MaxLevel;
0116f5dc 272 $self->{UCA_Version} ||= UCA_Version();
905aa9f0 273
0116f5dc 274 $self->{overrideHangul} = ''
275 if ! exists $self->{overrideHangul};
276 $self->{overrideCJK} = ''
277 if ! exists $self->{overrideCJK};
278 $self->{normalization} = 'D'
279 if ! exists $self->{normalization};
280 $self->{alternate} = $self->{alternateTable} || 'shifted'
281 if ! exists $self->{alternate};
282 $self->{rearrange} = $self->{rearrangeTable} || $DefaultRearrange
a7fbee98 283 if ! exists $self->{rearrange};
0116f5dc 284 $self->{backwards} = $self->{backwardsTable}
285 if ! exists $self->{backwards};
a7fbee98 286
0116f5dc 287 $self->checkCollator;
a7fbee98 288
289 return $self;
290}
905aa9f0 291
292sub read_table {
a7fbee98 293 my $self = shift;
294 my $file = $self->{table} ne '' ? $self->{table} : $KeyFile;
295
e69a2255 296 my $filepath = File::Spec->catfile($Path, $file);
297 open my $fk, "<$filepath"
298 or croak "File does not exist at $filepath";
a7fbee98 299
300 while (<$fk>) {
301 next if /^\s*#/;
302 if (/^\s*\@/) {
0116f5dc 303 if (/^\s*\@version\s*(\S*)/) {
304 $self->{versionTable} ||= $1;
305 }
306 elsif (/^\s*\@alternate\s+(\S*)/) {
307 $self->{alternateTable} ||= $1;
a7fbee98 308 }
0116f5dc 309 elsif (/^\s*\@backwards\s+(\S*)/) {
310 push @{ $self->{backwardsTable} }, $1;
a7fbee98 311 }
0116f5dc 312 elsif (/^\s*\@forwards\s+(\S*)/) { # parhaps no use
313 push @{ $self->{forwardsTable} }, $1;
a7fbee98 314 }
0116f5dc 315 elsif (/^\s*\@rearrange\s+(.*)/) { # (\S*) is NG
316 push @{ $self->{rearrangeTable} }, _getHexArray($1);
a7fbee98 317 }
318 next;
319 }
320 $self->parseEntry($_);
45394607 321 }
a7fbee98 322 close $fk;
45394607 323}
324
905aa9f0 325
45394607 326##
327## get $line, parse it, and write an entry in $self
328##
329sub parseEntry
330{
a7fbee98 331 my $self = shift;
332 my $line = shift;
4d36a948 333 my($name, $entry, @uv, @key);
a7fbee98 334
335 return if $line !~ /^\s*[0-9A-Fa-f]/;
336
337 # removes comment and gets name
338 $name = $1
339 if $line =~ s/[#%]\s*(.*)//;
340 return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
341
342 # gets element
343 my($e, $k) = split /;/, $line;
344 croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
345 if ! $k;
346
4d36a948 347 @uv = _getHexArray($e);
348 return if !@uv;
349
350 $entry = join(CODE_SEP, @uv); # in JCPS
0116f5dc 351
4d36a948 352 if (defined $self->{undefChar} || defined $self->{ignoreChar}) {
9f1f04a1 353 my $ele = pack_U(@uv);
a7fbee98 354
4d36a948 355 # regarded as if it were not entried in the table
356 return
357 if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
caffd4cf 358
4d36a948 359 # replaced as completely ignorable
360 $k = '[.0000.0000.0000.0000]'
361 if defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/;
45394607 362 }
0116f5dc 363
4d36a948 364 # replaced as completely ignorable
365 $k = '[.0000.0000.0000.0000]'
366 if defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/;
367
4d36a948 368 my $is_L3_ignorable;
369
caffd4cf 370 foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
371 my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
4d36a948 372 my @wt = _getHexArray($arr);
373 push @key, pack(VCE_TEMPLATE, $var, @wt);
4d36a948 374 $is_L3_ignorable = TRUE
375 if $wt[0] + $wt[1] + $wt[2] == 0;
376 # if $arr !~ /[1-9A-Fa-f]/; NG
377 # Conformance Test shows L3-ignorable is completely ignorable.
a7fbee98 378 }
caffd4cf 379
4d36a948 380 $self->{entries}{$entry} = \@key;
caffd4cf 381
4d36a948 382 $self->{L3_ignorable}{$uv[0]} = TRUE
383 if @uv == 1 && $is_L3_ignorable;
caffd4cf 384
4d36a948 385 # Contraction is to be considered in the range of this maxlength.
386 $self->{maxlength}{$uv[0]} = scalar @uv
387 if @uv > 1;
45394607 388}
389
9f1f04a1 390
45394607 391##
9f1f04a1 392## arrayref[weights] = altCE(VCE)
45394607 393##
d16e9e3d 394sub altCE
45394607 395{
a7fbee98 396 my $self = shift;
4d36a948 397 my($var, @wt) = unpack(VCE_TEMPLATE, shift);
a7fbee98 398
399 $self->{alternate} eq 'blanked' ?
9f1f04a1 400 $var ? [Var1Wt, 0, 0, $wt[3]] : \@wt :
a7fbee98 401 $self->{alternate} eq 'non-ignorable' ?
4d36a948 402 \@wt :
a7fbee98 403 $self->{alternate} eq 'shifted' ?
9f1f04a1 404 $var ? [Var1Wt, 0, 0, $wt[0] ]
405 : [ @wt[0..2], $wt[0]+$wt[1]+$wt[2] ? Shift4Wt : 0 ] :
a7fbee98 406 $self->{alternate} eq 'shift-trimmed' ?
9f1f04a1 407 $var ? [Var1Wt, 0, 0, $wt[0] ] : [ @wt[0..2], 0 ] :
a7fbee98 408 croak "$PACKAGE unknown alternate name: $self->{alternate}";
45394607 409}
410
45394607 411sub viewSortKey
412{
a7fbee98 413 my $self = shift;
9f1f04a1 414 $self->visualizeSortKey($self->getSortKey(@_));
415}
0116f5dc 416
9f1f04a1 417sub visualizeSortKey
418{
419 my $self = shift;
420 my $view = join " ", map sprintf("%04X", $_), unpack(KEY_TEMPLATE, shift);
4d36a948 421
9f1f04a1 422 if ($self->{UCA_Version} <= 8) {
0116f5dc 423 $view =~ s/ ?0000 ?/|/g;
424 } else {
425 $view =~ s/\b0000\b/|/g;
426 }
a7fbee98 427 return "[$view]";
45394607 428}
429
d16e9e3d 430
45394607 431##
4d36a948 432## arrayref of JCPS = splitCE(string to be collated)
433## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitCE(string, true)
45394607 434##
d16e9e3d 435sub splitCE
45394607 436{
a7fbee98 437 my $self = shift;
4d36a948 438 my $wLen = $_[1];
439
a7fbee98 440 my $code = $self->{preprocess};
0116f5dc 441 my $norm = $self->{normCode};
a7fbee98 442 my $ent = $self->{entries};
443 my $max = $self->{maxlength};
444 my $reH = $self->{rearrangeHash};
4d36a948 445 my $ign = $self->{L3_ignorable};
0116f5dc 446 my $ver9 = $self->{UCA_Version} > 8;
a7fbee98 447
4d36a948 448 my ($str, @buf);
a7fbee98 449
4d36a948 450 if ($wLen) {
451 $code and croak "Preprocess breaks character positions. "
452 . "Don't use with index(), match(), etc.";
453 $norm and croak "Normalization breaks character positions. "
454 . "Don't use with index(), match(), etc.";
455 $str = $_[0];
456 }
457 else {
458 $str = $_[0];
459 $str = &$code($str) if ref $code;
460 $str = &$norm($str) if ref $norm;
461 }
a7fbee98 462
4d36a948 463 # get array of Unicode code point of string.
9f1f04a1 464 my @src = unpack_U($str);
4d36a948 465
466 # rearrangement:
467 # Character positions are not kept if rearranged,
468 # then neglected if $wLen is true.
469 if ($reH && ! $wLen) {
a7fbee98 470 for (my $i = 0; $i < @src; $i++) {
471 if (exists $reH->{ $src[$i] } && $i + 1 < @src) {
472 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]);
473 $i++;
474 }
475 }
45394607 476 }
45394607 477
0116f5dc 478 if ($ver9) {
4d36a948 479 # To remove a character marked as a completely ignorable.
480 for (my $i = 0; $i < @src; $i++) {
481 $src[$i] = undef if $ign->{ $src[$i] };
482 }
0116f5dc 483 }
484
a7fbee98 485 for (my $i = 0; $i < @src; $i++) {
4d36a948 486 next if _isNonCharacter($src[$i]);
487
488 my $i_orig = $i;
489 my $ce = $src[$i];
490
491 if ($max->{$ce}) { # contract
492 my $temp_ce = $ce;
493
494 for (my $p = $i + 1; $p < @src; $p++) {
495 next if ! defined $src[$p];
496 $temp_ce .= CODE_SEP . $src[$p];
497 if ($ent->{$temp_ce}) {
498 $ce = $temp_ce;
499 $i = $p;
500 }
501 }
502 }
503
504 # with Combining Char (UTS#10, 4.2.1).
505 # requires Unicode::Normalize.
506 # Not be $wLen, as not croaked due to $norm.
507 if ($getCombinClass) {
508 for (my $p = $i + 1; $p < @src; $p++) {
509 next if ! defined $src[$p];
510 last unless $getCombinClass->($src[$p]);
511 my $tail = CODE_SEP . $src[$p];
512 if ($ent->{$ce.$tail}) {
513 $ce .= $tail;
514 $src[$p] = undef;
515 }
a7fbee98 516 }
a7fbee98 517 }
518
4d36a948 519 if ($wLen) {
520 for (my $p = $i + 1; $p < @src; $p++) {
521 last if defined $src[$p];
522 $i = $p;
a7fbee98 523 }
524 }
4d36a948 525
526 push @buf, $wLen ? [$ce, $i_orig, $i + 1] : $ce;
45394607 527 }
4d36a948 528 return \@buf;
d16e9e3d 529}
45394607 530
d16e9e3d 531
532##
4d36a948 533## list of arrayrefs of weights = getWt(JCPS)
d16e9e3d 534##
535sub getWt
536{
a7fbee98 537 my $self = shift;
4d36a948 538 my $ce = shift;
a7fbee98 539 my $ent = $self->{entries};
a7fbee98 540 my $cjk = $self->{overrideCJK};
541 my $hang = $self->{overrideHangul};
0116f5dc 542 my $der = $self->{derivCode};
a7fbee98 543
4d36a948 544 return if !defined $ce;
545 return map($self->altCE($_), @{ $ent->{$ce} })
546 if $ent->{$ce};
0116f5dc 547
4d36a948 548 # CE must not be a contraction, then it's a code point.
549 my $u = $ce;
a7fbee98 550
4d36a948 551 if (0xAC00 <= $u && $u <= 0xD7A3) { # is Hangul Syllale
0116f5dc 552 return map $self->altCE($_),
553 $hang
4d36a948 554 ? map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$hang($u))
0116f5dc 555 : defined $hang
556 ? map({
4d36a948 557 $ent->{$_} ? @{ $ent->{$_} } : $der->($_);
0116f5dc 558 } _decompHangul($u))
559 : $der->($u);
a7fbee98 560 }
561 elsif (0x3400 <= $u && $u <= 0x4DB5 ||
562 0x4E00 <= $u && $u <= 0x9FA5 ||
0116f5dc 563 0x20000 <= $u && $u <= 0x2A6D6) { # CJK Ideograph
564 return map $self->altCE($_),
565 $cjk
4d36a948 566 ? map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$cjk($u))
0116f5dc 567 : defined $cjk && $self->{UCA_Version} <= 8 && $u < 0x10000
9f1f04a1 568 ? pack(VCE_TEMPLATE, NON_VAR, $u, Min2Wt, Min3Wt, $u)
0116f5dc 569 : $der->($u);
a7fbee98 570 }
571 else {
0116f5dc 572 return map $self->altCE($_), $der->($u);
a7fbee98 573 }
d16e9e3d 574}
575
d16e9e3d 576
577##
578## string sortkey = getSortKey(string arg)
579##
580sub getSortKey
581{
a7fbee98 582 my $self = shift;
583 my $lev = $self->{level};
4d36a948 584 my $rCE = $self->splitCE(shift); # get an arrayref of JCPS
0116f5dc 585 my $ver9 = $self->{UCA_Version} > 8;
4d36a948 586 my $v2i = $self->{alternate} ne 'non-ignorable';
a7fbee98 587
588 # weight arrays
0116f5dc 589 my (@buf, $last_is_variable);
590
4d36a948 591 foreach my $wt (map $self->getWt($_), @$rCE) {
592 if ($v2i && $ver9) {
593 if ($wt->[0] == 0) { # ignorable
594 next if $last_is_variable;
0116f5dc 595 } else {
9f1f04a1 596 $last_is_variable = ($wt->[0] == Var1Wt);
0116f5dc 597 }
598 }
4d36a948 599 push @buf, $wt;
0116f5dc 600 }
a7fbee98 601
602 # make sort key
603 my @ret = ([],[],[],[]);
604 foreach my $v (0..$lev-1) {
605 foreach my $b (@buf) {
4d36a948 606 push @{ $ret[$v] }, $b->[$v]
607 if 0 < $b->[$v];
a7fbee98 608 }
609 }
45394607 610
a7fbee98 611 # modification of tertiary weights
612 if ($self->{upper_before_lower}) {
613 foreach (@{ $ret[2] }) {
614 if (0x8 <= $_ && $_ <= 0xC) { $_ -= 6 } # lower
615 elsif (0x2 <= $_ && $_ <= 0x6) { $_ += 6 } # upper
616 elsif ($_ == 0x1C) { $_ += 1 } # square upper
617 elsif ($_ == 0x1D) { $_ -= 1 } # square lower
618 }
45394607 619 }
a7fbee98 620 if ($self->{katakana_before_hiragana}) {
621 foreach (@{ $ret[2] }) {
622 if (0x0F <= $_ && $_ <= 0x13) { $_ -= 2 } # katakana
623 elsif (0x0D <= $_ && $_ <= 0x0E) { $_ += 5 } # hiragana
624 }
45394607 625 }
9f1f04a1 626
627 if ($self->{backwardsFlag}) {
628 for (my $v = MinLevel; $v <= MaxLevel; $v++) {
629 if ($self->{backwardsFlag} & (1 << $v)) {
630 @{ $ret[$v-1] } = reverse @{ $ret[$v-1] };
631 }
632 }
633 }
634
4d36a948 635 join LEVEL_SEP, map pack(KEY_TEMPLATE, @$_), @ret;
45394607 636}
637
638
639##
d16e9e3d 640## int compare = cmp(string a, string b)
45394607 641##
5398038e 642sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
643sub eq { $_[0]->getSortKey($_[1]) eq $_[0]->getSortKey($_[2]) }
644sub ne { $_[0]->getSortKey($_[1]) ne $_[0]->getSortKey($_[2]) }
645sub lt { $_[0]->getSortKey($_[1]) lt $_[0]->getSortKey($_[2]) }
646sub le { $_[0]->getSortKey($_[1]) le $_[0]->getSortKey($_[2]) }
647sub gt { $_[0]->getSortKey($_[1]) gt $_[0]->getSortKey($_[2]) }
648sub ge { $_[0]->getSortKey($_[1]) ge $_[0]->getSortKey($_[2]) }
45394607 649
650##
d16e9e3d 651## list[strings] sorted = sort(list[strings] arg)
45394607 652##
a7fbee98 653sub sort {
654 my $obj = shift;
655 return
656 map { $_->[1] }
657 sort{ $a->[0] cmp $b->[0] }
658 map [ $obj->getSortKey($_), $_ ], @_;
45394607 659}
660
0116f5dc 661
4d36a948 662sub _derivCE_9 {
0116f5dc 663 my $u = shift;
664 my $base =
4d36a948 665 (0x4E00 <= $u && $u <= 0x9FA5)
666 ? 0xFB40 : # CJK
0116f5dc 667 (0x3400 <= $u && $u <= 0x4DB5 || 0x20000 <= $u && $u <= 0x2A6D6)
4d36a948 668 ? 0xFB80 # CJK ext.
669 : 0xFBC0; # others
0116f5dc 670
671 my $aaaa = $base + ($u >> 15);
672 my $bbbb = ($u & 0x7FFF) | 0x8000;
673 return
9f1f04a1 674 pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
675 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u);
0116f5dc 676}
677
4d36a948 678sub _derivCE_8 {
0116f5dc 679 my $code = shift;
680 my $aaaa = 0xFF80 + ($code >> 15);
681 my $bbbb = ($code & 0x7FFF) | 0x8000;
682 return
4d36a948 683 pack(VCE_TEMPLATE, NON_VAR, $aaaa, 2, 1, $code),
684 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $code);
45394607 685}
686
687##
688## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
689##
a7fbee98 690sub _getHexArray { map hex, $_[0] =~ /([0-9a-fA-F]+)/g }
5398038e 691
a7fbee98 692#
4d36a948 693# $code *must* be in Hangul syllable.
a7fbee98 694# Check it before you enter here.
695#
5398038e 696sub _decompHangul {
697 my $code = shift;
5398038e 698 my $SIndex = $code - 0xAC00;
699 my $LIndex = int( $SIndex / 588);
700 my $VIndex = int(($SIndex % 588) / 28);
701 my $TIndex = $SIndex % 28;
702 return (
a7fbee98 703 0x1100 + $LIndex,
704 0x1161 + $VIndex,
705 $TIndex ? (0x11A7 + $TIndex) : (),
5398038e 706 );
45394607 707}
708
4d36a948 709sub _isNonCharacter {
710 my $code = shift;
711 return ! defined $code # removed
712 || ($code < 0 || 0x10FFFF < $code) # out of range
713 || (($code & 0xFFFE) == 0xFFFE) # ??FFF[EF] (cf. utf8.c)
714 || (0xD800 <= $code && $code <= 0xDFFF) # unpaired surrogates
715 || (0xFDD0 <= $code && $code <= 0xFDEF) # other non-characters
716 ;
717}
718
719
720##
721## bool _nonIgnorAtLevel(arrayref weights, int level)
722##
723sub _nonIgnorAtLevel($$)
724{
725 my $wt = shift;
726 return if ! defined $wt;
727 my $lv = shift;
9f1f04a1 728 return grep($wt->[$_-1] != 0, MinLevel..$lv) ? TRUE : FALSE;
4d36a948 729}
730
731##
732## bool _eqArray(
733## arrayref of arrayref[weights] source,
734## arrayref of arrayref[weights] substr,
735## int level)
736## * comparison of graphemes vs graphemes.
737## @$source >= @$substr must be true (check it before call this);
738##
739sub _eqArray($$$)
740{
741 my $source = shift;
742 my $substr = shift;
743 my $lev = shift;
744
745 for my $g (0..@$substr-1){
746 # Do the $g'th graphemes have the same number of AV weigths?
747 return if @{ $source->[$g] } != @{ $substr->[$g] };
748
749 for my $w (0..@{ $substr->[$g] }-1) {
750 for my $v (0..$lev-1) {
751 return if $source->[$g][$w][$v] != $substr->[$g][$w][$v];
752 }
753 }
754 }
755 return 1;
756}
757
758##
759## (int position, int length)
760## int position = index(string, substring, position, [undoc'ed grobal])
761##
762## With "grobal" (only for the list context),
763## returns list of arrayref[position, length].
764##
765sub index
766{
767 my $self = shift;
768 my $str = shift;
769 my $len = length($str);
770 my $subCE = $self->splitCE(shift);
771 my $pos = @_ ? shift : 0;
772 $pos = 0 if $pos < 0;
773 my $grob = shift;
774
4d36a948 775 my $lev = $self->{level};
776 my $ver9 = $self->{UCA_Version} > 8;
777 my $v2i = $self->{alternate} ne 'non-ignorable';
778
779 if (! @$subCE) {
780 my $temp = $pos <= 0 ? 0 : $len <= $pos ? $len : $pos;
781 return $grob
782 ? map([$_, 0], $temp..$len)
783 : wantarray ? ($temp,0) : $temp;
784 }
785 if ($len < $pos) {
786 return wantarray ? () : NOMATCHPOS;
787 }
788 my $strCE = $self->splitCE($pos ? substr($str, $pos) : $str, TRUE);
789 if (! @$strCE) {
790 return wantarray ? () : NOMATCHPOS;
791 }
792 my $last_is_variable;
793 my(@strWt, @iniPos, @finPos, @subWt, @g_ret);
794
795 $last_is_variable = FALSE;
796 for my $wt (map $self->getWt($_), @$subCE) {
797 my $to_be_pushed = _nonIgnorAtLevel($wt,$lev);
798
799 if ($v2i && $ver9) {
800 if ($wt->[0] == 0) {
801 $to_be_pushed = FALSE if $last_is_variable;
802 } else {
9f1f04a1 803 $last_is_variable = ($wt->[0] == Var1Wt);
4d36a948 804 }
805 }
806
807 if (@subWt && $wt->[0] == 0) {
808 push @{ $subWt[-1] }, $wt if $to_be_pushed;
809 } else {
9f1f04a1 810 $wt->[0] = 0 if $wt->[0] == Var1Wt;
4d36a948 811 push @subWt, [ $wt ];
812 }
813 }
814
815 my $count = 0;
816 my $end = @$strCE - 1;
817
818 $last_is_variable = FALSE;
819
820 for (my $i = 0; $i <= $end; ) { # no $i++
821 my $found_base = 0;
822
823 # fetch a grapheme
824 while ($i <= $end && $found_base == 0) {
825 for my $wt ($self->getWt($strCE->[$i][0])) {
826 my $to_be_pushed = _nonIgnorAtLevel($wt,$lev);
827
828 if ($v2i && $ver9) {
829 if ($wt->[0] == 0) {
830 $to_be_pushed = FALSE if $last_is_variable;
831 } else {
9f1f04a1 832 $last_is_variable = ($wt->[0] == Var1Wt);
4d36a948 833 }
834 }
835
836 if (@strWt && $wt->[0] == 0) {
837 push @{ $strWt[-1] }, $wt if $to_be_pushed;
838 $finPos[-1] = $strCE->[$i][2];
839 } elsif ($to_be_pushed) {
9f1f04a1 840 $wt->[0] = 0 if $wt->[0] == Var1Wt;
4d36a948 841 push @strWt, [ $wt ];
842 push @iniPos, $found_base ? NOMATCHPOS : $strCE->[$i][1];
843 $finPos[-1] = NOMATCHPOS if $found_base;
844 push @finPos, $strCE->[$i][2];
845 $found_base++;
846 }
847 # else ===> no-op
848 }
849 $i++;
850 }
851
852 # try to match
853 while ( @strWt > @subWt || (@strWt == @subWt && $i > $end) ) {
854 if ($iniPos[0] != NOMATCHPOS &&
855 $finPos[$#subWt] != NOMATCHPOS &&
856 _eqArray(\@strWt, \@subWt, $lev)) {
857 my $temp = $iniPos[0] + $pos;
858
859 if ($grob) {
860 push @g_ret, [$temp, $finPos[$#subWt] - $iniPos[0]];
861 splice @strWt, 0, $#subWt;
862 splice @iniPos, 0, $#subWt;
863 splice @finPos, 0, $#subWt;
864 }
865 else {
866 return wantarray
867 ? ($temp, $finPos[$#subWt] - $iniPos[0])
868 : $temp;
869 }
870 }
871 shift @strWt;
872 shift @iniPos;
873 shift @finPos;
874 }
875 }
876
877 return $grob
878 ? @g_ret
879 : wantarray ? () : NOMATCHPOS;
880}
881
882##
883## scalarref to matching part = match(string, substring)
884##
885sub match
886{
887 my $self = shift;
888 if (my($pos,$len) = $self->index($_[0], $_[1])) {
889 my $temp = substr($_[0], $pos, $len);
890 return wantarray ? $temp : \$temp;
891 # An lvalue ref \substr should be avoided,
892 # since its value is affected by modification of its referent.
893 }
894 else {
895 return;
896 }
897}
898
899##
900## arrayref matching parts = gmatch(string, substring)
901##
902sub gmatch
903{
904 my $self = shift;
905 my $str = shift;
906 my $sub = shift;
907 return map substr($str, $_->[0], $_->[1]),
908 $self->index($str, $sub, 0, 'g');
909}
910
911##
912## bool subst'ed = subst(string, substring, replace)
913##
914sub subst
915{
916 my $self = shift;
917 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
918
919 if (my($pos,$len) = $self->index($_[0], $_[1])) {
920 if ($code) {
921 my $mat = substr($_[0], $pos, $len);
922 substr($_[0], $pos, $len, $code->($mat));
923 } else {
924 substr($_[0], $pos, $len, $_[2]);
925 }
926 return TRUE;
927 }
928 else {
929 return FALSE;
930 }
931}
932
933##
934## int count = gsubst(string, substring, replace)
935##
936sub gsubst
937{
938 my $self = shift;
939 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
940 my $cnt = 0;
941
942 # Replacement is carried out from the end, then use reverse.
943 for my $pos_len (reverse $self->index($_[0], $_[1], 0, 'g')) {
944 if ($code) {
945 my $mat = substr($_[0], $pos_len->[0], $pos_len->[1]);
946 substr($_[0], $pos_len->[0], $pos_len->[1], $code->($mat));
947 } else {
948 substr($_[0], $pos_len->[0], $pos_len->[1], $_[2]);
949 }
950 $cnt++;
951 }
952 return $cnt;
953}
954
45394607 9551;
956__END__
957
958=head1 NAME
959
a7fbee98 960Unicode::Collate - Unicode Collation Algorithm
45394607 961
962=head1 SYNOPSIS
963
964 use Unicode::Collate;
965
966 #construct
5398038e 967 $Collator = Unicode::Collate->new(%tailoring);
45394607 968
969 #sort
5398038e 970 @sorted = $Collator->sort(@not_sorted);
45394607 971
972 #compare
a7fbee98 973 $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
45394607 974
975=head1 DESCRIPTION
976
4d36a948 977This module is an implementation
978of Unicode Technical Standard #10 (UTS #10)
979"Unicode Collation Algorithm."
980
45394607 981=head2 Constructor and Tailoring
982
d16e9e3d 983The C<new> method returns a collator object.
984
5398038e 985 $Collator = Unicode::Collate->new(
0116f5dc 986 UCA_Version => $UCA_Version,
45394607 987 alternate => $alternate,
988 backwards => $levelNumber, # or \@levelNumbers
989 entry => $element,
990 normalization => $normalization_form,
991 ignoreName => qr/$ignoreName/,
992 ignoreChar => qr/$ignoreChar/,
993 katakana_before_hiragana => $bool,
994 level => $collationLevel,
995 overrideCJK => \&overrideCJK,
996 overrideHangul => \&overrideHangul,
997 preprocess => \&preprocess,
998 rearrange => \@charList,
999 table => $filename,
1000 undefName => qr/$undefName/,
1001 undefChar => qr/$undefChar/,
1002 upper_before_lower => $bool,
1003 );
a7fbee98 1004 # if %tailoring is false (i.e. empty),
5398038e 1005 # $Collator should do the default collation.
45394607 1006
1007=over 4
1008
0116f5dc 1009=item UCA_Version
1010
1011If the version number of the older UCA is given,
1012the older behavior of that version is emulated on collating.
1013If omitted, the return value of C<UCA_Version()> is used.
1014
1015The supported version: 8 or 9.
1016
1017B<This parameter may be removed in the future version,
1018as switching the algorithm would affect the performance.>
1019
45394607 1020=item alternate
1021
4d36a948 1022-- see 3.2.2 Variable Weighting, UTS #10.
caffd4cf 1023
1024(the title in UCA version 8: Alternate Weighting)
45394607 1025
a7fbee98 1026This key allows to alternate weighting for variable collation elements,
1027which are marked with an ASTERISK in the table
1028(NOTE: Many punction marks and symbols are variable in F<allkeys.txt>).
1029
1030 alternate => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
45394607 1031
a7fbee98 1032These names are case-insensitive.
45394607 1033By default (if specification is omitted), 'shifted' is adopted.
1034
4d36a948 1035 'Blanked' Variable elements are made ignorable at levels 1 through 3;
a7fbee98 1036 considered at the 4th level.
1037
1038 'Non-ignorable' Variable elements are not reset to ignorable.
1039
4d36a948 1040 'Shifted' Variable elements are made ignorable at levels 1 through 3
a7fbee98 1041 their level 4 weight is replaced by the old level 1 weight.
1042 Level 4 weight for Non-Variable elements is 0xFFFF.
1043
1044 'Shift-Trimmed' Same as 'shifted', but all FFFF's at the 4th level
1045 are trimmed.
1046
45394607 1047=item backwards
1048
4d36a948 1049-- see 3.1.2 French Accents, UTS #10.
45394607 1050
1051 backwards => $levelNumber or \@levelNumbers
1052
1053Weights in reverse order; ex. level 2 (diacritic ordering) in French.
1054If omitted, forwards at all the levels.
1055
1056=item entry
1057
4d36a948 1058-- see 3.1 Linguistic Features; 3.2.1 File Format, UTS #10.
45394607 1059
a7fbee98 1060Overrides a default order or defines additional collation elements
45394607 1061
1062 entry => <<'ENTRIES', # use the UCA file format
a7fbee98 106300E6 ; [.0861.0020.0002.00E6] [.08B1.0020.0002.00E6] # ligature <ae> as <a><e>
45394607 10640063 0068 ; [.0893.0020.0002.0063] # "ch" in traditional Spanish
10650043 0068 ; [.0893.0020.0008.0043] # "Ch" in traditional Spanish
1066ENTRIES
1067
4d36a948 1068B<NOTE:> The code point in the UCA file format (before C<';'>)
1069B<must> be a Unicode code point, but not a native code point.
1070So C<0063> must always denote C<U+0063>,
1071but not a character of C<"\x63">.
1072
45394607 1073=item ignoreName
1074
1075=item ignoreChar
1076
4d36a948 1077-- see Completely Ignorable, 3.2.2 Variable Weighting, UTS #10.
45394607 1078
caffd4cf 1079Makes the entry in the table completely ignorable;
1080i.e. as if the weights were zero at all level.
45394607 1081
a7fbee98 1082E.g. when 'a' and 'e' are ignorable,
45394607 1083'element' is equal to 'lament' (or 'lmnt').
1084
1085=item level
1086
4d36a948 1087-- see 4.3 Form a sort key for each string, UTS #10.
45394607 1088
1089Set the maximum level.
1090Any higher levels than the specified one are ignored.
1091
1092 Level 1: alphabetic ordering
1093 Level 2: diacritic ordering
1094 Level 3: case ordering
1095 Level 4: tie-breaking (e.g. in the case when alternate is 'shifted')
1096
1097 ex.level => 2,
1098
a7fbee98 1099If omitted, the maximum is the 4th.
1100
45394607 1101=item normalization
1102
4d36a948 1103-- see 4.1 Normalize each input string, UTS #10.
45394607 1104
905aa9f0 1105If specified, strings are normalized before preparation of sort keys
45394607 1106(the normalization is executed after preprocess).
1107
1108As a form name, one of the following names must be used.
1109
1110 'C' or 'NFC' for Normalization Form C
1111 'D' or 'NFD' for Normalization Form D
1112 'KC' or 'NFKC' for Normalization Form KC
1113 'KD' or 'NFKD' for Normalization Form KD
1114
1115If omitted, the string is put into Normalization Form D.
1116
a7fbee98 1117If C<undef> is passed explicitly as the value for this key,
45394607 1118any normalization is not carried out (this may make tailoring easier
1119if any normalization is not desired).
1120
1121see B<CAVEAT>.
1122
1123=item overrideCJK
1124
4d36a948 1125-- see 7.1 Derived Collation Elements, UTS #10.
45394607 1126
1127By default, mapping of CJK Unified Ideographs
a7fbee98 1128uses the Unicode codepoint order.
1129But the mapping of CJK Unified Ideographs may be overrided.
45394607 1130
a7fbee98 1131ex. CJK Unified Ideographs in the JIS code point order.
45394607 1132
1133 overrideCJK => sub {
a7fbee98 1134 my $u = shift; # get a Unicode codepoint
1135 my $b = pack('n', $u); # to UTF-16BE
1136 my $s = your_unicode_to_sjis_converter($b); # convert
1137 my $n = unpack('n', $s); # convert sjis to short
1138 [ $n, 0x20, 0x2, $u ]; # return the collation element
45394607 1139 },
1140
a7fbee98 1141ex. ignores all CJK Unified Ideographs.
1142
1143 overrideCJK => sub {()}, # CODEREF returning empty list
1144
1145 # where ->eq("Pe\x{4E00}rl", "Perl") is true
1146 # as U+4E00 is a CJK Unified Ideograph and to be ignorable.
1147
1148If C<undef> is passed explicitly as the value for this key,
1149weights for CJK Unified Ideographs are treated as undefined.
1150But assignment of weight for CJK Unified Ideographs
1151in table or L<entry> is still valid.
1152
1153=item overrideHangul
1154
4d36a948 1155-- see 7.1 Derived Collation Elements, UTS #10.
a7fbee98 1156
1157By default, Hangul Syllables are decomposed into Hangul Jamo.
1158But the mapping of Hangul Syllables may be overrided.
1159
1160This tag works like L<overrideCJK>, so see there for examples.
1161
45394607 1162If you want to override the mapping of Hangul Syllables,
1163the Normalization Forms D and KD are not appropriate
1164(they will be decomposed before overriding).
1165
a7fbee98 1166If C<undef> is passed explicitly as the value for this key,
1167weight for Hangul Syllables is treated as undefined
1168without decomposition into Hangul Jamo.
1169But definition of weight for Hangul Syllables
1170in table or L<entry> is still valid.
1171
45394607 1172=item preprocess
1173
4d36a948 1174-- see 5.1 Preprocessing, UTS #10.
45394607 1175
1176If specified, the coderef is used to preprocess
1177before the formation of sort keys.
1178
a7fbee98 1179ex. dropping English articles, such as "a" or "the".
45394607 1180Then, "the pen" is before "a pencil".
1181
1182 preprocess => sub {
1183 my $str = shift;
a7fbee98 1184 $str =~ s/\b(?:an?|the)\s+//gi;
45394607 1185 $str;
1186 },
1187
1188=item rearrange
1189
4d36a948 1190-- see 3.1.3 Rearrangement, UTS #10.
45394607 1191
1192Characters that are not coded in logical order and to be rearranged.
a7fbee98 1193By default,
45394607 1194
1195 rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
1196
a7fbee98 1197If you want to disallow any rearrangement,
1198pass C<undef> or C<[]> (a reference to an empty list)
1199as the value for this key.
1200
0116f5dc 1201B<According to the version 9 of UCA, this parameter shall not be used;
1202but it is not warned at present.>
1203
45394607 1204=item table
1205
4d36a948 1206-- see 3.2 Default Unicode Collation Element Table, UTS #10.
45394607 1207
1208You can use another element table if desired.
1209The table file must be in your C<lib/Unicode/Collate> directory.
1210
1211By default, the file C<lib/Unicode/Collate/allkeys.txt> is used.
1212
a7fbee98 1213If C<undef> is passed explicitly as the value for this key,
1214no file is read (but you can define collation elements via L<entry>).
1215
1216A typical way to define a collation element table
1217without any file of table:
1218
1219 $onlyABC = Unicode::Collate->new(
1220 table => undef,
1221 entry => << 'ENTRIES',
12220061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A
12230041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A
12240062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B
12250042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B
12260063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C
12270043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C
1228ENTRIES
1229 );
905aa9f0 1230
45394607 1231=item undefName
1232
1233=item undefChar
1234
4d36a948 1235-- see 6.3.4 Reducing the Repertoire, UTS #10.
45394607 1236
1237Undefines the collation element as if it were unassigned in the table.
1238This reduces the size of the table.
1239If an unassigned character appears in the string to be collated,
1240the sort key is made from its codepoint
1241as a single-character collation element,
1242as it is greater than any other assigned collation elements
1243(in the codepoint order among the unassigned characters).
1244But, it'd be better to ignore characters
1245unfamiliar to you and maybe never used.
1246
1247=item katakana_before_hiragana
1248
1249=item upper_before_lower
1250
4d36a948 1251-- see 6.6 Case Comparisons; 7.3.1 Tertiary Weight Table, UTS #10.
45394607 1252
1253By default, lowercase is before uppercase
1254and hiragana is before katakana.
1255
a7fbee98 1256If the tag is made true, this is reversed.
1257
1258B<NOTE>: These tags simplemindedly assume
1259any lowercase/uppercase or hiragana/katakana distinctions
9f1f04a1 1260must occur in level 3, and their weights at level 3
1261must be same as those mentioned in 7.3.1, UTS #10.
1262If you define your collation elements which violate this requirement,
4d36a948 1263these tags don't work validly.
45394607 1264
1265=back
1266
3164dd77 1267=head2 Methods for Collation
45394607 1268
1269=over 4
1270
5398038e 1271=item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
45394607 1272
1273Sorts a list of strings.
1274
5398038e 1275=item C<$result = $Collator-E<gt>cmp($a, $b)>
45394607 1276
1277Returns 1 (when C<$a> is greater than C<$b>)
1278or 0 (when C<$a> is equal to C<$b>)
1279or -1 (when C<$a> is lesser than C<$b>).
1280
5398038e 1281=item C<$result = $Collator-E<gt>eq($a, $b)>
1282
1283=item C<$result = $Collator-E<gt>ne($a, $b)>
1284
1285=item C<$result = $Collator-E<gt>lt($a, $b)>
1286
1287=item C<$result = $Collator-E<gt>le($a, $b)>
1288
1289=item C<$result = $Collator-E<gt>gt($a, $b)>
1290
1291=item C<$result = $Collator-E<gt>ge($a, $b)>
1292
a7fbee98 1293They works like the same name operators as theirs.
5398038e 1294
1295 eq : whether $a is equal to $b.
1296 ne : whether $a is not equal to $b.
1297 lt : whether $a is lesser than $b.
1298 le : whether $a is lesser than $b or equal to $b.
1299 gt : whether $a is greater than $b.
1300 ge : whether $a is greater than $b or equal to $b.
1301
1302=item C<$sortKey = $Collator-E<gt>getSortKey($string)>
45394607 1303
4d36a948 1304-- see 4.3 Form a sort key for each string, UTS #10.
45394607 1305
1306Returns a sort key.
1307
1308You compare the sort keys using a binary comparison
1309and get the result of the comparison of the strings using UCA.
1310
5398038e 1311 $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
45394607 1312
1313 is equivalent to
1314
5398038e 1315 $Collator->cmp($a, $b)
45394607 1316
a7fbee98 1317=item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)>
1318
a7fbee98 1319 use Unicode::Collate;
1320 my $c = Unicode::Collate->new();
1321 print $c->viewSortKey("Perl"),"\n";
1322
0116f5dc 1323 # output:
1324 # [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF]
1325 # Level 1 Level 2 Level 3 Level 4
1326
1327 (If C<UCA_Version> is 8, the output is slightly different.)
a7fbee98 1328
4d36a948 1329=back
1330
1331=head2 Methods for Searching
d16e9e3d 1332
4d36a948 1333B<DISCLAIMER:> If C<preprocess> or C<normalization> tag is true
1334for C<$Collator>, calling these methods (C<index>, C<match>, C<gmatch>,
1335C<subst>, C<gsubst>) is croaked,
1336as the position and the length might differ
1337from those on the specified string.
1338(And the C<rearrange> tag is neglected.)
d16e9e3d 1339
4d36a948 1340The C<match>, C<gmatch>, C<subst>, C<gsubst> methods work
1341like C<m//>, C<m//g>, C<s///>, C<s///g>, respectively,
1342but they are not aware of any pattern, but only a literal substring.
1343
1344=over 4
1345
1346=item C<$position = $Collator-E<gt>index($string, $substring[, $position])>
1347
1348=item C<($position, $length) = $Collator-E<gt>index($string, $substring[, $position])>
d16e9e3d 1349
1350If C<$substring> matches a part of C<$string>, returns
1351the position of the first occurrence of the matching part in scalar context;
1352in list context, returns a two-element list of
1353the position and the length of the matching part.
1354
d16e9e3d 1355If C<$substring> does not match any part of C<$string>,
1356returns C<-1> in scalar context and
1357an empty list in list context.
1358
1359e.g. you say
1360
5398038e 1361 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
4d36a948 1362 # (normalization => undef) is REQUIRED.
1363 my $str = "Ich muß studieren Perl.";
1364 my $sub = "MÜSS";
d16e9e3d 1365 my $match;
a7fbee98 1366 if (my($pos,$len) = $Collator->index($str, $sub)) {
5398038e 1367 $match = substr($str, $pos, $len);
d16e9e3d 1368 }
1369
4d36a948 1370and get C<"muß"> in C<$match> since C<"muß">
1371is primary equal to C<"MÜSS">.
1372
1373=item C<$match_ref = $Collator-E<gt>match($string, $substring)>
1374
1375=item C<($match) = $Collator-E<gt>match($string, $substring)>
1376
1377If C<$substring> matches a part of C<$string>, in scalar context, returns
1378B<a reference to> the first occurrence of the matching part
1379(C<$match_ref> is always true if matches,
1380since every reference is B<true>);
1381in list context, returns the first occurrence of the matching part.
1382
1383If C<$substring> does not match any part of C<$string>,
1384returns C<undef> in scalar context and
1385an empty list in list context.
1386
1387e.g.
1388
1389 if ($match_ref = $Collator->match($str, $sub)) { # scalar context
1390 print "matches [$$match_ref].\n";
1391 } else {
1392 print "doesn't match.\n";
1393 }
1394
1395 or
1396
1397 if (($match) = $Collator->match($str, $sub)) { # list context
1398 print "matches [$match].\n";
1399 } else {
1400 print "doesn't match.\n";
1401 }
1402
1403=item C<@match = $Collator-E<gt>gmatch($string, $substring)>
1404
1405If C<$substring> matches a part of C<$string>, returns
1406all the matching parts (or matching count in scalar context).
1407
1408If C<$substring> does not match any part of C<$string>,
1409returns an empty list.
1410
1411=item C<$count = $Collator-E<gt>subst($string, $substring, $replacement)>
1412
1413If C<$substring> matches a part of C<$string>,
1414the first occurrence of the matching part is replaced by C<$replacement>
1415(C<$string> is modified) and return C<$count> (always equals to C<1>).
1416
1417C<$replacement> can be a C<CODEREF>,
1418taking the matching part as an argument,
1419and returning a string to replace the matching part
1420(a bit similar to C<s/(..)/$coderef-E<gt>($1)/e>).
1421
1422=item C<$count = $Collator-E<gt>gsubst($string, $substring, $replacement)>
1423
1424If C<$substring> matches a part of C<$string>,
1425all the occurrences of the matching part is replaced by C<$replacement>
1426(C<$string> is modified) and return C<$count>.
1427
1428C<$replacement> can be a C<CODEREF>,
1429taking the matching part as an argument,
1430and returning a string to replace the matching part
1431(a bit similar to C<s/(..)/$coderef-E<gt>($1)/eg>).
1432
1433e.g.
1434
1435 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1436 # (normalization => undef) is REQUIRED.
1437 my $str = "Camel ass came\x{301}l CAMEL horse cAm\0E\0L...";
1438 $Collator->gsubst($str, "camel", sub { "<b>$_[0]</b>" });
1439
1440 # now $str is "<b>Camel</b> ass <b>came\x{301}l</b> <b>CAMEL</b> horse <b>cAm\0E\0L</b>...";
1441 # i.e., all the camels are made bold-faced.
d16e9e3d 1442
45394607 1443=back
1444
3164dd77 1445=head2 Other Methods
1446
1447=over 4
1448
0116f5dc 1449=item C<%old_tailoring = $Collator-E<gt>change(%new_tailoring)>
1450
1451Change the value of specified keys and returns the changed part.
1452
1453 $Collator = Unicode::Collate->new(level => 4);
1454
1455 $Collator->eq("perl", "PERL"); # false
1456
1457 %old = $Collator->change(level => 2); # returns (level => 4).
1458
1459 $Collator->eq("perl", "PERL"); # true
1460
1461 $Collator->change(%old); # returns (level => 2).
1462
1463 $Collator->eq("perl", "PERL"); # false
1464
1465Not all C<(key,value)>s are allowed to be changed.
1466See also C<@Unicode::Collate::ChangeOK> and C<@Unicode::Collate::ChangeNG>.
1467
1468In the scalar context, returns the modified collator
1469(but it is B<not> a clone from the original).
1470
1471 $Collator->change(level => 2)->eq("perl", "PERL"); # true
1472
1473 $Collator->eq("perl", "PERL"); # true; now max level is 2nd.
1474
1475 $Collator->change(level => 4)->eq("perl", "PERL"); # false
1476
3164dd77 1477=item UCA_Version
1478
4d36a948 1479Returns the version number of UTS #10 this module consults.
3164dd77 1480
1481=item Base_Unicode_Version
1482
1483Returns the version number of the Unicode Standard
1484this module is based on.
1485
1486=back
1487
45394607 1488=head2 EXPORT
1489
1490None by default.
1491
4a2e806c 1492=head2 TODO
1493
4d36a948 1494Unicode::Collate has not been ported to EBCDIC.
1495IMHO, use of utf8::unicode_to_native()/utf8::native_to_unicode()
1496at the proper postions should allow
1497this module to work on EBCDIC platform...
4a2e806c 1498
45394607 1499=head2 CAVEAT
1500
1501Use of the C<normalization> parameter requires
1502the B<Unicode::Normalize> module.
1503
5398038e 1504If you need not it (say, in the case when you need not
45394607 1505handle any combining characters),
1506assign C<normalization =E<gt> undef> explicitly.
1507
4d36a948 1508-- see 6.5 Avoiding Normalization, UTS #10.
5398038e 1509
0116f5dc 1510=head2 Conformance Test
1511
1512The Conformance Test for the UCA is provided
1513in L<http://www.unicode.org/reports/tr10/CollationTest.html>
1514and L<http://www.unicode.org/reports/tr10/CollationTest.zip>
1515
1516For F<CollationTest_SHIFTED.txt>,
1517a collator via C<Unicode::Collate-E<gt>new( )> should be used;
1518for F<CollationTest_NON_IGNORABLE.txt>, a collator via
1519C<Unicode::Collate-E<gt>new(alternate =E<gt> "non-ignorable", level =E<gt> 3)>.
1520
4d36a948 1521B<Unicode::Normalize is required to try The Conformance Test.>
a7fbee98 1522
45394607 1523=head1 AUTHOR
1524
1525SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
1526
1527 http://homepage1.nifty.com/nomenclator/perl/
1528
a7fbee98 1529 Copyright(C) 2001-2002, SADAHIRO Tomoyuki. Japan. All rights reserved.
45394607 1530
a7fbee98 1531 This library is free software; you can redistribute it
1532 and/or modify it under the same terms as Perl itself.
45394607 1533
1534=head1 SEE ALSO
1535
1536=over 4
1537
0116f5dc 1538=item http://www.unicode.org/reports/tr10/
45394607 1539
4d36a948 1540Unicode Collation Algorithm - UTS #10
45394607 1541
0116f5dc 1542=item http://www.unicode.org/reports/tr10/allkeys.txt
a7fbee98 1543
1544The Default Unicode Collation Element Table
45394607 1545
0116f5dc 1546=item http://www.unicode.org/reports/tr10/CollationTest.html
1547http://www.unicode.org/reports/tr10/CollationTest.zip
1548
1549The latest versions of the conformance test for the UCA
1550
1551=item http://www.unicode.org/reports/tr15/
a7fbee98 1552
1553Unicode Normalization Forms - UAX #15
1554
a7fbee98 1555=item L<Unicode::Normalize>
45394607 1556
45394607 1557=back
1558
1559=cut