more options for valgrindpp.pl
[p5sagit/p5-mst-13.2.git] / lib / Unicode / Collate.pm
CommitLineData
45394607 1package Unicode::Collate;
2
4a2e806c 3BEGIN {
ae6aa562 4 unless ("A" eq pack('U', 0x41)) {
9f1f04a1 5 die "Unicode::Collate cannot stringify a Unicode code point\n";
4a2e806c 6 }
7}
8
45394607 9use 5.006;
10use strict;
11use warnings;
12use Carp;
e69a2255 13use File::Spec;
5398038e 14
45394607 15require Exporter;
16
4c843366 17our $VERSION = '0.26';
45394607 18our $PACKAGE = __PACKAGE__;
19
20our @ISA = qw(Exporter);
21
22our %EXPORT_TAGS = ();
23our @EXPORT_OK = ();
24our @EXPORT = ();
25
26(our $Path = $INC{'Unicode/Collate.pm'}) =~ s/\.pm$//;
27our $KeyFile = "allkeys.txt";
28
3164dd77 29our $UNICODE_VERSION;
30
327745dc 31eval { require Unicode::UCD };
32
33unless ($@) {
34 $UNICODE_VERSION = Unicode::UCD::UnicodeVersion();
35}
9f1f04a1 36else { # Perl 5.6.1
3164dd77 37 my($f, $fh);
38 foreach my $d (@INC) {
3164dd77 39 $f = File::Spec->catfile($d, "unicode", "Unicode.301");
40 if (open($fh, $f)) {
327745dc 41 $UNICODE_VERSION = '3.0.1';
3164dd77 42 close $fh;
43 last;
44 }
45 }
46}
47
4d36a948 48# Perl's boolean
49use constant TRUE => 1;
50use constant FALSE => "";
51use constant NOMATCHPOS => -1;
52
53# A coderef to get combining class imported from Unicode::Normalize
54# (i.e. \&Unicode::Normalize::getCombinClass).
55# This is also used as a HAS_UNICODE_NORMALIZE flag.
56our $getCombinClass;
57
9f1f04a1 58# Supported Levels
59use constant MinLevel => 1;
60use constant MaxLevel => 4;
61
4d36a948 62# Minimum weights at level 2 and 3, respectively
9f1f04a1 63use constant Min2Wt => 0x20;
64use constant Min3Wt => 0x02;
4d36a948 65
66# Shifted weight at 4th level
9f1f04a1 67use constant Shift4Wt => 0xFFFF;
4d36a948 68
69# Variable weight at 1st level.
70# This is a negative value but should be regarded as zero on collation.
71# This is for distinction of variable chars from level 3 ignorable chars.
9f1f04a1 72use constant Var1Wt => -1;
4d36a948 73
74
75# A boolean for Variable and 16-bit weights at 4 levels of Collation Element
76# PROBLEM: The Default Unicode Collation Element Table
77# has weights over 0xFFFF at the 4th level.
78# The tie-breaking in the variable weights
79# other than "shift" (as well as "shift-trimmed") is unreliable.
80use constant VCE_TEMPLATE => 'Cn4';
81
4d36a948 82# A sort key: 16-bit weights
83# See also the PROBLEM on VCE_TEMPLATE above.
84use constant KEY_TEMPLATE => 'n*';
85
86# Level separator in a sort key:
87# i.e. pack(KEY_TEMPLATE, 0)
88use constant LEVEL_SEP => "\0\0";
89
90# As Unicode code point separator for hash keys.
91# A joined code point string (denoted by JCPS below)
92# like "65;768" is used for internal processing
93# instead of Perl's Unicode string like "\x41\x{300}",
94# as the native code point is different from the Unicode code point
95# on EBCDIC platform.
96# This character must not be included in any stringified
97# representation of an integer.
98use constant CODE_SEP => ';';
99
100# boolean values of variable weights
0116f5dc 101use constant NON_VAR => 0; # Non-Variable character
102use constant VAR => 1; # Variable character
3164dd77 103
4d36a948 104# Logical_Order_Exception in PropList.txt
105# TODO: synchronization with change of PropList.txt.
0116f5dc 106our $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
a7fbee98 107
0116f5dc 108sub UCA_Version { "9" }
a7fbee98 109
0116f5dc 110sub Base_Unicode_Version { $UNICODE_VERSION || 'unknown' }
a7fbee98 111
9f1f04a1 112######
113
9f1f04a1 114sub pack_U {
ae6aa562 115 return pack('U*', @_);
9f1f04a1 116}
117
118sub unpack_U {
ae6aa562 119 return unpack('U*', pack('U*').shift);
9f1f04a1 120}
121
122######
123
0116f5dc 124my (%AlternateOK);
125@AlternateOK{ qw/
126 blanked non-ignorable shifted shift-trimmed
127 / } = ();
128
129our @ChangeOK = qw/
130 alternate backwards level normalization rearrange
131 katakana_before_hiragana upper_before_lower
132 overrideHangul overrideCJK preprocess UCA_Version
133 /;
134
135our @ChangeNG = qw/
9f1f04a1 136 entry entries table maxlength
0116f5dc 137 ignoreChar ignoreName undefChar undefName
138 versionTable alternateTable backwardsTable forwardsTable rearrangeTable
4d36a948 139 derivCode normCode rearrangeHash L3_ignorable
9f1f04a1 140 backwardsFlag
0116f5dc 141 /;
9f1f04a1 142# The hash key 'ignored' is deleted at v 0.21.
143# The hash key 'isShift' is deleted at v 0.23.
144# The hash key 'combining' is deleted at v 0.24.
0116f5dc 145
146my (%ChangeOK, %ChangeNG);
147@ChangeOK{ @ChangeOK } = ();
148@ChangeNG{ @ChangeNG } = ();
149
150sub change {
151 my $self = shift;
152 my %hash = @_;
153 my %old;
154 foreach my $k (keys %hash) {
155 if (exists $ChangeOK{$k}) {
156 $old{$k} = $self->{$k};
157 $self->{$k} = $hash{$k};
158 }
159 elsif (exists $ChangeNG{$k}) {
160 croak "change of $k via change() is not allowed!";
161 }
162 # else => ignored
163 }
164 $self->checkCollator;
165 return wantarray ? %old : $self;
166}
a7fbee98 167
9f1f04a1 168sub _checkLevel {
169 my $level = shift;
170 my $key = shift;
171 croak sprintf "Illegal level %d (in \$self->{%s}) lower than %d.",
172 $level, $key, MinLevel if MinLevel > $level;
173 croak sprintf "Unsupported level %d (in \$self->{%s}) higher than %d ",
174 $level, $key, MaxLevel if MaxLevel < $level;
175}
176
0116f5dc 177sub checkCollator {
178 my $self = shift;
9f1f04a1 179 _checkLevel($self->{level}, "level");
a7fbee98 180
0116f5dc 181 $self->{derivCode} =
4d36a948 182 $self->{UCA_Version} == 8 ? \&_derivCE_8 :
183 $self->{UCA_Version} == 9 ? \&_derivCE_9 :
0116f5dc 184 croak "Illegal UCA version (passed $self->{UCA_Version}).";
a7fbee98 185
0116f5dc 186 $self->{alternate} = lc($self->{alternate});
187 croak "$PACKAGE unknown alternate tag name: $self->{alternate}"
188 unless exists $AlternateOK{ $self->{alternate} };
189
9f1f04a1 190 if (! defined $self->{backwards}) {
191 $self->{backwardsFlag} = 0;
192 }
193 elsif (! ref $self->{backwards}) {
194 _checkLevel($self->{backwards}, "backwards");
195 $self->{backwardsFlag} = 1 << $self->{backwards};
196 }
197 else {
198 my %level;
199 $self->{backwardsFlag} = 0;
200 for my $b (@{ $self->{backwards} }) {
201 _checkLevel($b, "backwards");
202 $level{$b} = 1;
203 }
204 for my $v (sort keys %level) {
205 $self->{backwardsFlag} += 1 << $v;
206 }
207 }
0116f5dc 208
209 $self->{rearrange} = []
210 if ! defined $self->{rearrange};
211 croak "$PACKAGE: A list for rearrangement must be store in an ARRAYREF"
212 if ! ref $self->{rearrange};
213
214 # keys of $self->{rearrangeHash} are $self->{rearrange}.
215 $self->{rearrangeHash} = undef;
216
217 if (@{ $self->{rearrange} }) {
218 @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
219 }
220
221 $self->{normCode} = undef;
a7fbee98 222
223 if (defined $self->{normalization}) {
224 eval { require Unicode::Normalize };
225 croak "Unicode/Normalize.pm is required to normalize strings: $@"
226 if $@;
227
228 Unicode::Normalize->import();
229 $getCombinClass = \&Unicode::Normalize::getCombinClass
230 if ! $getCombinClass;
231
0116f5dc 232 $self->{normCode} =
a7fbee98 233 $self->{normalization} =~ /^(?:NF)?C$/ ? \&NFC :
234 $self->{normalization} =~ /^(?:NF)?D$/ ? \&NFD :
235 $self->{normalization} =~ /^(?:NF)?KC$/ ? \&NFKC :
236 $self->{normalization} =~ /^(?:NF)?KD$/ ? \&NFKD :
237 croak "$PACKAGE unknown normalization form name: "
238 . $self->{normalization};
239 }
0116f5dc 240 return;
241}
242
243sub new
244{
245 my $class = shift;
246 my $self = bless { @_ }, $class;
45394607 247
a7fbee98 248 # If undef is passed explicitly, no file is read.
0116f5dc 249 $self->{table} = $KeyFile if ! exists $self->{table};
250 $self->read_table if defined $self->{table};
905aa9f0 251
a7fbee98 252 if ($self->{entry}) {
253 $self->parseEntry($_) foreach split /\n/, $self->{entry};
254 }
905aa9f0 255
9f1f04a1 256 $self->{level} ||= MaxLevel;
0116f5dc 257 $self->{UCA_Version} ||= UCA_Version();
905aa9f0 258
0116f5dc 259 $self->{overrideHangul} = ''
260 if ! exists $self->{overrideHangul};
261 $self->{overrideCJK} = ''
262 if ! exists $self->{overrideCJK};
263 $self->{normalization} = 'D'
264 if ! exists $self->{normalization};
265 $self->{alternate} = $self->{alternateTable} || 'shifted'
266 if ! exists $self->{alternate};
267 $self->{rearrange} = $self->{rearrangeTable} || $DefaultRearrange
a7fbee98 268 if ! exists $self->{rearrange};
0116f5dc 269 $self->{backwards} = $self->{backwardsTable}
270 if ! exists $self->{backwards};
a7fbee98 271
0116f5dc 272 $self->checkCollator;
a7fbee98 273
274 return $self;
275}
905aa9f0 276
277sub read_table {
a7fbee98 278 my $self = shift;
279 my $file = $self->{table} ne '' ? $self->{table} : $KeyFile;
280
e69a2255 281 my $filepath = File::Spec->catfile($Path, $file);
282 open my $fk, "<$filepath"
283 or croak "File does not exist at $filepath";
a7fbee98 284
285 while (<$fk>) {
286 next if /^\s*#/;
287 if (/^\s*\@/) {
0116f5dc 288 if (/^\s*\@version\s*(\S*)/) {
289 $self->{versionTable} ||= $1;
290 }
291 elsif (/^\s*\@alternate\s+(\S*)/) {
292 $self->{alternateTable} ||= $1;
a7fbee98 293 }
0116f5dc 294 elsif (/^\s*\@backwards\s+(\S*)/) {
295 push @{ $self->{backwardsTable} }, $1;
a7fbee98 296 }
0116f5dc 297 elsif (/^\s*\@forwards\s+(\S*)/) { # parhaps no use
298 push @{ $self->{forwardsTable} }, $1;
a7fbee98 299 }
0116f5dc 300 elsif (/^\s*\@rearrange\s+(.*)/) { # (\S*) is NG
301 push @{ $self->{rearrangeTable} }, _getHexArray($1);
a7fbee98 302 }
303 next;
304 }
305 $self->parseEntry($_);
45394607 306 }
a7fbee98 307 close $fk;
45394607 308}
309
905aa9f0 310
45394607 311##
312## get $line, parse it, and write an entry in $self
313##
314sub parseEntry
315{
a7fbee98 316 my $self = shift;
317 my $line = shift;
4d36a948 318 my($name, $entry, @uv, @key);
a7fbee98 319
320 return if $line !~ /^\s*[0-9A-Fa-f]/;
321
322 # removes comment and gets name
323 $name = $1
324 if $line =~ s/[#%]\s*(.*)//;
325 return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
326
327 # gets element
328 my($e, $k) = split /;/, $line;
329 croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
330 if ! $k;
331
4d36a948 332 @uv = _getHexArray($e);
333 return if !@uv;
334
335 $entry = join(CODE_SEP, @uv); # in JCPS
0116f5dc 336
4d36a948 337 if (defined $self->{undefChar} || defined $self->{ignoreChar}) {
9f1f04a1 338 my $ele = pack_U(@uv);
a7fbee98 339
4d36a948 340 # regarded as if it were not entried in the table
341 return
342 if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
caffd4cf 343
4d36a948 344 # replaced as completely ignorable
345 $k = '[.0000.0000.0000.0000]'
346 if defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/;
45394607 347 }
0116f5dc 348
4d36a948 349 # replaced as completely ignorable
350 $k = '[.0000.0000.0000.0000]'
351 if defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/;
352
4c843366 353 my $is_L3_ignorable = TRUE;
4d36a948 354
caffd4cf 355 foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
356 my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
4d36a948 357 my @wt = _getHexArray($arr);
358 push @key, pack(VCE_TEMPLATE, $var, @wt);
4c843366 359 $is_L3_ignorable = FALSE
360 if $wt[0] + $wt[1] + $wt[2] != 0;
4d36a948 361 # if $arr !~ /[1-9A-Fa-f]/; NG
362 # Conformance Test shows L3-ignorable is completely ignorable.
4c843366 363 # For expansion, an entry $is_L3_ignorable
364 # if and only if "all" CEs are [.0000.0000.0000].
a7fbee98 365 }
caffd4cf 366
4d36a948 367 $self->{entries}{$entry} = \@key;
caffd4cf 368
4d36a948 369 $self->{L3_ignorable}{$uv[0]} = TRUE
370 if @uv == 1 && $is_L3_ignorable;
caffd4cf 371
4d36a948 372 # Contraction is to be considered in the range of this maxlength.
373 $self->{maxlength}{$uv[0]} = scalar @uv
374 if @uv > 1;
45394607 375}
376
9f1f04a1 377
45394607 378##
9f1f04a1 379## arrayref[weights] = altCE(VCE)
45394607 380##
d16e9e3d 381sub altCE
45394607 382{
a7fbee98 383 my $self = shift;
4d36a948 384 my($var, @wt) = unpack(VCE_TEMPLATE, shift);
a7fbee98 385
386 $self->{alternate} eq 'blanked' ?
9f1f04a1 387 $var ? [Var1Wt, 0, 0, $wt[3]] : \@wt :
a7fbee98 388 $self->{alternate} eq 'non-ignorable' ?
4d36a948 389 \@wt :
a7fbee98 390 $self->{alternate} eq 'shifted' ?
9f1f04a1 391 $var ? [Var1Wt, 0, 0, $wt[0] ]
392 : [ @wt[0..2], $wt[0]+$wt[1]+$wt[2] ? Shift4Wt : 0 ] :
a7fbee98 393 $self->{alternate} eq 'shift-trimmed' ?
9f1f04a1 394 $var ? [Var1Wt, 0, 0, $wt[0] ] : [ @wt[0..2], 0 ] :
a7fbee98 395 croak "$PACKAGE unknown alternate name: $self->{alternate}";
45394607 396}
397
45394607 398sub viewSortKey
399{
a7fbee98 400 my $self = shift;
9f1f04a1 401 $self->visualizeSortKey($self->getSortKey(@_));
402}
0116f5dc 403
9f1f04a1 404sub visualizeSortKey
405{
406 my $self = shift;
407 my $view = join " ", map sprintf("%04X", $_), unpack(KEY_TEMPLATE, shift);
4d36a948 408
9f1f04a1 409 if ($self->{UCA_Version} <= 8) {
0116f5dc 410 $view =~ s/ ?0000 ?/|/g;
411 } else {
412 $view =~ s/\b0000\b/|/g;
413 }
a7fbee98 414 return "[$view]";
45394607 415}
416
d16e9e3d 417
45394607 418##
4d36a948 419## arrayref of JCPS = splitCE(string to be collated)
420## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitCE(string, true)
45394607 421##
d16e9e3d 422sub splitCE
45394607 423{
a7fbee98 424 my $self = shift;
4d36a948 425 my $wLen = $_[1];
426
a7fbee98 427 my $code = $self->{preprocess};
0116f5dc 428 my $norm = $self->{normCode};
a7fbee98 429 my $ent = $self->{entries};
430 my $max = $self->{maxlength};
431 my $reH = $self->{rearrangeHash};
4d36a948 432 my $ign = $self->{L3_ignorable};
0116f5dc 433 my $ver9 = $self->{UCA_Version} > 8;
a7fbee98 434
4d36a948 435 my ($str, @buf);
a7fbee98 436
4d36a948 437 if ($wLen) {
438 $code and croak "Preprocess breaks character positions. "
439 . "Don't use with index(), match(), etc.";
440 $norm and croak "Normalization breaks character positions. "
441 . "Don't use with index(), match(), etc.";
442 $str = $_[0];
443 }
444 else {
445 $str = $_[0];
446 $str = &$code($str) if ref $code;
447 $str = &$norm($str) if ref $norm;
448 }
a7fbee98 449
4d36a948 450 # get array of Unicode code point of string.
9f1f04a1 451 my @src = unpack_U($str);
4d36a948 452
453 # rearrangement:
454 # Character positions are not kept if rearranged,
455 # then neglected if $wLen is true.
456 if ($reH && ! $wLen) {
a7fbee98 457 for (my $i = 0; $i < @src; $i++) {
458 if (exists $reH->{ $src[$i] } && $i + 1 < @src) {
459 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]);
460 $i++;
461 }
462 }
45394607 463 }
45394607 464
0116f5dc 465 if ($ver9) {
4d36a948 466 # To remove a character marked as a completely ignorable.
467 for (my $i = 0; $i < @src; $i++) {
468 $src[$i] = undef if $ign->{ $src[$i] };
469 }
0116f5dc 470 }
471
a7fbee98 472 for (my $i = 0; $i < @src; $i++) {
4d36a948 473 next if _isNonCharacter($src[$i]);
474
475 my $i_orig = $i;
476 my $ce = $src[$i];
477
478 if ($max->{$ce}) { # contract
479 my $temp_ce = $ce;
480
481 for (my $p = $i + 1; $p < @src; $p++) {
482 next if ! defined $src[$p];
483 $temp_ce .= CODE_SEP . $src[$p];
484 if ($ent->{$temp_ce}) {
485 $ce = $temp_ce;
486 $i = $p;
487 }
488 }
489 }
490
491 # with Combining Char (UTS#10, 4.2.1).
492 # requires Unicode::Normalize.
493 # Not be $wLen, as not croaked due to $norm.
494 if ($getCombinClass) {
495 for (my $p = $i + 1; $p < @src; $p++) {
496 next if ! defined $src[$p];
497 last unless $getCombinClass->($src[$p]);
498 my $tail = CODE_SEP . $src[$p];
499 if ($ent->{$ce.$tail}) {
500 $ce .= $tail;
501 $src[$p] = undef;
502 }
a7fbee98 503 }
a7fbee98 504 }
505
4d36a948 506 if ($wLen) {
507 for (my $p = $i + 1; $p < @src; $p++) {
508 last if defined $src[$p];
509 $i = $p;
a7fbee98 510 }
511 }
4d36a948 512
513 push @buf, $wLen ? [$ce, $i_orig, $i + 1] : $ce;
45394607 514 }
4d36a948 515 return \@buf;
d16e9e3d 516}
45394607 517
d16e9e3d 518
519##
4d36a948 520## list of arrayrefs of weights = getWt(JCPS)
d16e9e3d 521##
522sub getWt
523{
a7fbee98 524 my $self = shift;
4d36a948 525 my $ce = shift;
a7fbee98 526 my $ent = $self->{entries};
a7fbee98 527 my $cjk = $self->{overrideCJK};
528 my $hang = $self->{overrideHangul};
0116f5dc 529 my $der = $self->{derivCode};
a7fbee98 530
4d36a948 531 return if !defined $ce;
532 return map($self->altCE($_), @{ $ent->{$ce} })
533 if $ent->{$ce};
0116f5dc 534
4d36a948 535 # CE must not be a contraction, then it's a code point.
536 my $u = $ce;
a7fbee98 537
4d36a948 538 if (0xAC00 <= $u && $u <= 0xD7A3) { # is Hangul Syllale
0116f5dc 539 return map $self->altCE($_),
540 $hang
4d36a948 541 ? map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$hang($u))
0116f5dc 542 : defined $hang
543 ? map({
4d36a948 544 $ent->{$_} ? @{ $ent->{$_} } : $der->($_);
0116f5dc 545 } _decompHangul($u))
546 : $der->($u);
a7fbee98 547 }
548 elsif (0x3400 <= $u && $u <= 0x4DB5 ||
549 0x4E00 <= $u && $u <= 0x9FA5 ||
0116f5dc 550 0x20000 <= $u && $u <= 0x2A6D6) { # CJK Ideograph
551 return map $self->altCE($_),
552 $cjk
4d36a948 553 ? map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$cjk($u))
0116f5dc 554 : defined $cjk && $self->{UCA_Version} <= 8 && $u < 0x10000
9f1f04a1 555 ? pack(VCE_TEMPLATE, NON_VAR, $u, Min2Wt, Min3Wt, $u)
0116f5dc 556 : $der->($u);
a7fbee98 557 }
558 else {
0116f5dc 559 return map $self->altCE($_), $der->($u);
a7fbee98 560 }
d16e9e3d 561}
562
d16e9e3d 563
564##
565## string sortkey = getSortKey(string arg)
566##
567sub getSortKey
568{
a7fbee98 569 my $self = shift;
570 my $lev = $self->{level};
4d36a948 571 my $rCE = $self->splitCE(shift); # get an arrayref of JCPS
0116f5dc 572 my $ver9 = $self->{UCA_Version} > 8;
4d36a948 573 my $v2i = $self->{alternate} ne 'non-ignorable';
a7fbee98 574
575 # weight arrays
0116f5dc 576 my (@buf, $last_is_variable);
577
4d36a948 578 foreach my $wt (map $self->getWt($_), @$rCE) {
579 if ($v2i && $ver9) {
580 if ($wt->[0] == 0) { # ignorable
581 next if $last_is_variable;
0116f5dc 582 } else {
9f1f04a1 583 $last_is_variable = ($wt->[0] == Var1Wt);
0116f5dc 584 }
585 }
4d36a948 586 push @buf, $wt;
0116f5dc 587 }
a7fbee98 588
589 # make sort key
590 my @ret = ([],[],[],[]);
591 foreach my $v (0..$lev-1) {
592 foreach my $b (@buf) {
4d36a948 593 push @{ $ret[$v] }, $b->[$v]
594 if 0 < $b->[$v];
a7fbee98 595 }
596 }
45394607 597
a7fbee98 598 # modification of tertiary weights
599 if ($self->{upper_before_lower}) {
600 foreach (@{ $ret[2] }) {
601 if (0x8 <= $_ && $_ <= 0xC) { $_ -= 6 } # lower
602 elsif (0x2 <= $_ && $_ <= 0x6) { $_ += 6 } # upper
603 elsif ($_ == 0x1C) { $_ += 1 } # square upper
604 elsif ($_ == 0x1D) { $_ -= 1 } # square lower
605 }
45394607 606 }
a7fbee98 607 if ($self->{katakana_before_hiragana}) {
608 foreach (@{ $ret[2] }) {
609 if (0x0F <= $_ && $_ <= 0x13) { $_ -= 2 } # katakana
610 elsif (0x0D <= $_ && $_ <= 0x0E) { $_ += 5 } # hiragana
611 }
45394607 612 }
9f1f04a1 613
614 if ($self->{backwardsFlag}) {
615 for (my $v = MinLevel; $v <= MaxLevel; $v++) {
616 if ($self->{backwardsFlag} & (1 << $v)) {
617 @{ $ret[$v-1] } = reverse @{ $ret[$v-1] };
618 }
619 }
620 }
621
4d36a948 622 join LEVEL_SEP, map pack(KEY_TEMPLATE, @$_), @ret;
45394607 623}
624
625
626##
d16e9e3d 627## int compare = cmp(string a, string b)
45394607 628##
5398038e 629sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
630sub eq { $_[0]->getSortKey($_[1]) eq $_[0]->getSortKey($_[2]) }
631sub ne { $_[0]->getSortKey($_[1]) ne $_[0]->getSortKey($_[2]) }
632sub lt { $_[0]->getSortKey($_[1]) lt $_[0]->getSortKey($_[2]) }
633sub le { $_[0]->getSortKey($_[1]) le $_[0]->getSortKey($_[2]) }
634sub gt { $_[0]->getSortKey($_[1]) gt $_[0]->getSortKey($_[2]) }
635sub ge { $_[0]->getSortKey($_[1]) ge $_[0]->getSortKey($_[2]) }
45394607 636
637##
d16e9e3d 638## list[strings] sorted = sort(list[strings] arg)
45394607 639##
a7fbee98 640sub sort {
641 my $obj = shift;
642 return
643 map { $_->[1] }
644 sort{ $a->[0] cmp $b->[0] }
645 map [ $obj->getSortKey($_), $_ ], @_;
45394607 646}
647
0116f5dc 648
4d36a948 649sub _derivCE_9 {
0116f5dc 650 my $u = shift;
651 my $base =
4d36a948 652 (0x4E00 <= $u && $u <= 0x9FA5)
653 ? 0xFB40 : # CJK
0116f5dc 654 (0x3400 <= $u && $u <= 0x4DB5 || 0x20000 <= $u && $u <= 0x2A6D6)
4d36a948 655 ? 0xFB80 # CJK ext.
656 : 0xFBC0; # others
0116f5dc 657
658 my $aaaa = $base + ($u >> 15);
659 my $bbbb = ($u & 0x7FFF) | 0x8000;
660 return
9f1f04a1 661 pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
662 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u);
0116f5dc 663}
664
4d36a948 665sub _derivCE_8 {
0116f5dc 666 my $code = shift;
667 my $aaaa = 0xFF80 + ($code >> 15);
668 my $bbbb = ($code & 0x7FFF) | 0x8000;
669 return
4d36a948 670 pack(VCE_TEMPLATE, NON_VAR, $aaaa, 2, 1, $code),
671 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $code);
45394607 672}
673
674##
675## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
676##
a7fbee98 677sub _getHexArray { map hex, $_[0] =~ /([0-9a-fA-F]+)/g }
5398038e 678
a7fbee98 679#
4d36a948 680# $code *must* be in Hangul syllable.
a7fbee98 681# Check it before you enter here.
682#
5398038e 683sub _decompHangul {
684 my $code = shift;
5398038e 685 my $SIndex = $code - 0xAC00;
686 my $LIndex = int( $SIndex / 588);
687 my $VIndex = int(($SIndex % 588) / 28);
688 my $TIndex = $SIndex % 28;
689 return (
a7fbee98 690 0x1100 + $LIndex,
691 0x1161 + $VIndex,
692 $TIndex ? (0x11A7 + $TIndex) : (),
5398038e 693 );
45394607 694}
695
4d36a948 696sub _isNonCharacter {
697 my $code = shift;
698 return ! defined $code # removed
699 || ($code < 0 || 0x10FFFF < $code) # out of range
700 || (($code & 0xFFFE) == 0xFFFE) # ??FFF[EF] (cf. utf8.c)
701 || (0xD800 <= $code && $code <= 0xDFFF) # unpaired surrogates
702 || (0xFDD0 <= $code && $code <= 0xFDEF) # other non-characters
703 ;
704}
705
706
707##
708## bool _nonIgnorAtLevel(arrayref weights, int level)
709##
710sub _nonIgnorAtLevel($$)
711{
712 my $wt = shift;
713 return if ! defined $wt;
714 my $lv = shift;
9f1f04a1 715 return grep($wt->[$_-1] != 0, MinLevel..$lv) ? TRUE : FALSE;
4d36a948 716}
717
718##
719## bool _eqArray(
720## arrayref of arrayref[weights] source,
721## arrayref of arrayref[weights] substr,
722## int level)
723## * comparison of graphemes vs graphemes.
724## @$source >= @$substr must be true (check it before call this);
725##
726sub _eqArray($$$)
727{
728 my $source = shift;
729 my $substr = shift;
730 my $lev = shift;
731
732 for my $g (0..@$substr-1){
733 # Do the $g'th graphemes have the same number of AV weigths?
734 return if @{ $source->[$g] } != @{ $substr->[$g] };
735
736 for my $w (0..@{ $substr->[$g] }-1) {
737 for my $v (0..$lev-1) {
738 return if $source->[$g][$w][$v] != $substr->[$g][$w][$v];
739 }
740 }
741 }
742 return 1;
743}
744
745##
746## (int position, int length)
747## int position = index(string, substring, position, [undoc'ed grobal])
748##
749## With "grobal" (only for the list context),
750## returns list of arrayref[position, length].
751##
752sub index
753{
754 my $self = shift;
755 my $str = shift;
756 my $len = length($str);
757 my $subCE = $self->splitCE(shift);
758 my $pos = @_ ? shift : 0;
759 $pos = 0 if $pos < 0;
760 my $grob = shift;
761
4d36a948 762 my $lev = $self->{level};
763 my $ver9 = $self->{UCA_Version} > 8;
764 my $v2i = $self->{alternate} ne 'non-ignorable';
765
766 if (! @$subCE) {
767 my $temp = $pos <= 0 ? 0 : $len <= $pos ? $len : $pos;
768 return $grob
769 ? map([$_, 0], $temp..$len)
770 : wantarray ? ($temp,0) : $temp;
771 }
772 if ($len < $pos) {
773 return wantarray ? () : NOMATCHPOS;
774 }
775 my $strCE = $self->splitCE($pos ? substr($str, $pos) : $str, TRUE);
776 if (! @$strCE) {
777 return wantarray ? () : NOMATCHPOS;
778 }
779 my $last_is_variable;
780 my(@strWt, @iniPos, @finPos, @subWt, @g_ret);
781
782 $last_is_variable = FALSE;
783 for my $wt (map $self->getWt($_), @$subCE) {
784 my $to_be_pushed = _nonIgnorAtLevel($wt,$lev);
785
786 if ($v2i && $ver9) {
787 if ($wt->[0] == 0) {
788 $to_be_pushed = FALSE if $last_is_variable;
789 } else {
9f1f04a1 790 $last_is_variable = ($wt->[0] == Var1Wt);
4d36a948 791 }
792 }
793
794 if (@subWt && $wt->[0] == 0) {
795 push @{ $subWt[-1] }, $wt if $to_be_pushed;
796 } else {
9f1f04a1 797 $wt->[0] = 0 if $wt->[0] == Var1Wt;
4d36a948 798 push @subWt, [ $wt ];
799 }
800 }
801
802 my $count = 0;
803 my $end = @$strCE - 1;
804
805 $last_is_variable = FALSE;
806
807 for (my $i = 0; $i <= $end; ) { # no $i++
808 my $found_base = 0;
809
810 # fetch a grapheme
811 while ($i <= $end && $found_base == 0) {
812 for my $wt ($self->getWt($strCE->[$i][0])) {
813 my $to_be_pushed = _nonIgnorAtLevel($wt,$lev);
814
815 if ($v2i && $ver9) {
816 if ($wt->[0] == 0) {
817 $to_be_pushed = FALSE if $last_is_variable;
818 } else {
9f1f04a1 819 $last_is_variable = ($wt->[0] == Var1Wt);
4d36a948 820 }
821 }
822
823 if (@strWt && $wt->[0] == 0) {
824 push @{ $strWt[-1] }, $wt if $to_be_pushed;
825 $finPos[-1] = $strCE->[$i][2];
826 } elsif ($to_be_pushed) {
9f1f04a1 827 $wt->[0] = 0 if $wt->[0] == Var1Wt;
4d36a948 828 push @strWt, [ $wt ];
829 push @iniPos, $found_base ? NOMATCHPOS : $strCE->[$i][1];
830 $finPos[-1] = NOMATCHPOS if $found_base;
831 push @finPos, $strCE->[$i][2];
832 $found_base++;
833 }
834 # else ===> no-op
835 }
836 $i++;
837 }
838
839 # try to match
840 while ( @strWt > @subWt || (@strWt == @subWt && $i > $end) ) {
841 if ($iniPos[0] != NOMATCHPOS &&
842 $finPos[$#subWt] != NOMATCHPOS &&
843 _eqArray(\@strWt, \@subWt, $lev)) {
844 my $temp = $iniPos[0] + $pos;
845
846 if ($grob) {
847 push @g_ret, [$temp, $finPos[$#subWt] - $iniPos[0]];
848 splice @strWt, 0, $#subWt;
849 splice @iniPos, 0, $#subWt;
850 splice @finPos, 0, $#subWt;
851 }
852 else {
853 return wantarray
854 ? ($temp, $finPos[$#subWt] - $iniPos[0])
855 : $temp;
856 }
857 }
858 shift @strWt;
859 shift @iniPos;
860 shift @finPos;
861 }
862 }
863
864 return $grob
865 ? @g_ret
866 : wantarray ? () : NOMATCHPOS;
867}
868
869##
870## scalarref to matching part = match(string, substring)
871##
872sub match
873{
874 my $self = shift;
875 if (my($pos,$len) = $self->index($_[0], $_[1])) {
876 my $temp = substr($_[0], $pos, $len);
877 return wantarray ? $temp : \$temp;
878 # An lvalue ref \substr should be avoided,
879 # since its value is affected by modification of its referent.
880 }
881 else {
882 return;
883 }
884}
885
886##
887## arrayref matching parts = gmatch(string, substring)
888##
889sub gmatch
890{
891 my $self = shift;
892 my $str = shift;
893 my $sub = shift;
894 return map substr($str, $_->[0], $_->[1]),
895 $self->index($str, $sub, 0, 'g');
896}
897
898##
899## bool subst'ed = subst(string, substring, replace)
900##
901sub subst
902{
903 my $self = shift;
904 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
905
906 if (my($pos,$len) = $self->index($_[0], $_[1])) {
907 if ($code) {
908 my $mat = substr($_[0], $pos, $len);
909 substr($_[0], $pos, $len, $code->($mat));
910 } else {
911 substr($_[0], $pos, $len, $_[2]);
912 }
913 return TRUE;
914 }
915 else {
916 return FALSE;
917 }
918}
919
920##
921## int count = gsubst(string, substring, replace)
922##
923sub gsubst
924{
925 my $self = shift;
926 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
927 my $cnt = 0;
928
929 # Replacement is carried out from the end, then use reverse.
930 for my $pos_len (reverse $self->index($_[0], $_[1], 0, 'g')) {
931 if ($code) {
932 my $mat = substr($_[0], $pos_len->[0], $pos_len->[1]);
933 substr($_[0], $pos_len->[0], $pos_len->[1], $code->($mat));
934 } else {
935 substr($_[0], $pos_len->[0], $pos_len->[1], $_[2]);
936 }
937 $cnt++;
938 }
939 return $cnt;
940}
941
45394607 9421;
943__END__
944
945=head1 NAME
946
a7fbee98 947Unicode::Collate - Unicode Collation Algorithm
45394607 948
949=head1 SYNOPSIS
950
951 use Unicode::Collate;
952
953 #construct
5398038e 954 $Collator = Unicode::Collate->new(%tailoring);
45394607 955
956 #sort
5398038e 957 @sorted = $Collator->sort(@not_sorted);
45394607 958
959 #compare
a7fbee98 960 $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
45394607 961
962=head1 DESCRIPTION
963
4d36a948 964This module is an implementation
965of Unicode Technical Standard #10 (UTS #10)
966"Unicode Collation Algorithm."
967
45394607 968=head2 Constructor and Tailoring
969
d16e9e3d 970The C<new> method returns a collator object.
971
5398038e 972 $Collator = Unicode::Collate->new(
0116f5dc 973 UCA_Version => $UCA_Version,
45394607 974 alternate => $alternate,
975 backwards => $levelNumber, # or \@levelNumbers
976 entry => $element,
977 normalization => $normalization_form,
978 ignoreName => qr/$ignoreName/,
979 ignoreChar => qr/$ignoreChar/,
980 katakana_before_hiragana => $bool,
981 level => $collationLevel,
982 overrideCJK => \&overrideCJK,
983 overrideHangul => \&overrideHangul,
984 preprocess => \&preprocess,
985 rearrange => \@charList,
986 table => $filename,
987 undefName => qr/$undefName/,
988 undefChar => qr/$undefChar/,
989 upper_before_lower => $bool,
990 );
a7fbee98 991 # if %tailoring is false (i.e. empty),
5398038e 992 # $Collator should do the default collation.
45394607 993
994=over 4
995
0116f5dc 996=item UCA_Version
997
998If the version number of the older UCA is given,
999the older behavior of that version is emulated on collating.
1000If omitted, the return value of C<UCA_Version()> is used.
1001
1002The supported version: 8 or 9.
1003
1004B<This parameter may be removed in the future version,
1005as switching the algorithm would affect the performance.>
1006
45394607 1007=item alternate
1008
4d36a948 1009-- see 3.2.2 Variable Weighting, UTS #10.
caffd4cf 1010
1011(the title in UCA version 8: Alternate Weighting)
45394607 1012
a7fbee98 1013This key allows to alternate weighting for variable collation elements,
1014which are marked with an ASTERISK in the table
1015(NOTE: Many punction marks and symbols are variable in F<allkeys.txt>).
1016
1017 alternate => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
45394607 1018
a7fbee98 1019These names are case-insensitive.
45394607 1020By default (if specification is omitted), 'shifted' is adopted.
1021
4d36a948 1022 'Blanked' Variable elements are made ignorable at levels 1 through 3;
a7fbee98 1023 considered at the 4th level.
1024
1025 'Non-ignorable' Variable elements are not reset to ignorable.
1026
4d36a948 1027 'Shifted' Variable elements are made ignorable at levels 1 through 3
a7fbee98 1028 their level 4 weight is replaced by the old level 1 weight.
1029 Level 4 weight for Non-Variable elements is 0xFFFF.
1030
1031 'Shift-Trimmed' Same as 'shifted', but all FFFF's at the 4th level
1032 are trimmed.
1033
45394607 1034=item backwards
1035
4d36a948 1036-- see 3.1.2 French Accents, UTS #10.
45394607 1037
1038 backwards => $levelNumber or \@levelNumbers
1039
1040Weights in reverse order; ex. level 2 (diacritic ordering) in French.
1041If omitted, forwards at all the levels.
1042
1043=item entry
1044
4d36a948 1045-- see 3.1 Linguistic Features; 3.2.1 File Format, UTS #10.
45394607 1046
a7fbee98 1047Overrides a default order or defines additional collation elements
45394607 1048
1049 entry => <<'ENTRIES', # use the UCA file format
a7fbee98 105000E6 ; [.0861.0020.0002.00E6] [.08B1.0020.0002.00E6] # ligature <ae> as <a><e>
45394607 10510063 0068 ; [.0893.0020.0002.0063] # "ch" in traditional Spanish
10520043 0068 ; [.0893.0020.0008.0043] # "Ch" in traditional Spanish
1053ENTRIES
1054
4d36a948 1055B<NOTE:> The code point in the UCA file format (before C<';'>)
1056B<must> be a Unicode code point, but not a native code point.
1057So C<0063> must always denote C<U+0063>,
1058but not a character of C<"\x63">.
1059
45394607 1060=item ignoreName
1061
1062=item ignoreChar
1063
4d36a948 1064-- see Completely Ignorable, 3.2.2 Variable Weighting, UTS #10.
45394607 1065
caffd4cf 1066Makes the entry in the table completely ignorable;
1067i.e. as if the weights were zero at all level.
45394607 1068
a7fbee98 1069E.g. when 'a' and 'e' are ignorable,
45394607 1070'element' is equal to 'lament' (or 'lmnt').
1071
1072=item level
1073
4d36a948 1074-- see 4.3 Form a sort key for each string, UTS #10.
45394607 1075
1076Set the maximum level.
1077Any higher levels than the specified one are ignored.
1078
1079 Level 1: alphabetic ordering
1080 Level 2: diacritic ordering
1081 Level 3: case ordering
1082 Level 4: tie-breaking (e.g. in the case when alternate is 'shifted')
1083
1084 ex.level => 2,
1085
a7fbee98 1086If omitted, the maximum is the 4th.
1087
45394607 1088=item normalization
1089
4d36a948 1090-- see 4.1 Normalize each input string, UTS #10.
45394607 1091
905aa9f0 1092If specified, strings are normalized before preparation of sort keys
45394607 1093(the normalization is executed after preprocess).
1094
1095As a form name, one of the following names must be used.
1096
1097 'C' or 'NFC' for Normalization Form C
1098 'D' or 'NFD' for Normalization Form D
1099 'KC' or 'NFKC' for Normalization Form KC
1100 'KD' or 'NFKD' for Normalization Form KD
1101
1102If omitted, the string is put into Normalization Form D.
1103
a7fbee98 1104If C<undef> is passed explicitly as the value for this key,
45394607 1105any normalization is not carried out (this may make tailoring easier
1106if any normalization is not desired).
1107
1108see B<CAVEAT>.
1109
1110=item overrideCJK
1111
4d36a948 1112-- see 7.1 Derived Collation Elements, UTS #10.
45394607 1113
1114By default, mapping of CJK Unified Ideographs
a7fbee98 1115uses the Unicode codepoint order.
1116But the mapping of CJK Unified Ideographs may be overrided.
45394607 1117
a7fbee98 1118ex. CJK Unified Ideographs in the JIS code point order.
45394607 1119
1120 overrideCJK => sub {
a7fbee98 1121 my $u = shift; # get a Unicode codepoint
1122 my $b = pack('n', $u); # to UTF-16BE
1123 my $s = your_unicode_to_sjis_converter($b); # convert
1124 my $n = unpack('n', $s); # convert sjis to short
1125 [ $n, 0x20, 0x2, $u ]; # return the collation element
45394607 1126 },
1127
a7fbee98 1128ex. ignores all CJK Unified Ideographs.
1129
1130 overrideCJK => sub {()}, # CODEREF returning empty list
1131
1132 # where ->eq("Pe\x{4E00}rl", "Perl") is true
1133 # as U+4E00 is a CJK Unified Ideograph and to be ignorable.
1134
1135If C<undef> is passed explicitly as the value for this key,
1136weights for CJK Unified Ideographs are treated as undefined.
1137But assignment of weight for CJK Unified Ideographs
1138in table or L<entry> is still valid.
1139
1140=item overrideHangul
1141
4d36a948 1142-- see 7.1 Derived Collation Elements, UTS #10.
a7fbee98 1143
1144By default, Hangul Syllables are decomposed into Hangul Jamo.
1145But the mapping of Hangul Syllables may be overrided.
1146
1147This tag works like L<overrideCJK>, so see there for examples.
1148
45394607 1149If you want to override the mapping of Hangul Syllables,
1150the Normalization Forms D and KD are not appropriate
1151(they will be decomposed before overriding).
1152
a7fbee98 1153If C<undef> is passed explicitly as the value for this key,
1154weight for Hangul Syllables is treated as undefined
1155without decomposition into Hangul Jamo.
1156But definition of weight for Hangul Syllables
1157in table or L<entry> is still valid.
1158
45394607 1159=item preprocess
1160
4d36a948 1161-- see 5.1 Preprocessing, UTS #10.
45394607 1162
1163If specified, the coderef is used to preprocess
1164before the formation of sort keys.
1165
a7fbee98 1166ex. dropping English articles, such as "a" or "the".
45394607 1167Then, "the pen" is before "a pencil".
1168
1169 preprocess => sub {
1170 my $str = shift;
a7fbee98 1171 $str =~ s/\b(?:an?|the)\s+//gi;
45394607 1172 $str;
1173 },
1174
1175=item rearrange
1176
4d36a948 1177-- see 3.1.3 Rearrangement, UTS #10.
45394607 1178
1179Characters that are not coded in logical order and to be rearranged.
a7fbee98 1180By default,
45394607 1181
1182 rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
1183
a7fbee98 1184If you want to disallow any rearrangement,
1185pass C<undef> or C<[]> (a reference to an empty list)
1186as the value for this key.
1187
0116f5dc 1188B<According to the version 9 of UCA, this parameter shall not be used;
1189but it is not warned at present.>
1190
45394607 1191=item table
1192
4d36a948 1193-- see 3.2 Default Unicode Collation Element Table, UTS #10.
45394607 1194
1195You can use another element table if desired.
ae6aa562 1196The table file must be put into a directory
1197where F<Unicode/Collate.pm> is installed.
1198E.g. in F<perl/lib/Unicode/Collate> directory
1199when you have F<perl/lib/Unicode/Collate.pm>.
45394607 1200
ae6aa562 1201By default, the filename F<"allkeys.txt"> is used.
45394607 1202
a7fbee98 1203If C<undef> is passed explicitly as the value for this key,
1204no file is read (but you can define collation elements via L<entry>).
1205
1206A typical way to define a collation element table
1207without any file of table:
1208
1209 $onlyABC = Unicode::Collate->new(
1210 table => undef,
1211 entry => << 'ENTRIES',
12120061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A
12130041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A
12140062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B
12150042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B
12160063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C
12170043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C
1218ENTRIES
1219 );
905aa9f0 1220
45394607 1221=item undefName
1222
1223=item undefChar
1224
4d36a948 1225-- see 6.3.4 Reducing the Repertoire, UTS #10.
45394607 1226
1227Undefines the collation element as if it were unassigned in the table.
1228This reduces the size of the table.
1229If an unassigned character appears in the string to be collated,
1230the sort key is made from its codepoint
1231as a single-character collation element,
1232as it is greater than any other assigned collation elements
1233(in the codepoint order among the unassigned characters).
1234But, it'd be better to ignore characters
1235unfamiliar to you and maybe never used.
1236
1237=item katakana_before_hiragana
1238
1239=item upper_before_lower
1240
4d36a948 1241-- see 6.6 Case Comparisons; 7.3.1 Tertiary Weight Table, UTS #10.
45394607 1242
1243By default, lowercase is before uppercase
1244and hiragana is before katakana.
1245
a7fbee98 1246If the tag is made true, this is reversed.
1247
1248B<NOTE>: These tags simplemindedly assume
1249any lowercase/uppercase or hiragana/katakana distinctions
9f1f04a1 1250must occur in level 3, and their weights at level 3
1251must be same as those mentioned in 7.3.1, UTS #10.
1252If you define your collation elements which violate this requirement,
4d36a948 1253these tags don't work validly.
45394607 1254
1255=back
1256
3164dd77 1257=head2 Methods for Collation
45394607 1258
1259=over 4
1260
5398038e 1261=item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
45394607 1262
1263Sorts a list of strings.
1264
5398038e 1265=item C<$result = $Collator-E<gt>cmp($a, $b)>
45394607 1266
1267Returns 1 (when C<$a> is greater than C<$b>)
1268or 0 (when C<$a> is equal to C<$b>)
1269or -1 (when C<$a> is lesser than C<$b>).
1270
5398038e 1271=item C<$result = $Collator-E<gt>eq($a, $b)>
1272
1273=item C<$result = $Collator-E<gt>ne($a, $b)>
1274
1275=item C<$result = $Collator-E<gt>lt($a, $b)>
1276
1277=item C<$result = $Collator-E<gt>le($a, $b)>
1278
1279=item C<$result = $Collator-E<gt>gt($a, $b)>
1280
1281=item C<$result = $Collator-E<gt>ge($a, $b)>
1282
a7fbee98 1283They works like the same name operators as theirs.
5398038e 1284
1285 eq : whether $a is equal to $b.
1286 ne : whether $a is not equal to $b.
1287 lt : whether $a is lesser than $b.
1288 le : whether $a is lesser than $b or equal to $b.
1289 gt : whether $a is greater than $b.
1290 ge : whether $a is greater than $b or equal to $b.
1291
1292=item C<$sortKey = $Collator-E<gt>getSortKey($string)>
45394607 1293
4d36a948 1294-- see 4.3 Form a sort key for each string, UTS #10.
45394607 1295
1296Returns a sort key.
1297
1298You compare the sort keys using a binary comparison
1299and get the result of the comparison of the strings using UCA.
1300
5398038e 1301 $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
45394607 1302
1303 is equivalent to
1304
5398038e 1305 $Collator->cmp($a, $b)
45394607 1306
a7fbee98 1307=item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)>
1308
a7fbee98 1309 use Unicode::Collate;
1310 my $c = Unicode::Collate->new();
1311 print $c->viewSortKey("Perl"),"\n";
1312
0116f5dc 1313 # output:
1314 # [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF]
1315 # Level 1 Level 2 Level 3 Level 4
1316
1317 (If C<UCA_Version> is 8, the output is slightly different.)
a7fbee98 1318
4d36a948 1319=back
1320
1321=head2 Methods for Searching
d16e9e3d 1322
4d36a948 1323B<DISCLAIMER:> If C<preprocess> or C<normalization> tag is true
1324for C<$Collator>, calling these methods (C<index>, C<match>, C<gmatch>,
1325C<subst>, C<gsubst>) is croaked,
1326as the position and the length might differ
1327from those on the specified string.
1328(And the C<rearrange> tag is neglected.)
d16e9e3d 1329
4d36a948 1330The C<match>, C<gmatch>, C<subst>, C<gsubst> methods work
1331like C<m//>, C<m//g>, C<s///>, C<s///g>, respectively,
1332but they are not aware of any pattern, but only a literal substring.
1333
1334=over 4
1335
1336=item C<$position = $Collator-E<gt>index($string, $substring[, $position])>
1337
1338=item C<($position, $length) = $Collator-E<gt>index($string, $substring[, $position])>
d16e9e3d 1339
1340If C<$substring> matches a part of C<$string>, returns
1341the position of the first occurrence of the matching part in scalar context;
1342in list context, returns a two-element list of
1343the position and the length of the matching part.
1344
d16e9e3d 1345If C<$substring> does not match any part of C<$string>,
1346returns C<-1> in scalar context and
1347an empty list in list context.
1348
1349e.g. you say
1350
5398038e 1351 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
4d36a948 1352 # (normalization => undef) is REQUIRED.
1353 my $str = "Ich muß studieren Perl.";
1354 my $sub = "MÜSS";
d16e9e3d 1355 my $match;
a7fbee98 1356 if (my($pos,$len) = $Collator->index($str, $sub)) {
5398038e 1357 $match = substr($str, $pos, $len);
d16e9e3d 1358 }
1359
4d36a948 1360and get C<"muß"> in C<$match> since C<"muß">
1361is primary equal to C<"MÜSS">.
1362
1363=item C<$match_ref = $Collator-E<gt>match($string, $substring)>
1364
1365=item C<($match) = $Collator-E<gt>match($string, $substring)>
1366
1367If C<$substring> matches a part of C<$string>, in scalar context, returns
1368B<a reference to> the first occurrence of the matching part
1369(C<$match_ref> is always true if matches,
1370since every reference is B<true>);
1371in list context, returns the first occurrence of the matching part.
1372
1373If C<$substring> does not match any part of C<$string>,
1374returns C<undef> in scalar context and
1375an empty list in list context.
1376
1377e.g.
1378
1379 if ($match_ref = $Collator->match($str, $sub)) { # scalar context
1380 print "matches [$$match_ref].\n";
1381 } else {
1382 print "doesn't match.\n";
1383 }
1384
1385 or
1386
1387 if (($match) = $Collator->match($str, $sub)) { # list context
1388 print "matches [$match].\n";
1389 } else {
1390 print "doesn't match.\n";
1391 }
1392
1393=item C<@match = $Collator-E<gt>gmatch($string, $substring)>
1394
1395If C<$substring> matches a part of C<$string>, returns
1396all the matching parts (or matching count in scalar context).
1397
1398If C<$substring> does not match any part of C<$string>,
1399returns an empty list.
1400
1401=item C<$count = $Collator-E<gt>subst($string, $substring, $replacement)>
1402
1403If C<$substring> matches a part of C<$string>,
1404the first occurrence of the matching part is replaced by C<$replacement>
1405(C<$string> is modified) and return C<$count> (always equals to C<1>).
1406
1407C<$replacement> can be a C<CODEREF>,
1408taking the matching part as an argument,
1409and returning a string to replace the matching part
1410(a bit similar to C<s/(..)/$coderef-E<gt>($1)/e>).
1411
1412=item C<$count = $Collator-E<gt>gsubst($string, $substring, $replacement)>
1413
1414If C<$substring> matches a part of C<$string>,
1415all the occurrences of the matching part is replaced by C<$replacement>
1416(C<$string> is modified) and return C<$count>.
1417
1418C<$replacement> can be a C<CODEREF>,
1419taking the matching part as an argument,
1420and returning a string to replace the matching part
1421(a bit similar to C<s/(..)/$coderef-E<gt>($1)/eg>).
1422
1423e.g.
1424
1425 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1426 # (normalization => undef) is REQUIRED.
1427 my $str = "Camel ass came\x{301}l CAMEL horse cAm\0E\0L...";
1428 $Collator->gsubst($str, "camel", sub { "<b>$_[0]</b>" });
1429
1430 # now $str is "<b>Camel</b> ass <b>came\x{301}l</b> <b>CAMEL</b> horse <b>cAm\0E\0L</b>...";
1431 # i.e., all the camels are made bold-faced.
d16e9e3d 1432
45394607 1433=back
1434
3164dd77 1435=head2 Other Methods
1436
1437=over 4
1438
0116f5dc 1439=item C<%old_tailoring = $Collator-E<gt>change(%new_tailoring)>
1440
1441Change the value of specified keys and returns the changed part.
1442
1443 $Collator = Unicode::Collate->new(level => 4);
1444
1445 $Collator->eq("perl", "PERL"); # false
1446
1447 %old = $Collator->change(level => 2); # returns (level => 4).
1448
1449 $Collator->eq("perl", "PERL"); # true
1450
1451 $Collator->change(%old); # returns (level => 2).
1452
1453 $Collator->eq("perl", "PERL"); # false
1454
1455Not all C<(key,value)>s are allowed to be changed.
1456See also C<@Unicode::Collate::ChangeOK> and C<@Unicode::Collate::ChangeNG>.
1457
1458In the scalar context, returns the modified collator
1459(but it is B<not> a clone from the original).
1460
1461 $Collator->change(level => 2)->eq("perl", "PERL"); # true
1462
1463 $Collator->eq("perl", "PERL"); # true; now max level is 2nd.
1464
1465 $Collator->change(level => 4)->eq("perl", "PERL"); # false
1466
3164dd77 1467=item UCA_Version
1468
4d36a948 1469Returns the version number of UTS #10 this module consults.
3164dd77 1470
1471=item Base_Unicode_Version
1472
1473Returns the version number of the Unicode Standard
1474this module is based on.
1475
1476=back
1477
45394607 1478=head2 EXPORT
1479
1480None by default.
1481
1482=head2 CAVEAT
1483
1484Use of the C<normalization> parameter requires
1485the B<Unicode::Normalize> module.
1486
5398038e 1487If you need not it (say, in the case when you need not
45394607 1488handle any combining characters),
1489assign C<normalization =E<gt> undef> explicitly.
1490
4d36a948 1491-- see 6.5 Avoiding Normalization, UTS #10.
5398038e 1492
0116f5dc 1493=head2 Conformance Test
1494
1495The Conformance Test for the UCA is provided
1496in L<http://www.unicode.org/reports/tr10/CollationTest.html>
1497and L<http://www.unicode.org/reports/tr10/CollationTest.zip>
1498
1499For F<CollationTest_SHIFTED.txt>,
1500a collator via C<Unicode::Collate-E<gt>new( )> should be used;
1501for F<CollationTest_NON_IGNORABLE.txt>, a collator via
1502C<Unicode::Collate-E<gt>new(alternate =E<gt> "non-ignorable", level =E<gt> 3)>.
1503
4d36a948 1504B<Unicode::Normalize is required to try The Conformance Test.>
a7fbee98 1505
45394607 1506=head1 AUTHOR
1507
1508SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
1509
1510 http://homepage1.nifty.com/nomenclator/perl/
1511
ae6aa562 1512 Copyright(C) 2001-2003, SADAHIRO Tomoyuki. Japan. All rights reserved.
45394607 1513
a7fbee98 1514 This library is free software; you can redistribute it
1515 and/or modify it under the same terms as Perl itself.
45394607 1516
1517=head1 SEE ALSO
1518
1519=over 4
1520
0116f5dc 1521=item http://www.unicode.org/reports/tr10/
45394607 1522
4d36a948 1523Unicode Collation Algorithm - UTS #10
45394607 1524
0116f5dc 1525=item http://www.unicode.org/reports/tr10/allkeys.txt
a7fbee98 1526
1527The Default Unicode Collation Element Table
45394607 1528
0116f5dc 1529=item http://www.unicode.org/reports/tr10/CollationTest.html
1530http://www.unicode.org/reports/tr10/CollationTest.zip
1531
1532The latest versions of the conformance test for the UCA
1533
1534=item http://www.unicode.org/reports/tr15/
a7fbee98 1535
1536Unicode Normalization Forms - UAX #15
1537
a7fbee98 1538=item L<Unicode::Normalize>
45394607 1539
45394607 1540=back
1541
1542=cut